Skip to content

Commit 6540bf3

Browse files
committed
unicode: make unicodeobject.c thread-safe
- Use atomic operations to initialize _Py_Identifiers
- Create interpreter interned dict at interpreter startup
1 parent 410ba10 commit 6540bf3

File tree

6 files changed

+58
-104
lines changed

6 files changed

+58
-104
lines changed

‎Include/cpython/object.h‎

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,15 @@ PyAPI_FUNC(Py_ssize_t) _Py_GetRefTotal(void);
3737
*/
3838
typedef struct _Py_Identifier {
3939
const char* string;
40-
// Index in PyInterpreterState.unicode.ids.array. It is process-wide
41-
// unique and must be initialized to -1.
42-
Py_ssize_t index;
40+
PyObject *obj;
41+
struct _Py_Identifier *next;
4342
} _Py_Identifier;
4443

4544
#ifndef Py_BUILD_CORE
4645
// For now we are keeping _Py_IDENTIFIER for continued use
4746
// in non-builtin extensions (and naughty PyPI modules).
4847

49-
#define _Py_static_string_init(value) { .string = (value), .index = -1 }
48+
#define _Py_static_string_init(value) { .string = (value), .obj = NULL, .next = NULL }
5049
#define _Py_static_string(varname, value) static _Py_Identifier varname = _Py_static_string_init(value)
5150
#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
5251

‎Include/internal/pycore_qsbr.h‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ struct qsbr_pad {
2121
char __padding[64 - sizeof(struct qsbr)];
2222
};
2323

24+
struct _Py_qsbr_head {
25+
struct _Py_qsbr_head *next;
26+
uint64_t seq;
27+
};
28+
2429
static inline uint64_t
2530
_Py_qsbr_shared_current(struct qsbr_shared *shared)
2631
{

‎Include/internal/pycore_unicodeobject.h‎

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,9 @@ extern PyTypeObject _PyUnicodeASCIIIter_Type;
2525

2626
/* other API */
2727

28-
struct _Py_unicode_runtime_ids {
29-
_PyMutex mutex;
30-
// next_index value must be preserved when Py_Initialize()/Py_Finalize()
31-
// is called multiple times: see _PyUnicode_FromId() implementation.
32-
Py_ssize_t next_index;
33-
};
34-
3528
struct _Py_unicode_runtime_state {
36-
struct _Py_unicode_runtime_ids ids;
29+
// linked list of initialized _Py_Identifiers
30+
_Py_Identifier *head;
3731
};
3832

3933
/* fs_codec.encoding is initialized to NULL.
@@ -45,18 +39,10 @@ struct _Py_unicode_fs_codec {
4539
_Py_error_handler error_handler;
4640
};
4741

48-
struct _Py_unicode_ids {
49-
Py_ssize_t size;
50-
PyObject **array;
51-
};
52-
5342
struct _Py_unicode_state {
5443
struct _Py_unicode_fs_codec fs_codec;
5544

5645
_PyUnicode_Name_CAPI *ucnhash_capi;
57-
58-
// Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
59-
struct _Py_unicode_ids ids;
6046
};
6147

6248
extern void _PyUnicode_ClearInterned(PyInterpreterState *interp);

‎Objects/unicodeobject.c‎

Lines changed: 46 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,7 @@ static inline PyObject* unicode_get_empty(void)
220220
// Return a strong reference to the empty string singleton.
221221
static inline PyObject* unicode_new_empty(void)
222222
{
223-
PyObject *empty = unicode_get_empty();
224-
return Py_NewRef(empty);
223+
return unicode_get_empty();
225224
}
226225

227226
/* This dictionary holds all interned unicode strings. Note that references
@@ -1706,7 +1705,7 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
17061705
static PyObject*
17071706
get_latin1_char(Py_UCS1 ch)
17081707
{
1709-
return Py_NewRef(LATIN1(ch));
1708+
return LATIN1(ch);
17101709
}
17111710

17121711
static PyObject*
@@ -1863,67 +1862,41 @@ resize_array(PyObject **array, Py_ssize_t *capacity)
18631862
return new_array;
18641863
}
18651864

1866-
PyObject *
1867-
_PyUnicode_FromId(_Py_Identifier *id)
1865+
static PyObject *
1866+
initialize_identifier(_Py_Identifier *id)
18681867
{
1869-
PyInterpreterState *interp = _PyInterpreterState_GET();
1870-
struct _Py_unicode_ids *ids = &interp->unicode.ids;
1871-
1872-
Py_ssize_t index = _Py_atomic_size_get(&id->index);
1873-
if (index < 0) {
1874-
struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
1875-
1876-
_PyMutex_lock(&rt_ids->mutex);
1877-
// Check again to detect concurrent access. Another thread can have
1878-
// initialized the index while this thread waited for the lock.
1879-
index = _Py_atomic_size_get(&id->index);
1880-
if (index < 0) {
1881-
assert(rt_ids->next_index < PY_SSIZE_T_MAX);
1882-
index = rt_ids->next_index;
1883-
rt_ids->next_index++;
1884-
_Py_atomic_size_set(&id->index, index);
1885-
}
1886-
_PyMutex_unlock(&rt_ids->mutex);
1887-
}
1888-
assert(index >= 0);
1889-
1890-
PyObject *obj;
1891-
if (index < ids->size) {
1892-
obj = ids->array[index];
1893-
if (obj) {
1894-
// Return a borrowed reference
1895-
return obj;
1896-
}
1897-
}
1898-
1899-
obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
1900-
NULL, NULL);
1868+
PyObject *obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
1869+
NULL, NULL);
19011870
if (!obj) {
19021871
return NULL;
19031872
}
19041873
PyUnicode_InternInPlace(&obj);
19051874

1906-
if (index >= ids->size) {
1907-
// Overallocate to reduce the number of realloc
1908-
Py_ssize_t new_size = Py_MAX(index * 2, 16);
1909-
Py_ssize_t item_size = sizeof(ids->array[0]);
1910-
PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
1911-
if (new_array == NULL) {
1912-
PyErr_NoMemory();
1913-
return NULL;
1875+
assert(_PyObject_IS_IMMORTAL(obj));
1876+
1877+
if (!_Py_atomic_compare_exchange_ptr(&id->obj, NULL, obj)) {
1878+
Py_DECREF(obj);
1879+
return _Py_atomic_load_ptr(&id->obj);
1880+
}
1881+
for (;;) {
1882+
id->next = _Py_atomic_load_ptr(&_PyRuntime.unicode_state.head);
1883+
if (_Py_atomic_compare_exchange_ptr(&_PyRuntime.unicode_state.head, id->next, id)) {
1884+
break;
19141885
}
1915-
memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
1916-
ids->array = new_array;
1917-
ids->size = new_size;
19181886
}
1919-
1920-
// The array stores a strong reference
1921-
ids->array[index] = obj;
1922-
1923-
// Return a borrowed reference
19241887
return obj;
19251888
}
19261889

1890+
PyObject *
1891+
_PyUnicode_FromId(_Py_Identifier *id)
1892+
{
1893+
PyObject *obj = _Py_atomic_load_ptr(&id->obj);
1894+
if (obj) {
1895+
return obj;
1896+
}
1897+
return initialize_identifier(id);
1898+
}
1899+
19271900
static void
19281901
_PyUnicode_Immortalize(PyObject *obj)
19291902
{
@@ -1956,17 +1929,16 @@ _PyUnicode_Immortalize(PyObject *obj)
19561929

19571930

19581931
static void
1959-
unicode_clear_identifiers(struct _Py_unicode_state *state)
1932+
unicode_clear_identifiers(struct _Py_unicode_runtime_state *state)
19601933
{
1961-
struct _Py_unicode_ids *ids = &state->ids;
1962-
for (Py_ssize_t i=0; i < ids->size; i++) {
1963-
Py_XDECREF(ids->array[i]);
1934+
_Py_Identifier *id = state->head;
1935+
while (id) {
1936+
_Py_Identifier *next = id->next;
1937+
id->next = NULL;
1938+
id->obj = NULL;
1939+
id = next;
19641940
}
1965-
ids->size = 0;
1966-
PyMem_Free(ids->array);
1967-
ids->array = NULL;
1968-
// Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
1969-
// after Py_Finalize().
1941+
state->head = NULL;
19701942
}
19711943

19721944
static void
@@ -14593,6 +14565,16 @@ _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
1459314565
return _PyStatus_OK();
1459414566
}
1459514567

14568+
/* Create the interned dictionary. This must be done before creating static
14569+
* strings.
14570+
*/
14571+
assert(get_interned_dict() == NULL);
14572+
PyObject *dict = PyDict_New();
14573+
if (!dict) {
14574+
return _PyStatus_NO_MEMORY();
14575+
}
14576+
set_interned_dict(dict);
14577+
1459614578
/* Intern statically allocated string identifiers and deepfreeze strings.
1459714579
* This must be done before any module initialization so that statically
1459814580
* allocated string identifiers are used instead of heap allocated strings.
@@ -14660,14 +14642,6 @@ PyUnicode_InternInPlace(PyObject **p)
1466014642
}
1466114643

1466214644
PyObject *interned = get_interned_dict();
14663-
if (interned == NULL) {
14664-
interned = PyDict_New();
14665-
if (interned == NULL) {
14666-
PyErr_Clear(); /* Don't leave an exception */
14667-
return;
14668-
}
14669-
set_interned_dict(interned);
14670-
}
1467114645

1467214646
if (!_Py_ThreadLocal(s) && !_PyObject_IS_IMMORTAL(s)) {
1467314647
/* Make a copy so that we can safely immortalize the string. */
@@ -15189,21 +15163,18 @@ _PyUnicode_FiniTypes(PyInterpreterState *interp)
1518915163
void
1519015164
_PyUnicode_Fini(PyInterpreterState *interp)
1519115165
{
15192-
struct _Py_unicode_state *state = &interp->unicode;
15193-
1519415166
if (_Py_IsMainInterpreter(interp)) {
1519515167
// _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
1519615168
assert(get_interned_dict() == NULL);
1519715169
// bpo-47182: force a unicodedata CAPI capsule re-import on
1519815170
// subsequent initialization of main interpreter.
1519915171
}
1520015172

15201-
_PyUnicode_FiniEncodings(&state->fs_codec);
15173+
_PyUnicode_FiniEncodings(&interp->unicode.fs_codec);
1520215174
interp->unicode.ucnhash_capi = NULL;
1520315175

15204-
unicode_clear_identifiers(state);
15205-
1520615176
if (_Py_IsMainInterpreter(interp)) {
15177+
unicode_clear_identifiers(&_PyRuntime.unicode_state);
1520715178
unicode_free_immortalized(&_PyRuntime);
1520815179
}
1520915180
}

‎Programs/_testembed.c‎

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1896,7 +1896,6 @@ static int test_unicode_id_init(void)
18961896
// is defined, it is manually expanded here.
18971897
static _Py_Identifier PyId_test_unicode_id_init = {
18981898
.string = "test_unicode_id_init",
1899-
.index = -1,
19001899
};
19011900

19021901
// Initialize Python once without using the identifier

‎Python/pystate.c‎

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ Py_DECL_THREAD PyThreadState *_Py_current_tstate;
7373
static void
7474
init_runtime(_PyRuntimeState *runtime,
7575
void *open_code_hook, void *open_code_userdata,
76-
_Py_AuditHookEntry *audit_hook_head,
77-
Py_ssize_t unicode_next_index)
76+
_Py_AuditHookEntry *audit_hook_head)
7877
{
7978
if (runtime->_initialized) {
8079
Py_FatalError("runtime already initialized");
@@ -95,9 +94,6 @@ init_runtime(_PyRuntimeState *runtime,
9594

9695
// Set it to the ID of the main thread of the main interpreter.
9796
runtime->main_thread = PyThread_get_thread_ident();
98-
99-
runtime->unicode_state.ids.next_index = unicode_next_index;
100-
10197
runtime->_initialized = 1;
10298
}
10399

@@ -112,15 +108,13 @@ _PyRuntimeState_Init(_PyRuntimeState *runtime)
112108
_Py_AuditHookEntry *audit_hook_head = runtime->audit_hook_head;
113109
// bpo-42882: Preserve next_index value if Py_Initialize()/Py_Finalize()
114110
// is called multiple times.
115-
Py_ssize_t unicode_next_index = runtime->unicode_state.ids.next_index;
116111

117112
if (runtime->_initialized) {
118113
// Py_Initialize() must be running again.
119114
// Reset to _PyRuntimeState_INIT.
120115
memcpy(runtime, &initial, sizeof(*runtime));
121116
}
122-
init_runtime(runtime, open_code_hook, open_code_userdata, audit_hook_head,
123-
unicode_next_index);
117+
init_runtime(runtime, open_code_hook, open_code_userdata, audit_hook_head);
124118

125119
return _PyStatus_OK();
126120
}

0 commit comments

Comments
 (0)