From 6339ae838ae2f68798dec7f4053736f07a9c01aa Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Tue, 13 Jan 2026 17:53:13 +0000 Subject: [PATCH 01/19] WIP copy BytesWriter into StringWriter implementation --- mypyc/lib-rt/librt_strings.c | 365 +++++++++++++++++++++++++++++++++++ mypyc/lib-rt/librt_strings.h | 8 + 2 files changed, 373 insertions(+) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 1acaefa77fef..f42daa151d4e 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -379,6 +379,371 @@ BytesWriter_len_internal(PyObject *self) { return writer->len << 1; } +// +// StringWriter +// + +static PyTypeObject StringWriterType; + +static bool +_grow_buffer_string(StringWriterObject *data, Py_ssize_t n) { + Py_ssize_t target = data->len + n; + Py_ssize_t size = data->capacity; + do { + size *= 2; + } while (target >= size); + if (data->buf == data->data) { + // Move from embedded buffer to heap-allocated buffer + data->buf = PyMem_Malloc(size); + if (data->buf != NULL) { + memcpy(data->buf, data->data, WRITER_EMBEDDED_BUF_LEN); + } + } else { + data->buf = PyMem_Realloc(data->buf, size); + } + if (unlikely(data->buf == NULL)) { + PyErr_NoMemory(); + return false; + } + data->capacity = size; + return true; +} + +static inline bool +ensure_string_writer_size(StringWriterObject *data, Py_ssize_t n) { + if (likely(data->capacity - data->len >= n)) { + return true; + } else { + return _grow_buffer_string(data, n); + } +} + +static inline void +StringWriter_init_internal(StringWriterObject *self) { + self->buf = self->data; + self->len = 0; + self->capacity = WRITER_EMBEDDED_BUF_LEN; +} + +static PyObject* +StringWriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + if (type != &StringWriterType) { + PyErr_SetString(PyExc_TypeError, "StringWriter cannot be subclassed"); + return NULL; + } + + StringWriterObject *self = (StringWriterObject *)type->tp_alloc(type, 0); + if (self != NULL) { + 
StringWriter_init_internal(self); + } + return (PyObject *)self; +} + +static PyObject * +StringWriter_internal(void) { + StringWriterObject *self = (StringWriterObject *)StringWriterType.tp_alloc(&StringWriterType, 0); + if (self == NULL) + return NULL; + StringWriter_init_internal(self); + return (PyObject *)self; +} + +static int +StringWriter_init(StringWriterObject *self, PyObject *args, PyObject *kwds) +{ + if (!PyArg_ParseTuple(args, "")) { + return -1; + } + + if (kwds != NULL && PyDict_Size(kwds) > 0) { + PyErr_SetString(PyExc_TypeError, + "StringWriter() takes no keyword arguments"); + return -1; + } + + StringWriter_init_internal(self); + return 0; +} + +static void +StringWriter_dealloc(StringWriterObject *self) +{ + if (self->buf != self->data) { + PyMem_Free(self->buf); + self->buf = NULL; + } + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject* +StringWriter_getvalue_internal(PyObject *self) +{ + StringWriterObject *obj = (StringWriterObject *)self; + return PyBytes_FromStringAndSize(obj->buf, obj->len); +} + +static PyObject* +StringWriter_repr(StringWriterObject *self) +{ + PyObject *value = StringWriter_getvalue_internal((PyObject *)self); + if (value == NULL) { + return NULL; + } + PyObject *value_repr = PyObject_Repr(value); + Py_DECREF(value); + if (value_repr == NULL) { + return NULL; + } + PyObject *result = PyUnicode_FromFormat("StringWriter(%U)", value_repr); + Py_DECREF(value_repr); + return result; +} + +static PyObject* +StringWriter_getvalue(StringWriterObject *self, PyObject *Py_UNUSED(ignored)) +{ + return PyBytes_FromStringAndSize(self->buf, self->len); +} + +static Py_ssize_t +StringWriter_length(StringWriterObject *self) +{ + return self->len; +} + +static PyObject* +StringWriter_item(StringWriterObject *self, Py_ssize_t index) +{ + Py_ssize_t length = self->len; + + // Check bounds + if (index < 0 || index >= length) { + PyErr_SetString(PyExc_IndexError, "StringWriter index out of range"); + return NULL; + } + + // 
Return the byte at the given index as a Python int + return PyLong_FromLong((unsigned char)self->buf[index]); +} + +static int +StringWriter_ass_item(StringWriterObject *self, Py_ssize_t index, PyObject *value) +{ + Py_ssize_t length = self->len; + + // Check bounds + if (index < 0 || index >= length) { + PyErr_SetString(PyExc_IndexError, "StringWriter index out of range"); + return -1; + } + + // Check that value is not NULL (deletion not supported) + if (value == NULL) { + PyErr_SetString(PyExc_TypeError, "StringWriter does not support item deletion"); + return -1; + } + + // Convert value to uint8 + uint8_t byte_value = CPyLong_AsUInt8(value); + if (unlikely(byte_value == CPY_LL_UINT_ERROR && PyErr_Occurred())) { + CPy_TypeError("u8", value); + return -1; + } + + // Assign the byte + self->buf[index] = (char)byte_value; + return 0; +} + +static PySequenceMethods StringWriter_as_sequence = { + .sq_length = (lenfunc)StringWriter_length, + .sq_item = (ssizeargfunc)StringWriter_item, + .sq_ass_item = (ssizeobjargproc)StringWriter_ass_item, +}; + +static PyObject* StringWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames); +static PyObject* StringWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames); +static PyObject* StringWriter_truncate(PyObject *self, PyObject *const *args, size_t nargs); + +static PyMethodDef StringWriter_methods[] = { + {"append", (PyCFunction) StringWriter_append, METH_FASTCALL | METH_KEYWORDS, + PyDoc_STR("Append a single byte to the buffer") + }, + {"write", (PyCFunction) StringWriter_write, METH_FASTCALL | METH_KEYWORDS, + PyDoc_STR("Append bytes to the buffer") + }, + {"getvalue", (PyCFunction) StringWriter_getvalue, METH_NOARGS, + "Return the buffer content as bytes object" + }, + {"truncate", (PyCFunction) StringWriter_truncate, METH_FASTCALL, + PyDoc_STR("Truncate the buffer to the specified size") + }, + {NULL} /* Sentinel */ +}; + +static PyTypeObject StringWriterType = { 
+ .ob_base = PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "StringWriter", + .tp_doc = PyDoc_STR("Memory buffer for building bytes objects from parts"), + .tp_basicsize = sizeof(StringWriterObject), + .tp_itemsize = 0, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_new = StringWriter_new, + .tp_init = (initproc) StringWriter_init, + .tp_dealloc = (destructor) StringWriter_dealloc, + .tp_methods = StringWriter_methods, + .tp_as_sequence = &StringWriter_as_sequence, + .tp_repr = (reprfunc)StringWriter_repr, +}; + +static inline bool +check_string_writer(PyObject *data) { + if (unlikely(Py_TYPE(data) != &StringWriterType)) { + PyErr_Format( + PyExc_TypeError, "data must be a StringWriter object, got %s", Py_TYPE(data)->tp_name + ); + return false; + } + return true; +} + +static char +StringWriter_write_internal(StringWriterObject *self, PyObject *value) { + const char *data; + Py_ssize_t size; + if (likely(PyBytes_Check(value))) { + data = PyBytes_AS_STRING(value); + size = PyBytes_GET_SIZE(value); + } else { + data = PyByteArray_AS_STRING(value); + size = PyByteArray_GET_SIZE(value); + } + // Write bytes content. 
+ if (!ensure_string_writer_size(self, size)) + return CPY_NONE_ERROR; + memcpy(self->buf + self->len, data, size); + self->len += size; + return CPY_NONE; +} + +static PyObject* +StringWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames) { + static const char * const kwlist[] = {"value", 0}; + static CPyArg_Parser parser = {"O:write", kwlist, 0}; + PyObject *value; + if (unlikely(!CPyArg_ParseStackAndKeywordsSimple(args, nargs, kwnames, &parser, &value))) { + return NULL; + } + if (!check_string_writer(self)) { + return NULL; + } + if (unlikely(!PyBytes_Check(value) && !PyByteArray_Check(value))) { + PyErr_SetString(PyExc_TypeError, "value must be a bytes or bytearray object"); + return NULL; + } + if (unlikely(StringWriter_write_internal((StringWriterObject *)self, value) == CPY_NONE_ERROR)) { + return NULL; + } + Py_INCREF(Py_None); + return Py_None; +} + +static inline char +StringWriter_append_internal(StringWriterObject *self, uint8_t value) { + if (!ensure_string_writer_size(self, 1)) + return CPY_NONE_ERROR; + _WRITE(self, uint8_t, value); + return CPY_NONE; +} + +static PyObject* +StringWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames) { + static const char * const kwlist[] = {"value", 0}; + static CPyArg_Parser parser = {"O:append", kwlist, 0}; + PyObject *value; + if (unlikely(!CPyArg_ParseStackAndKeywordsSimple(args, nargs, kwnames, &parser, &value))) { + return NULL; + } + if (!check_string_writer(self)) { + return NULL; + } + uint8_t unboxed = CPyLong_AsUInt8(value); + if (unlikely(unboxed == CPY_LL_UINT_ERROR && PyErr_Occurred())) { + CPy_TypeError("u8", value); + return NULL; + } + if (unlikely(StringWriter_append_internal((StringWriterObject *)self, unboxed) == CPY_NONE_ERROR)) { + return NULL; + } + Py_INCREF(Py_None); + return Py_None; +} + +static char +StringWriter_truncate_internal(PyObject *self, int64_t size) { + StringWriterObject *writer = (StringWriterObject *)self; + 
Py_ssize_t current_size = writer->len; + + // Validate size is non-negative + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "size must be non-negative"); + return CPY_NONE_ERROR; + } + + // Validate size doesn't exceed current size + if (size > current_size) { + PyErr_SetString(PyExc_ValueError, "size cannot be larger than current buffer size"); + return CPY_NONE_ERROR; + } + + writer->len = size; + return CPY_NONE; +} + +static PyObject* +StringWriter_truncate(PyObject *self, PyObject *const *args, size_t nargs) { + if (unlikely(nargs != 1)) { + PyErr_Format(PyExc_TypeError, + "truncate() takes exactly 1 argument (%zu given)", nargs); + return NULL; + } + if (!check_string_writer(self)) { + return NULL; + } + + PyObject *size_obj = args[0]; + int overflow; + long long size = PyLong_AsLongLongAndOverflow(size_obj, &overflow); + + if (size == -1 && PyErr_Occurred()) { + return NULL; + } + if (overflow != 0) { + PyErr_SetString(PyExc_ValueError, "integer out of range"); + return NULL; + } + + if (unlikely(StringWriter_truncate_internal(self, size) == CPY_NONE_ERROR)) { + return NULL; + } + Py_INCREF(Py_None); + return Py_None; +} + +static PyTypeObject * +StringWriter_type_internal(void) { + return &StringWriterType; // Return borrowed reference +}; + +static CPyTagged +StringWriter_len_internal(PyObject *self) { + StringWriterObject *writer = (StringWriterObject *)self; + return writer->len << 1; +} + +// End of StringWriter + #endif static PyMethodDef librt_strings_module_methods[] = { diff --git a/mypyc/lib-rt/librt_strings.h b/mypyc/lib-rt/librt_strings.h index 069bec84a311..f3c106245174 100644 --- a/mypyc/lib-rt/librt_strings.h +++ b/mypyc/lib-rt/librt_strings.h @@ -41,6 +41,14 @@ typedef struct { char data[WRITER_EMBEDDED_BUF_LEN]; // Default buffer } BytesWriterObject; +typedef struct { + PyObject_HEAD + char *buf; // Beginning of the buffer + Py_ssize_t len; // Current length (number of bytes written) + Py_ssize_t capacity; // Total capacity of the 
buffer + char data[WRITER_EMBEDDED_BUF_LEN]; // Default buffer +} StringWriterObject; + #define LibRTStrings_ABIVersion (*(int (*)(void)) LibRTStrings_API[0]) #define LibRTStrings_APIVersion (*(int (*)(void)) LibRTStrings_API[1]) #define LibRTStrings_BytesWriter_internal (*(PyObject* (*)(void)) LibRTStrings_API[2]) From e322bb4eeaf766fd8133faafb93ecbd4465eb9e8 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Tue, 13 Jan 2026 18:08:00 +0000 Subject: [PATCH 02/19] Start implementing StringWriter --- mypy/typeshed/stubs/librt/librt/strings.pyi | 18 +++++++++++++----- mypyc/lib-rt/librt_strings.c | 21 ++++++++++++++------- mypyc/lib-rt/librt_strings.h | 5 +++-- mypyc/test-data/run-librt-strings.test | 6 +++++- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi index 241f6a6fba5b..448b06bdebcf 100644 --- a/mypy/typeshed/stubs/librt/librt/strings.pyi +++ b/mypy/typeshed/stubs/librt/librt/strings.pyi @@ -1,13 +1,21 @@ from typing import final -from mypy_extensions import i64, u8 +from mypy_extensions import i64, i32, u8 @final class BytesWriter: def append(self, /, x: int) -> None: ... - def write(self, /, b: bytes | bytearray) -> None: ... + def write(self, b: bytes | bytearray, /) -> None: ... def getvalue(self) -> bytes: ... - def truncate(self, /, size: i64) -> None: ... + def truncate(self, size: i64, /) -> None: ... def __len__(self) -> i64: ... - def __getitem__(self, /, i: i64) -> u8: ... - def __setitem__(self, /, i: i64, x: u8) -> None: ... + def __getitem__(self, i: i64, /) -> u8: ... + def __setitem__(self, i: i64, x: u8, /) -> None: ... + +@final +class StringWriter: + def append(self, x: int, /) -> None: ... + def write(self, s: str, /) -> None: ... + def getvalue(self) -> str: ... + def __len__(self) -> i64: ... + def __getitem__(self, i: i64, /) -> i32: ... 
diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index f42daa151d4e..9e578bb7e115 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -12,7 +12,7 @@ // BytesWriter // -#define _WRITE(data, type, v) \ +#define _WRITE_BYTES(data, type, v) \ do { \ *(type *)(((BytesWriterObject *)data)->buf + ((BytesWriterObject *)data)->len) = v; \ ((BytesWriterObject *)data)->len += sizeof(type); \ @@ -290,7 +290,7 @@ static inline char BytesWriter_append_internal(BytesWriterObject *self, uint8_t value) { if (!ensure_bytes_writer_size(self, 1)) return CPY_NONE_ERROR; - _WRITE(self, uint8_t, value); + _WRITE_BYTES(self, uint8_t, value); return CPY_NONE; } @@ -421,6 +421,7 @@ ensure_string_writer_size(StringWriterObject *data, Py_ssize_t n) { static inline void StringWriter_init_internal(StringWriterObject *self) { self->buf = self->data; + self->kind = 1; self->len = 0; self->capacity = WRITER_EMBEDDED_BUF_LEN; } @@ -480,7 +481,7 @@ static PyObject* StringWriter_getvalue_internal(PyObject *self) { StringWriterObject *obj = (StringWriterObject *)self; - return PyBytes_FromStringAndSize(obj->buf, obj->len); + return PyUnicode_FromKindAndData(obj->kind, obj->buf, obj->len); } static PyObject* @@ -503,7 +504,7 @@ StringWriter_repr(StringWriterObject *self) static PyObject* StringWriter_getvalue(StringWriterObject *self, PyObject *Py_UNUSED(ignored)) { - return PyBytes_FromStringAndSize(self->buf, self->len); + return PyUnicode_FromKindAndData(self->kind, self->buf, self->len); } static Py_ssize_t @@ -574,7 +575,7 @@ static PyMethodDef StringWriter_methods[] = { PyDoc_STR("Append bytes to the buffer") }, {"getvalue", (PyCFunction) StringWriter_getvalue, METH_NOARGS, - "Return the buffer content as bytes object" + "Return the buffer content as str object" }, {"truncate", (PyCFunction) StringWriter_truncate, METH_FASTCALL, PyDoc_STR("Truncate the buffer to the specified size") @@ -585,7 +586,7 @@ static PyMethodDef StringWriter_methods[] = 
{ static PyTypeObject StringWriterType = { .ob_base = PyVarObject_HEAD_INIT(NULL, 0) .tp_name = "StringWriter", - .tp_doc = PyDoc_STR("Memory buffer for building bytes objects from parts"), + .tp_doc = PyDoc_STR("Memory buffer for building string objects from parts"), .tp_basicsize = sizeof(StringWriterObject), .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT, @@ -653,7 +654,7 @@ static inline char StringWriter_append_internal(StringWriterObject *self, uint8_t value) { if (!ensure_string_writer_size(self, 1)) return CPY_NONE_ERROR; - _WRITE(self, uint8_t, value); + // TODO: Replace _WRITE_BYTES(self, uint8_t, value); return CPY_NONE; } @@ -771,9 +772,15 @@ librt_strings_module_exec(PyObject *m) if (PyType_Ready(&BytesWriterType) < 0) { return -1; } + if (PyType_Ready(&StringWriterType) < 0) { + return -1; + } if (PyModule_AddObjectRef(m, "BytesWriter", (PyObject *) &BytesWriterType) < 0) { return -1; } + if (PyModule_AddObjectRef(m, "StringWriter", (PyObject *) &StringWriterType) < 0) { + return -1; + } // Export mypy internal C API, be careful with the order! 
static void *librt_strings_api[LIBRT_STRINGS_API_LEN] = { diff --git a/mypyc/lib-rt/librt_strings.h b/mypyc/lib-rt/librt_strings.h index f3c106245174..c45e3d3e9875 100644 --- a/mypyc/lib-rt/librt_strings.h +++ b/mypyc/lib-rt/librt_strings.h @@ -44,8 +44,9 @@ typedef struct { typedef struct { PyObject_HEAD char *buf; // Beginning of the buffer - Py_ssize_t len; // Current length (number of bytes written) - Py_ssize_t capacity; // Total capacity of the buffer + char kind; // Bytes per code point (1, 2 or 4) + Py_ssize_t len; // Current length (number of code points written) + Py_ssize_t capacity; // Total capacity of the buffer (number of code points) char data[WRITER_EMBEDDED_BUF_LEN]; // Default buffer } StringWriterObject; diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index f3e0b7b13100..4e0e1db5ec62 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -4,7 +4,7 @@ import base64 import binascii import random -from librt.strings import BytesWriter +from librt.strings import BytesWriter, StringWriter from testutil import assertRaises @@ -189,6 +189,10 @@ def test_bytes_writer_wrapper_functions() -> None: with assertRaises(TypeError): b[0] = 256 +def test_string_writer_basics() -> None: + w = StringWriter() + assert w.getvalue() == "" + [case testStringsFeaturesNotAvailableInNonExperimentalBuild_librt] # This also ensures librt.strings can be built without experimental features import librt.strings From dbaf334eb0f14db1e59b135092a68ac8dd7c6520 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 11:26:23 +0000 Subject: [PATCH 03/19] WIP start work on StringWriter append --- mypyc/lib-rt/librt_strings.c | 140 +++++++++++++++++++++++++++++++++-- 1 file changed, 135 insertions(+), 5 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 9e578bb7e115..8490ee07fae1 100644 --- a/mypyc/lib-rt/librt_strings.c +++ 
b/mypyc/lib-rt/librt_strings.c @@ -650,14 +650,144 @@ StringWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject return Py_None; } -static inline char -StringWriter_append_internal(StringWriterObject *self, uint8_t value) { - if (!ensure_string_writer_size(self, 1)) - return CPY_NONE_ERROR; - // TODO: Replace _WRITE_BYTES(self, uint8_t, value); +static void convert_string_data(char *src_buf, char *dest_buf, Py_ssize_t len, + char old_kind, char new_kind) { + bool in_place = (src_buf == dest_buf); + + if (old_kind == 1 && new_kind == 2) { + uint8_t *src = (uint8_t *)src_buf; + uint16_t *dest = (uint16_t *)dest_buf; + if (in_place) { + // Convert backwards to avoid overwriting + for (Py_ssize_t i = len - 1; i >= 0; i--) { + dest[i] = src[i]; + } + } else { + // Convert forwards + for (Py_ssize_t i = 0; i < len; i++) { + dest[i] = src[i]; + } + } + } else if (old_kind == 2 && new_kind == 4) { + uint16_t *src = (uint16_t *)src_buf; + uint32_t *dest = (uint32_t *)dest_buf; + if (in_place) { + // Convert backwards to avoid overwriting + for (Py_ssize_t i = len - 1; i >= 0; i--) { + dest[i] = src[i]; + } + } else { + // Convert forwards + for (Py_ssize_t i = 0; i < len; i++) { + dest[i] = src[i]; + } + } + } +} + +static char convert_string_buffer_kind(StringWriterObject *self, char old_kind, char new_kind) { + // Current buffer size in bytes + Py_ssize_t current_buf_size = (self->buf == self->data) ? 
WRITER_EMBEDDED_BUF_LEN : (self->capacity * old_kind); + // Needed buffer size in bytes for new kind + Py_ssize_t needed_size = self->len * new_kind; + + if (current_buf_size >= needed_size) { + // Convert in place + convert_string_data(self->buf, self->buf, self->len, old_kind, new_kind); + self->kind = new_kind; + self->capacity = current_buf_size / new_kind; + } else { + // Allocate new buffer + Py_ssize_t new_capacity = self->capacity; + do { + new_capacity *= 2; + } while (new_capacity * new_kind < needed_size); + + char *new_buf = PyMem_Malloc(new_capacity * new_kind); + if (unlikely(new_buf == NULL)) { + PyErr_NoMemory(); + return CPY_NONE_ERROR; + } + + // Convert data during copy + convert_string_data(self->buf, new_buf, self->len, old_kind, new_kind); + + // Free old buffer if it was heap-allocated + if (self->buf != self->data) { + PyMem_Free(self->buf); + } + + self->buf = new_buf; + self->kind = new_kind; + self->capacity = new_capacity; + } return CPY_NONE; } +static char string_writer_switch_kind(StringWriterObject *self, int32_t value) { + if (self->kind == 1) { + // Either kind 1 -> 2 or 1 -> 4. First switch to kind 2. 
+ if (convert_string_buffer_kind(self, 1, 2) == CPY_NONE_ERROR) + return CPY_NONE_ERROR; + if ((uint32_t)value > 0xffff) { + // Call recursively to switch from kind 2 to 4 + return string_writer_switch_kind(self, value); + } + return CPY_NONE; + } else { + // Must be kind 2 -> 4 + return convert_string_buffer_kind(self, 2, 4); + } +} + +static char string_append_slow_path(StringWriterObject *self, int32_t value) { + if (self->kind == 2) { + if ((uint32_t)value <= 0xffff) { + if (!ensure_string_writer_size(self, 1)) + return CPY_NONE_ERROR; + // Copy 2-byte character to buffer + uint16_t val16 = (uint16_t)value; + memcpy(self->buf + self->len * 2, &val16, 2); + self->len++; + return CPY_NONE; + } + if (string_writer_switch_kind(self, value) == CPY_NONE_ERROR) + return CPY_NONE_ERROR; + return string_append_slow_path(self, value); + } else if (self->kind == 1) { + // Check precondition -- this must only be used on slow path + assert((uint32_t)value > 0xff); + if (string_writer_switch_kind(self, value) == CPY_NONE_ERROR) + return CPY_NONE_ERROR; + return string_append_slow_path(self, value); + } + assert(self->kind == 4); + if ((uint32_t)value < (1 << 20)) { + if (!ensure_string_writer_size(self, 1)) + return CPY_NONE_ERROR; + // Copy 4-byte character to buffer + uint32_t val32 = (uint32_t)value; + memcpy(self->buf + self->len * 4, &val32, 4); + self->len++; + return CPY_NONE; + } + // TODO: exception + return CPY_NONE_ERROR; +} + +static inline char +StringWriter_append_internal(StringWriterObject *self, int32_t value) { + char kind = self->kind; + if (kind == 1 && (uint32_t)value < 256) { + if (!ensure_string_writer_size(self, 1)) + return CPY_NONE_ERROR; + self->buf[self->index++] = value; + self->kind = kind; + return CPY_NONE; + } + return string_append_slow_path(self, value); +} + static PyObject* StringWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames) { static const char * const kwlist[] = {"value", 0}; From 
9357f07c961314851df389dfa405262c9e569408 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 11:29:47 +0000 Subject: [PATCH 04/19] Drop truncate and __setitem__ and fix compilation --- mypyc/lib-rt/librt_strings.c | 87 +----------------------------------- 1 file changed, 1 insertion(+), 86 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 8490ee07fae1..34813d4565af 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -528,44 +528,13 @@ StringWriter_item(StringWriterObject *self, Py_ssize_t index) return PyLong_FromLong((unsigned char)self->buf[index]); } -static int -StringWriter_ass_item(StringWriterObject *self, Py_ssize_t index, PyObject *value) -{ - Py_ssize_t length = self->len; - - // Check bounds - if (index < 0 || index >= length) { - PyErr_SetString(PyExc_IndexError, "StringWriter index out of range"); - return -1; - } - - // Check that value is not NULL (deletion not supported) - if (value == NULL) { - PyErr_SetString(PyExc_TypeError, "StringWriter does not support item deletion"); - return -1; - } - - // Convert value to uint8 - uint8_t byte_value = CPyLong_AsUInt8(value); - if (unlikely(byte_value == CPY_LL_UINT_ERROR && PyErr_Occurred())) { - CPy_TypeError("u8", value); - return -1; - } - - // Assign the byte - self->buf[index] = (char)byte_value; - return 0; -} - static PySequenceMethods StringWriter_as_sequence = { .sq_length = (lenfunc)StringWriter_length, .sq_item = (ssizeargfunc)StringWriter_item, - .sq_ass_item = (ssizeobjargproc)StringWriter_ass_item, }; static PyObject* StringWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames); static PyObject* StringWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames); -static PyObject* StringWriter_truncate(PyObject *self, PyObject *const *args, size_t nargs); static PyMethodDef StringWriter_methods[] = { {"append", (PyCFunction) StringWriter_append, 
METH_FASTCALL | METH_KEYWORDS, @@ -577,9 +546,6 @@ static PyMethodDef StringWriter_methods[] = { {"getvalue", (PyCFunction) StringWriter_getvalue, METH_NOARGS, "Return the buffer content as str object" }, - {"truncate", (PyCFunction) StringWriter_truncate, METH_FASTCALL, - PyDoc_STR("Truncate the buffer to the specified size") - }, {NULL} /* Sentinel */ }; @@ -781,7 +747,7 @@ StringWriter_append_internal(StringWriterObject *self, int32_t value) { if (kind == 1 && (uint32_t)value < 256) { if (!ensure_string_writer_size(self, 1)) return CPY_NONE_ERROR; - self->buf[self->index++] = value; + self->buf[self->len++] = value; self->kind = kind; return CPY_NONE; } @@ -811,57 +777,6 @@ StringWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObjec return Py_None; } -static char -StringWriter_truncate_internal(PyObject *self, int64_t size) { - StringWriterObject *writer = (StringWriterObject *)self; - Py_ssize_t current_size = writer->len; - - // Validate size is non-negative - if (size < 0) { - PyErr_SetString(PyExc_ValueError, "size must be non-negative"); - return CPY_NONE_ERROR; - } - - // Validate size doesn't exceed current size - if (size > current_size) { - PyErr_SetString(PyExc_ValueError, "size cannot be larger than current buffer size"); - return CPY_NONE_ERROR; - } - - writer->len = size; - return CPY_NONE; -} - -static PyObject* -StringWriter_truncate(PyObject *self, PyObject *const *args, size_t nargs) { - if (unlikely(nargs != 1)) { - PyErr_Format(PyExc_TypeError, - "truncate() takes exactly 1 argument (%zu given)", nargs); - return NULL; - } - if (!check_string_writer(self)) { - return NULL; - } - - PyObject *size_obj = args[0]; - int overflow; - long long size = PyLong_AsLongLongAndOverflow(size_obj, &overflow); - - if (size == -1 && PyErr_Occurred()) { - return NULL; - } - if (overflow != 0) { - PyErr_SetString(PyExc_ValueError, "integer out of range"); - return NULL; - } - - if (unlikely(StringWriter_truncate_internal(self, size) == 
CPY_NONE_ERROR)) { - return NULL; - } - Py_INCREF(Py_None); - return Py_None; -} - static PyTypeObject * StringWriter_type_internal(void) { return &StringWriterType; // Return borrowed reference From e60ea5eb44d4be6965b11d2af4ea8e29b1d2f26f Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 11:38:33 +0000 Subject: [PATCH 05/19] Fix append + add some test coverage --- mypyc/lib-rt/librt_strings.c | 6 +++--- mypyc/test-data/run-librt-strings.test | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 34813d4565af..27d4fe1147c6 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -765,9 +765,9 @@ StringWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObjec if (!check_string_writer(self)) { return NULL; } - uint8_t unboxed = CPyLong_AsUInt8(value); - if (unlikely(unboxed == CPY_LL_UINT_ERROR && PyErr_Occurred())) { - CPy_TypeError("u8", value); + int32_t unboxed = CPyLong_AsInt32(value); + if (unlikely(unboxed == CPY_LL_INT_ERROR && PyErr_Occurred())) { + CPy_TypeError("i32", value); return NULL; } if (unlikely(StringWriter_append_internal((StringWriterObject *)self, unboxed) == CPY_NONE_ERROR)) { diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 4e0e1db5ec62..9584543faa10 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -193,6 +193,23 @@ def test_string_writer_basics() -> None: w = StringWriter() assert w.getvalue() == "" +def test_string_writer_append() -> None: + w = StringWriter() + w.append(ord('a')) + assert w.getvalue() == "a" + w.append(0xff) + assert w.getvalue() == "a\xff" + + # Switch kind 1->2 + w.append(0x100) + assert w.getvalue() == "a\xff\u0100", w.getvalue() + w.append(0xffff) + assert w.getvalue() == "a\xff\u0100\uffff" + + # Switch kind 2->4 + w.append(0x10000) + assert w.getvalue() == 
"a\xff\u0100\uffff" + chr(0x10000) + [case testStringsFeaturesNotAvailableInNonExperimentalBuild_librt] # This also ensures librt.strings can be built without experimental features import librt.strings From 1193fc1def2238ce3021c1f7d8257dad27b706d3 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 15:19:21 +0000 Subject: [PATCH 06/19] Fix growing buffer (no tests yet) --- mypyc/lib-rt/librt_strings.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 27d4fe1147c6..b08397bd620e 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -389,17 +389,21 @@ static bool _grow_buffer_string(StringWriterObject *data, Py_ssize_t n) { Py_ssize_t target = data->len + n; Py_ssize_t size = data->capacity; + char kind = data->kind; do { size *= 2; } while (target >= size); + // Calculate size in bytes + Py_ssize_t size_bytes = size * kind; if (data->buf == data->data) { // Move from embedded buffer to heap-allocated buffer - data->buf = PyMem_Malloc(size); + data->buf = PyMem_Malloc(size_bytes); if (data->buf != NULL) { - memcpy(data->buf, data->data, WRITER_EMBEDDED_BUF_LEN); + // Copy existing data (len * kind bytes) + memcpy(data->buf, data->data, data->len * kind); } } else { - data->buf = PyMem_Realloc(data->buf, size); + data->buf = PyMem_Realloc(data->buf, size_bytes); } if (unlikely(data->buf == NULL)) { PyErr_NoMemory(); From 47ef2e49b692c28c9777bbf3c07dd742e5a6ab7d Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 15:24:48 +0000 Subject: [PATCH 07/19] Test buffer growth --- mypyc/test-data/run-librt-strings.test | 83 ++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 9584543faa10..73731cffc895 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -210,6 +210,89 @@ def 
test_string_writer_append() -> None: w.append(0x10000) assert w.getvalue() == "a\xff\u0100\uffff" + chr(0x10000) +def test_string_writer_append_grow_same_kind() -> None: + # Test growing buffer while staying in kind 1 (ASCII) + w = StringWriter() + # Append enough ASCII characters to grow beyond embedded buffer + for i in range(1000): + w.append(ord('a') + (i % 26)) + assert len(w) == i + 1 + result = w.getvalue() + assert len(result) == 1000 + for i in range(1000): + assert result[i] == chr(ord('a') + (i % 26)) + + # Test growing buffer while staying in kind 2 + w2 = StringWriter() + w2.append(0x100) # Switch to kind 2 + for i in range(1000): + w2.append(0x100 + (i % 100)) + assert len(w2) == i + 2 + result2 = w2.getvalue() + assert len(result2) == 1001 + assert result2[0] == chr(0x100) + for i in range(1000): + assert result2[i + 1] == chr(0x100 + (i % 100)) + + # Test growing buffer while staying in kind 4 + w3 = StringWriter() + w3.append(0x10000) # Switch to kind 4 + for i in range(500): + w3.append(0x10000 + (i % 100)) + assert len(w3) == i + 2 + result3 = w3.getvalue() + assert len(result3) == 501 + assert result3[0] == chr(0x10000) + for i in range(500): + assert result3[i + 1] == chr(0x10000 + (i % 100)) + +def test_string_writer_append_grow_and_switch_kind() -> None: + # Test growing buffer AND switching from kind 1 to kind 2 + w = StringWriter() + # Fill with ASCII to grow buffer + for i in range(500): + w.append(ord('A')) + assert len(w) == 500 + # Now append non-ASCII that requires kind 2, triggering both grow and kind switch + for i in range(500): + w.append(0x100 + i) + result = w.getvalue() + assert len(result) == 1000 + for i in range(500): + assert result[i] == 'A' + for i in range(500): + assert result[500 + i] == chr(0x100 + i) + + # Test growing buffer AND switching from kind 2 to kind 4 + w2 = StringWriter() + w2.append(0x100) # Switch to kind 2 + # Fill with kind 2 characters to grow buffer + for i in range(300): + w2.append(0x200 + (i % 
100)) + assert len(w2) == 301 + # Now append characters that require kind 4, triggering both grow and kind switch + for i in range(300): + w2.append(0x10000 + i) + result2 = w2.getvalue() + assert len(result2) == 601 + assert result2[0] == chr(0x100) + for i in range(300): + assert result2[1 + i] == chr(0x200 + (i % 100)) + for i in range(300): + assert result2[301 + i] == chr(0x10000 + i) + + # Test switching kind 1->4 with buffer growth + w3 = StringWriter() + for i in range(300): + w3.append(ord('X')) + # Jump directly to kind 4 + w3.append(0x10000) + result3 = w3.getvalue() + assert len(result3) == 301 + for i in range(300): + assert result3[i] == 'X' + assert result3[300] == chr(0x10000) + [case testStringsFeaturesNotAvailableInNonExperimentalBuild_librt] # This also ensures librt.strings can be built without experimental features import librt.strings From d3d38fe1ef4b0fbdca85c39da124c1d203a26c84 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 15:31:21 +0000 Subject: [PATCH 08/19] Fix get item and add more tests --- mypyc/lib-rt/librt_strings.c | 12 ++- mypyc/test-data/run-librt-strings.test | 118 +++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index b08397bd620e..1501019d459a 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -528,8 +528,16 @@ StringWriter_item(StringWriterObject *self, Py_ssize_t index) return NULL; } - // Return the byte at the given index as a Python int - return PyLong_FromLong((unsigned char)self->buf[index]); + // Read the character at the given index based on kind + uint32_t value; + if (self->kind == 1) { + value = ((uint8_t *)self->buf)[index]; + } else if (self->kind == 2) { + value = ((uint16_t *)self->buf)[index]; + } else { + value = ((uint32_t *)self->buf)[index]; + } + return PyLong_FromLong(value); } static PySequenceMethods StringWriter_as_sequence = { diff --git 
a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 73731cffc895..06f3613e81d4 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -193,6 +193,124 @@ def test_string_writer_basics() -> None: w = StringWriter() assert w.getvalue() == "" +def test_string_writer_repr() -> None: + # Kind 1 (ASCII) + w = StringWriter() + assert repr(w) == "StringWriter('')" + w.append(ord('h')) + w.append(ord('i')) + assert repr(w) == "StringWriter('hi')" + + # Kind 2 (UCS-2) + w2 = StringWriter() + w2.append(0x100) + w2.append(0x200) + assert repr(w2) == "StringWriter('" + chr(0x100) + chr(0x200) + "')" + + # Kind 4 (UCS-4) + w3 = StringWriter() + w3.append(0x10000) + expected = "StringWriter('" + chr(0x10000) + "')" + assert repr(w3) == expected + +def test_string_writer_len() -> None: + # Kind 1 (ASCII) + w = StringWriter() + assert len(w) == 0 + w.append(ord('a')) + assert len(w) == 1 + w.append(ord('b')) + w.append(ord('c')) + assert len(w) == 3 + + # Kind 2 (UCS-2) + w2 = StringWriter() + w2.append(0x100) + assert len(w2) == 1 + for i in range(10): + w2.append(0x200 + i) + assert len(w2) == 11 + + # Kind 4 (UCS-4) + w3 = StringWriter() + w3.append(0x10000) + assert len(w3) == 1 + w3.append(0x10001) + w3.append(0x10002) + assert len(w3) == 3 + + # Test len after growing buffer + w4 = StringWriter() + for i in range(500): + w4.append(ord('X')) + assert len(w4) == 500 + +def test_string_writer_get_item() -> None: + # Kind 1 (ASCII) + w = StringWriter() + w.append(ord('f')) + w.append(ord('o')) + w.append(ord('o')) + assert w[0 + int()] == ord('f') + assert w[1 + int()] == ord('o') + assert w[2 + int()] == ord('o') + assert w[-1 + int()] == ord('o') + assert w[-2 + int()] == ord('o') + assert w[-3 + int()] == ord('f') + + with assertRaises(IndexError): + w[3 + int()] + with assertRaises(IndexError): + w[-4 + int()] + with assertRaises(IndexError): + w[1 << 50] + with assertRaises(IndexError): + w[-(1 
<< 50)] + + # Kind 2 (UCS-2) + w2 = StringWriter() + w2.append(0x100) + w2.append(0x200) + w2.append(0x300) + assert w2[0 + int()] == 0x100 + assert w2[1 + int()] == 0x200 + assert w2[2 + int()] == 0x300 + assert w2[-1 + int()] == 0x300 + assert w2[-2 + int()] == 0x200 + assert w2[-3 + int()] == 0x100 + + with assertRaises(IndexError): + w2[3 + int()] + with assertRaises(IndexError): + w2[-4 + int()] + + # Kind 4 (UCS-4) + w3 = StringWriter() + w3.append(0x10000) + w3.append(0x10001) + w3.append(0x10002) + assert w3[0 + int()] == 0x10000 + assert w3[1 + int()] == 0x10001 + assert w3[2 + int()] == 0x10002 + assert w3[-1 + int()] == 0x10002 + assert w3[-2 + int()] == 0x10001 + assert w3[-3 + int()] == 0x10000 + + with assertRaises(IndexError): + w3[3 + int()] + with assertRaises(IndexError): + w3[-4 + int()] + + # Test get_item after buffer growth + w4 = StringWriter() + for i in range(1000): + w4.append(ord('a') + (i % 26)) + assert w4[0 + int()] == ord('a') + assert w4[999 + int()] == ord('a') + (999 % 26) + assert w4[500 + int()] == ord('a') + (500 % 26) + assert w4[-1 + int()] == ord('a') + (999 % 26) + assert w4[-1000 + int()] == ord('a') + def test_string_writer_append() -> None: w = StringWriter() w.append(ord('a')) From 962ae0bedfd096f5f9948ec20ee3af75fd8a76d3 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 15:34:30 +0000 Subject: [PATCH 09/19] Test repr --- mypyc/test-data/run-librt-strings.test | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 06f3613e81d4..f1dc4128ca13 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -213,6 +213,28 @@ def test_string_writer_repr() -> None: expected = "StringWriter('" + chr(0x10000) + "')" assert repr(w3) == expected +def test_string_writer_repr_escaping() -> None: + # Kind 1: Test escaping of newline, nul, tab, backslash + w = StringWriter() + 
w.append(ord('a')) + w.append(ord('\n')) + w.append(0) + w.append(ord('\t')) + w.append(ord('\\')) + assert repr(w) == "StringWriter('a\\n\\x00\\t\\\\')" + + # Kind 2: escaping with UCS-2 + w2 = StringWriter() + w2.append(0x100) + w2.append(ord('\n')) + assert repr(w2) == "StringWriter('" + chr(0x100) + "\\n')" + + # Kind 4: escaping with UCS-4 + w3 = StringWriter() + w3.append(0x10000) + w3.append(0) + assert repr(w3) == "StringWriter('" + chr(0x10000) + "\\x00')" + def test_string_writer_len() -> None: # Kind 1 (ASCII) w = StringWriter() From dbf7ffbbb2d9b676bc3b4e85ea1546d513115ce5 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 15:49:41 +0000 Subject: [PATCH 10/19] Update StringWriter write --- mypyc/lib-rt/librt_strings.c | 57 ++++++++++++++++++-------- mypyc/test-data/run-librt-strings.test | 48 ++++++++++++++++++++++ 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 1501019d459a..7570ec5fa9e1 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -550,10 +550,10 @@ static PyObject* StringWriter_write(PyObject *self, PyObject *const *args, size_ static PyMethodDef StringWriter_methods[] = { {"append", (PyCFunction) StringWriter_append, METH_FASTCALL | METH_KEYWORDS, - PyDoc_STR("Append a single byte to the buffer") + PyDoc_STR("Append a single character (as int codepoint) to the buffer") }, {"write", (PyCFunction) StringWriter_write, METH_FASTCALL | METH_KEYWORDS, - PyDoc_STR("Append bytes to the buffer") + PyDoc_STR("Append a string to the buffer") }, {"getvalue", (PyCFunction) StringWriter_getvalue, METH_NOARGS, "Return the buffer content as str object" @@ -587,22 +587,47 @@ check_string_writer(PyObject *data) { return true; } +// Forward declaration +static char string_writer_switch_kind(StringWriterObject *self, int32_t value); + static char StringWriter_write_internal(StringWriterObject *self, PyObject *value) { - const char 
*data; - Py_ssize_t size; - if (likely(PyBytes_Check(value))) { - data = PyBytes_AS_STRING(value); - size = PyBytes_GET_SIZE(value); - } else { - data = PyByteArray_AS_STRING(value); - size = PyByteArray_GET_SIZE(value); + // Get string info + Py_ssize_t str_len = PyUnicode_GET_LENGTH(value); + if (str_len == 0) { + return CPY_NONE; } - // Write bytes content. - if (!ensure_string_writer_size(self, size)) + + int src_kind = PyUnicode_KIND(value); + void *src_data = PyUnicode_DATA(value); + + // Switch kind if source requires wider characters + if (src_kind > self->kind) { + // Use max value for the source kind to trigger proper kind switch + int32_t max_for_kind = (src_kind == 2) ? 0x100 : 0x10000; + if (string_writer_switch_kind(self, max_for_kind) == CPY_NONE_ERROR) { + return CPY_NONE_ERROR; + } + } + + // Ensure we have enough space + if (!ensure_string_writer_size(self, str_len)) { return CPY_NONE_ERROR; - memcpy(self->buf + self->len, data, size); - self->len += size; + } + + // Copy data - ASCII/Latin1 (kind 1) are handled uniformly + if (self->kind == src_kind) { + // Same kind, direct copy + memcpy(self->buf + self->len * self->kind, src_data, str_len * src_kind); + } else { + // Different kinds, convert character by character + for (Py_ssize_t i = 0; i < str_len; i++) { + Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, i); + PyUnicode_WRITE(self->kind, self->buf, self->len + i, ch); + } + } + + self->len += str_len; return CPY_NONE; } @@ -617,8 +642,8 @@ StringWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject if (!check_string_writer(self)) { return NULL; } - if (unlikely(!PyBytes_Check(value) && !PyByteArray_Check(value))) { - PyErr_SetString(PyExc_TypeError, "value must be a bytes or bytearray object"); + if (unlikely(!PyUnicode_Check(value))) { + PyErr_SetString(PyExc_TypeError, "value must be a str object"); return NULL; } if (unlikely(StringWriter_write_internal((StringWriterObject *)self, value) == CPY_NONE_ERROR)) { diff 
--git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index f1dc4128ca13..04cdcc21cdb6 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -350,6 +350,54 @@ def test_string_writer_append() -> None: w.append(0x10000) assert w.getvalue() == "a\xff\u0100\uffff" + chr(0x10000) +def test_string_writer_write() -> None: + # Kind 1: Write ASCII strings + w = StringWriter() + w.write("hello") + assert w.getvalue() == "hello" + w.write(" world") + assert w.getvalue() == "hello world" + + # Write empty string + w.write("") + assert w.getvalue() == "hello world" + + # Kind 1 -> Kind 2: Write string with UCS-2 characters + w2 = StringWriter() + w2.write("abc") + assert w2.getvalue() == "abc" + w2.write(chr(0x100) + chr(0x200)) + assert w2.getvalue() == "abc" + chr(0x100) + chr(0x200) + w2.write("xyz") + assert w2.getvalue() == "abc" + chr(0x100) + chr(0x200) + "xyz" + + # Kind 2: Write all UCS-2 + w3 = StringWriter() + w3.append(0x100) + w3.write(chr(0x200) + chr(0x300)) + assert w3.getvalue() == chr(0x100) + chr(0x200) + chr(0x300) + + # Kind 2 -> Kind 4: Write string with UCS-4 characters + w4 = StringWriter() + w4.write(chr(0x100)) + w4.write(chr(0x10000)) + assert w4.getvalue() == chr(0x100) + chr(0x10000) + + # Kind 4: Write mixed + w5 = StringWriter() + w5.append(0x10000) + w5.write("abc") + w5.write(chr(0x200)) + w5.write(chr(0x10001)) + assert w5.getvalue() == chr(0x10000) + "abc" + chr(0x200) + chr(0x10001) + + # Test with longer strings to trigger buffer growth + w6 = StringWriter() + for _ in range(100): + w6.write("hello") + assert w6.getvalue() == "hello" * 100 + assert len(w6) == 500 + def test_string_writer_append_grow_same_kind() -> None: # Test growing buffer while staying in kind 1 (ASCII) w = StringWriter() From 0faacb228b532af0e9beef6a71cfe6895b521164 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 16:19:01 +0000 Subject: [PATCH 11/19] Avoid pointer casts 
--- mypyc/lib-rt/librt_strings.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 7570ec5fa9e1..7d64a9c3935a 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -528,14 +528,18 @@ StringWriter_item(StringWriterObject *self, Py_ssize_t index) return NULL; } - // Read the character at the given index based on kind + // Read the character at the given index based on kind using memcpy uint32_t value; if (self->kind == 1) { - value = ((uint8_t *)self->buf)[index]; + uint8_t val; + memcpy(&val, self->buf + index, 1); + value = val; } else if (self->kind == 2) { - value = ((uint16_t *)self->buf)[index]; + uint16_t val; + memcpy(&val, self->buf + index * 2, 2); + value = val; } else { - value = ((uint32_t *)self->buf)[index]; + memcpy(&value, self->buf + index * 4, 4); } return PyLong_FromLong(value); } From 526d2aca40036aec81ed631560e272556c98575d Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 16:24:01 +0000 Subject: [PATCH 12/19] Clean up unnecessary pointer casts --- mypyc/lib-rt/librt_strings.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 7d64a9c3935a..e4772caa9065 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -662,31 +662,37 @@ static void convert_string_data(char *src_buf, char *dest_buf, Py_ssize_t len, bool in_place = (src_buf == dest_buf); if (old_kind == 1 && new_kind == 2) { - uint8_t *src = (uint8_t *)src_buf; - uint16_t *dest = (uint16_t *)dest_buf; if (in_place) { // Convert backwards to avoid overwriting for (Py_ssize_t i = len - 1; i >= 0; i--) { - dest[i] = src[i]; + uint8_t val = (uint8_t)src_buf[i]; + uint16_t expanded = val; + memcpy(dest_buf + i * 2, &expanded, 2); } } else { // Convert forwards for (Py_ssize_t i = 0; i < len; i++) { - dest[i] = src[i]; + uint8_t 
val = (uint8_t)src_buf[i]; + uint16_t expanded = val; + memcpy(dest_buf + i * 2, &expanded, 2); } } } else if (old_kind == 2 && new_kind == 4) { - uint16_t *src = (uint16_t *)src_buf; - uint32_t *dest = (uint32_t *)dest_buf; if (in_place) { // Convert backwards to avoid overwriting for (Py_ssize_t i = len - 1; i >= 0; i--) { - dest[i] = src[i]; + uint16_t val; + memcpy(&val, src_buf + i * 2, 2); + uint32_t expanded = val; + memcpy(dest_buf + i * 4, &expanded, 4); } } else { // Convert forwards for (Py_ssize_t i = 0; i < len; i++) { - dest[i] = src[i]; + uint16_t val; + memcpy(&val, src_buf + i * 2, 2); + uint32_t expanded = val; + memcpy(dest_buf + i * 4, &expanded, 4); } } } From f4798a2dcaae64f552ab5f3d3b579f0b424e9f1d Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 14 Jan 2026 16:32:05 +0000 Subject: [PATCH 13/19] Fix code point range checking --- mypyc/lib-rt/librt_strings.c | 5 +++-- mypyc/test-data/run-librt-strings.test | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index e4772caa9065..59d466fd58b0 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -775,7 +775,7 @@ static char string_append_slow_path(StringWriterObject *self, int32_t value) { return string_append_slow_path(self, value); } assert(self->kind == 4); - if ((uint32_t)value < (1 << 20)) { + if ((uint32_t)value <= 0x10FFFF) { if (!ensure_string_writer_size(self, 1)) return CPY_NONE_ERROR; // Copy 4-byte character to buffer @@ -784,7 +784,8 @@ static char string_append_slow_path(StringWriterObject *self, int32_t value) { self->len++; return CPY_NONE; } - // TODO: exception + // Code point is out of valid Unicode range + PyErr_Format(PyExc_ValueError, "code point %d is outside valid Unicode range (0-1114111)", value); return CPY_NONE_ERROR; } diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 
04cdcc21cdb6..251466ff6d44 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -350,6 +350,24 @@ def test_string_writer_append() -> None: w.append(0x10000) assert w.getvalue() == "a\xff\u0100\uffff" + chr(0x10000) + # Maximum valid Unicode code point (0x10FFFF = 1114111) + w2 = StringWriter() + w2.append(0x10FFFF) + assert w2.getvalue() == chr(0x10FFFF) + + # Invalid code points + w3 = StringWriter() + with assertRaises(ValueError, "code point 1114112 is outside valid Unicode range (0-1114111)"): + w3.append(0x110000) + + w4 = StringWriter() + with assertRaises(ValueError, "code point -1 is outside valid Unicode range (0-1114111)"): + w4.append(-1) + + w5 = StringWriter() + with assertRaises(ValueError, "code point 2097152 is outside valid Unicode range (0-1114111)"): + w5.append(0x200000) + def test_string_writer_write() -> None: # Kind 1: Write ASCII strings w = StringWriter() From 45021f83fa2b09ea72e1a72432201495437f9236 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 15 Jan 2026 11:39:43 +0000 Subject: [PATCH 14/19] Fix buffer growth --- mypyc/lib-rt/librt_strings.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 59d466fd58b0..fddb3acaf004 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -716,18 +716,25 @@ static char convert_string_buffer_kind(StringWriterObject *self, char old_kind, new_capacity *= 2; } while (new_capacity * new_kind < needed_size); - char *new_buf = PyMem_Malloc(new_capacity * new_kind); - if (unlikely(new_buf == NULL)) { - PyErr_NoMemory(); - return CPY_NONE_ERROR; - } - - // Convert data during copy - convert_string_data(self->buf, new_buf, self->len, old_kind, new_kind); - - // Free old buffer if it was heap-allocated - if (self->buf != self->data) { - PyMem_Free(self->buf); + char *new_buf; + if (self->buf == self->data) { + // Move 
from embedded buffer to heap-allocated buffer + new_buf = PyMem_Malloc(new_capacity * new_kind); + if (unlikely(new_buf == NULL)) { + PyErr_NoMemory(); + return CPY_NONE_ERROR; + } + // Convert data during copy + convert_string_data(self->buf, new_buf, self->len, old_kind, new_kind); + } else { + // Realloc existing heap buffer + new_buf = PyMem_Realloc(self->buf, new_capacity * new_kind); + if (unlikely(new_buf == NULL)) { + PyErr_NoMemory(); + return CPY_NONE_ERROR; + } + // Convert in-place (backwards to avoid overwriting) + convert_string_data(new_buf, new_buf, self->len, old_kind, new_kind); } self->buf = new_buf; From 24bd1bbefd02f6a7f804c7c8a4709395af7b4bf7 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 15 Jan 2026 11:41:25 +0000 Subject: [PATCH 15/19] Refactor --- mypyc/lib-rt/librt_strings.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index fddb3acaf004..114139e3ca84 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -717,26 +717,24 @@ static char convert_string_buffer_kind(StringWriterObject *self, char old_kind, } while (new_capacity * new_kind < needed_size); char *new_buf; - if (self->buf == self->data) { + bool from_embedded = (self->buf == self->data); + if (from_embedded) { // Move from embedded buffer to heap-allocated buffer new_buf = PyMem_Malloc(new_capacity * new_kind); - if (unlikely(new_buf == NULL)) { - PyErr_NoMemory(); - return CPY_NONE_ERROR; - } - // Convert data during copy - convert_string_data(self->buf, new_buf, self->len, old_kind, new_kind); } else { // Realloc existing heap buffer new_buf = PyMem_Realloc(self->buf, new_capacity * new_kind); - if (unlikely(new_buf == NULL)) { - PyErr_NoMemory(); - return CPY_NONE_ERROR; - } - // Convert in-place (backwards to avoid overwriting) - convert_string_data(new_buf, new_buf, self->len, old_kind, new_kind); } + if (unlikely(new_buf == NULL)) { 
+ PyErr_NoMemory(); + return CPY_NONE_ERROR; + } + + // Convert data - either during copy from embedded buffer or in-place + convert_string_data(from_embedded ? self->buf : new_buf, new_buf, + self->len, old_kind, new_kind); + self->buf = new_buf; self->kind = new_kind; self->capacity = new_capacity; From 19d0ab21d9879db7919da421906584b35898fda0 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 15 Jan 2026 12:02:36 +0000 Subject: [PATCH 16/19] Refactor buffer growing --- mypyc/lib-rt/librt_strings.c | 83 +++++++++++++++++------------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 114139e3ca84..44e3e0bb94ea 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -385,31 +385,50 @@ BytesWriter_len_internal(PyObject *self) { static PyTypeObject StringWriterType; +static void convert_string_data(char *src_buf, char *dest_buf, Py_ssize_t len, + char old_kind, char new_kind); + +// Helper to grow string buffer and optionally convert to new kind +// Returns true on success, false on failure (with PyErr set) +// Updates self->buf, self->capacity, and self->kind static bool -_grow_buffer_string(StringWriterObject *data, Py_ssize_t n) { - Py_ssize_t target = data->len + n; - Py_ssize_t size = data->capacity; - char kind = data->kind; - do { - size *= 2; - } while (target >= size); - // Calculate size in bytes - Py_ssize_t size_bytes = size * kind; - if (data->buf == data->data) { +grow_string_buffer_helper(StringWriterObject *self, Py_ssize_t target_capacity, char new_kind) { + char old_kind = self->kind; + Py_ssize_t new_capacity = self->capacity; + + while (target_capacity >= new_capacity) { + new_capacity *= 2; + } + + Py_ssize_t size_bytes = new_capacity * new_kind; + char *new_buf; + bool from_embedded = (self->buf == self->data); + + if (from_embedded) { // Move from embedded buffer to heap-allocated buffer - data->buf = PyMem_Malloc(size_bytes); - 
if (data->buf != NULL) { - // Copy existing data (len * kind bytes) - memcpy(data->buf, data->data, data->len * kind); + new_buf = PyMem_Malloc(size_bytes); + if (new_buf != NULL) { + // Copy existing data from embedded buffer + memcpy(new_buf, self->data, self->len * old_kind); } } else { - data->buf = PyMem_Realloc(data->buf, size_bytes); + // Realloc existing heap buffer + new_buf = PyMem_Realloc(self->buf, size_bytes); } - if (unlikely(data->buf == NULL)) { + + if (unlikely(new_buf == NULL)) { PyErr_NoMemory(); return false; } - data->capacity = size; + + // Convert data if kind changed + if (old_kind != new_kind) { + convert_string_data(new_buf, new_buf, self->len, old_kind, new_kind); + } + + self->buf = new_buf; + self->capacity = new_capacity; + self->kind = new_kind; return true; } @@ -418,7 +437,7 @@ ensure_string_writer_size(StringWriterObject *data, Py_ssize_t n) { if (likely(data->capacity - data->len >= n)) { return true; } else { - return _grow_buffer_string(data, n); + return grow_string_buffer_helper(data, data->len + n, data->kind); } } @@ -710,34 +729,10 @@ static char convert_string_buffer_kind(StringWriterObject *self, char old_kind, self->kind = new_kind; self->capacity = current_buf_size / new_kind; } else { - // Allocate new buffer - Py_ssize_t new_capacity = self->capacity; - do { - new_capacity *= 2; - } while (new_capacity * new_kind < needed_size); - - char *new_buf; - bool from_embedded = (self->buf == self->data); - if (from_embedded) { - // Move from embedded buffer to heap-allocated buffer - new_buf = PyMem_Malloc(new_capacity * new_kind); - } else { - // Realloc existing heap buffer - new_buf = PyMem_Realloc(self->buf, new_capacity * new_kind); - } - - if (unlikely(new_buf == NULL)) { - PyErr_NoMemory(); + // Need to allocate new buffer + if (!grow_string_buffer_helper(self, self->len, new_kind)) { return CPY_NONE_ERROR; } - - // Convert data - either during copy from embedded buffer or in-place - convert_string_data(from_embedded ? 
self->buf : new_buf, new_buf, - self->len, old_kind, new_kind); - - self->buf = new_buf; - self->kind = new_kind; - self->capacity = new_capacity; } return CPY_NONE; } From e89fb38c9a8b840588a7c5a1a326c1f702e5bc72 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 15 Jan 2026 12:07:47 +0000 Subject: [PATCH 17/19] Refactor --- mypyc/lib-rt/librt_strings.c | 55 +++++++++++------------------------- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 44e3e0bb94ea..9c94d0ec2367 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -385,8 +385,8 @@ BytesWriter_len_internal(PyObject *self) { static PyTypeObject StringWriterType; -static void convert_string_data(char *src_buf, char *dest_buf, Py_ssize_t len, - char old_kind, char new_kind); +static void convert_string_data_in_place(char *buf, Py_ssize_t len, + char old_kind, char new_kind); // Helper to grow string buffer and optionally convert to new kind // Returns true on success, false on failure (with PyErr set) @@ -423,7 +423,7 @@ grow_string_buffer_helper(StringWriterObject *self, Py_ssize_t target_capacity, // Convert data if kind changed if (old_kind != new_kind) { - convert_string_data(new_buf, new_buf, self->len, old_kind, new_kind); + convert_string_data_in_place(new_buf, self->len, old_kind, new_kind); } self->buf = new_buf; @@ -676,43 +676,22 @@ StringWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject return Py_None; } -static void convert_string_data(char *src_buf, char *dest_buf, Py_ssize_t len, - char old_kind, char new_kind) { - bool in_place = (src_buf == dest_buf); - +static void convert_string_data_in_place(char *buf, Py_ssize_t len, + char old_kind, char new_kind) { if (old_kind == 1 && new_kind == 2) { - if (in_place) { - // Convert backwards to avoid overwriting - for (Py_ssize_t i = len - 1; i >= 0; i--) { - uint8_t val = (uint8_t)src_buf[i]; - uint16_t expanded = 
val; - memcpy(dest_buf + i * 2, &expanded, 2); - } - } else { - // Convert forwards - for (Py_ssize_t i = 0; i < len; i++) { - uint8_t val = (uint8_t)src_buf[i]; - uint16_t expanded = val; - memcpy(dest_buf + i * 2, &expanded, 2); - } + // Convert backwards to avoid overwriting + for (Py_ssize_t i = len - 1; i >= 0; i--) { + uint8_t val = (uint8_t)buf[i]; + uint16_t expanded = val; + memcpy(buf + i * 2, &expanded, 2); } } else if (old_kind == 2 && new_kind == 4) { - if (in_place) { - // Convert backwards to avoid overwriting - for (Py_ssize_t i = len - 1; i >= 0; i--) { - uint16_t val; - memcpy(&val, src_buf + i * 2, 2); - uint32_t expanded = val; - memcpy(dest_buf + i * 4, &expanded, 4); - } - } else { - // Convert forwards - for (Py_ssize_t i = 0; i < len; i++) { - uint16_t val; - memcpy(&val, src_buf + i * 2, 2); - uint32_t expanded = val; - memcpy(dest_buf + i * 4, &expanded, 4); - } + // Convert backwards to avoid overwriting + for (Py_ssize_t i = len - 1; i >= 0; i--) { + uint16_t val; + memcpy(&val, buf + i * 2, 2); + uint32_t expanded = val; + memcpy(buf + i * 4, &expanded, 4); } } } @@ -725,7 +704,7 @@ static char convert_string_buffer_kind(StringWriterObject *self, char old_kind, if (current_buf_size >= needed_size) { // Convert in place - convert_string_data(self->buf, self->buf, self->len, old_kind, new_kind); + convert_string_data_in_place(self->buf, self->len, old_kind, new_kind); self->kind = new_kind; self->capacity = current_buf_size / new_kind; } else { From ed86e91ae05fe9322138c0d31503118584236845 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 15 Jan 2026 12:16:49 +0000 Subject: [PATCH 18/19] Refactor/optimize --- mypyc/lib-rt/librt_strings.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 9c94d0ec2367..58e8261daf4a 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -623,14 +623,16 @@ 
StringWriter_write_internal(StringWriterObject *self, PyObject *value) { int src_kind = PyUnicode_KIND(value); void *src_data = PyUnicode_DATA(value); + int self_kind = self->kind; // Switch kind if source requires wider characters - if (src_kind > self->kind) { + if (src_kind > self_kind) { // Use max value for the source kind to trigger proper kind switch - int32_t max_for_kind = (src_kind == 2) ? 0x100 : 0x10000; - if (string_writer_switch_kind(self, max_for_kind) == CPY_NONE_ERROR) { + int32_t codepoint = (src_kind == 2) ? 0x100 : 0x10000; + if (string_writer_switch_kind(self, codepoint) == CPY_NONE_ERROR) { return CPY_NONE_ERROR; } + self_kind = self->kind; } // Ensure we have enough space @@ -639,14 +641,14 @@ StringWriter_write_internal(StringWriterObject *self, PyObject *value) { } // Copy data - ASCII/Latin1 (kind 1) are handled uniformly - if (self->kind == src_kind) { + if (self_kind == src_kind) { // Same kind, direct copy - memcpy(self->buf + self->len * self->kind, src_data, str_len * src_kind); + memcpy(self->buf + self->len * self_kind, src_data, str_len * src_kind); } else { // Different kinds, convert character by character for (Py_ssize_t i = 0; i < str_len; i++) { Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, i); - PyUnicode_WRITE(self->kind, self->buf, self->len + i, ch); + PyUnicode_WRITE(self_kind, self->buf, self->len + i, ch); } } From 2e3a2c36811b69c102f8bfe834ed7d4e00b5c5a5 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 15 Jan 2026 13:54:03 +0000 Subject: [PATCH 19/19] Small tweaks --- mypyc/lib-rt/librt_strings.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/mypyc/lib-rt/librt_strings.c b/mypyc/lib-rt/librt_strings.c index 58e8261daf4a..788d27070114 100644 --- a/mypyc/lib-rt/librt_strings.c +++ b/mypyc/lib-rt/librt_strings.c @@ -432,12 +432,18 @@ grow_string_buffer_helper(StringWriterObject *self, Py_ssize_t target_capacity, return true; } +static bool 
grow_string_buffer(StringWriterObject *data, Py_ssize_t n) { + return grow_string_buffer_helper(data, data->len + n, data->kind); +} + static inline bool ensure_string_writer_size(StringWriterObject *data, Py_ssize_t n) { if (likely(data->capacity - data->len >= n)) { return true; } else { - return grow_string_buffer_helper(data, data->len + n, data->kind); + // Don't inline the grow function since this is slow path and we + // want to keep this as short as possible for better inlining + return grow_string_buffer(data, n); } } @@ -610,12 +616,10 @@ check_string_writer(PyObject *data) { return true; } -// Forward declaration static char string_writer_switch_kind(StringWriterObject *self, int32_t value); static char StringWriter_write_internal(StringWriterObject *self, PyObject *value) { - // Get string info Py_ssize_t str_len = PyUnicode_GET_LENGTH(value); if (str_len == 0) { return CPY_NONE; @@ -627,7 +631,7 @@ StringWriter_write_internal(StringWriterObject *self, PyObject *value) { // Switch kind if source requires wider characters if (src_kind > self_kind) { - // Use max value for the source kind to trigger proper kind switch + // Use value in the source kind range to trigger proper kind switch int32_t codepoint = (src_kind == 2) ? 
0x100 : 0x10000; if (string_writer_switch_kind(self, codepoint) == CPY_NONE_ERROR) { return CPY_NONE_ERROR; @@ -678,6 +682,7 @@ StringWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject return Py_None; } +// Convert string data to next larger kind (1->2 or 2->4) static void convert_string_data_in_place(char *buf, Py_ssize_t len, char old_kind, char new_kind) { if (old_kind == 1 && new_kind == 2) { @@ -695,6 +700,8 @@ static void convert_string_data_in_place(char *buf, Py_ssize_t len, uint32_t expanded = val; memcpy(buf + i * 4, &expanded, 4); } + } else { + assert(false); } } @@ -730,13 +737,17 @@ static char string_writer_switch_kind(StringWriterObject *self, int32_t value) { return CPY_NONE; } else { // Must be kind 2 -> 4 + assert(self->kind == 2); + assert((uint32_t)value > 0xffff); return convert_string_buffer_kind(self, 2, 4); } } +// Handle all append cases except for append that stays within kind 1 static char string_append_slow_path(StringWriterObject *self, int32_t value) { if (self->kind == 2) { if ((uint32_t)value <= 0xffff) { + // Kind stays the same if (!ensure_string_writer_size(self, 1)) return CPY_NONE_ERROR; // Copy 2-byte character to buffer