diff --git a/Doc/library/multiprocessing.shared_memory.rst b/Doc/library/multiprocessing.shared_memory.rst index e8f04a6ac7b95d..d133cd2666d6a3 100644 --- a/Doc/library/multiprocessing.shared_memory.rst +++ b/Doc/library/multiprocessing.shared_memory.rst @@ -311,28 +311,34 @@ finishes execution. existing :class:`!ShareableList`, specify its shared memory block's unique name while leaving *sequence* set to ``None``. + .. versionchanged:: 3.15 + Trailing null bytes (``\x00``) in :class:`bytes` and :class:`str` values + are now preserved correctly, and multi-byte UTF-8 strings are no longer + corrupted. See :gh:`106939` and :gh:`145261`. + .. note:: - A known issue exists for :class:`bytes` and :class:`str` values. - If they end with ``\x00`` nul bytes or characters, those may be - *silently stripped* when fetching them by index from the - :class:`!ShareableList`. This ``.rstrip(b'\x00')`` behavior is - considered a bug and may go away in the future. See :gh:`106939`. + In Python 3.14 and earlier, :class:`bytes` and :class:`str` values + ending with ``\x00`` nul bytes had those bytes *silently stripped* + when fetched by index from the :class:`!ShareableList`. Multi-byte + UTF-8 strings could also be corrupted due to incorrect slot sizing. - For applications where rstripping of trailing nulls is a problem, - work around it by always unconditionally appending an extra non-0 - byte to the end of such values when storing and unconditionally - removing it when fetching: + For applications that need to work with Python 3.14 and earlier where + rstripping of trailing nulls is a problem, work around it by always + unconditionally appending an extra non-0 byte to the end of such values + when storing and unconditionally removing it when fetching: .. doctest:: >>> from multiprocessing import shared_memory - >>> nul_bug_demo = shared_memory.ShareableList(['?\x00', b'\x03\x02\x01\x00\x00\x00']) - >>> nul_bug_demo[0] - '?' - >>> nul_bug_demo[1] - b'\x03\x02\x01' - >>> nul_bug_demo.shm.unlink() + >>> # Python 3.15+: trailing nulls are preserved + >>> sl = shared_memory.ShareableList(['?\x00', b'\x03\x02\x01\x00\x00\x00']) + >>> sl[0] + '?\x00' + >>> sl[1] + b'\x03\x02\x01\x00\x00\x00' + >>> sl.shm.unlink() + >>> # Workaround for Python 3.14 and earlier: >>> padded = shared_memory.ShareableList(['?\x00\x07', b'\x03\x02\x01\x00\x00\x00\x07']) >>> padded[0][:-1] '?\x00' diff --git a/Lib/multiprocessing/shared_memory.py b/Lib/multiprocessing/shared_memory.py index 99a8ce3320ad4e..19fe503b1a77da 100644 --- a/Lib/multiprocessing/shared_memory.py +++ b/Lib/multiprocessing/shared_memory.py @@ -286,9 +286,9 @@ class ShareableList: _alignment = 8 _back_transforms_mapping = { 0: lambda value: value, # int, float, bool - 1: lambda value: value.rstrip(b'\x00').decode(_encoding), # str - 2: lambda value: value.rstrip(b'\x00'), # bytes - 3: lambda _value: None, # None + 1: lambda value: value.decode(_encoding), # str + 2: lambda value: value, # bytes + 3: lambda _value: None, # None } @staticmethod @@ -312,7 +312,13 @@ def __init__(self, sequence=None, *, name=None): self._types_mapping[type(item)] if not isinstance(item, (str, bytes)) else self._types_mapping[type(item)] % ( - self._alignment * (len(item) // self._alignment + 1), + self._alignment * ( + len( + item.encode(_encoding) + if isinstance(item, str) + else item + ) // self._alignment + 1 + ), ) for item in sequence ] @@ -326,6 +332,15 @@ def __init__(self, sequence=None, *, name=None): for fmt in _formats: offset += self._alignment if fmt[-1] != "s" else int(fmt[:-1]) self._allocated_offsets.append(offset) + _stored_formats = [] + for item, fmt in zip(sequence, _formats): + if isinstance(item, (str, bytes)): + encoded = (item.encode(_encoding) + if isinstance(item, str) else item) + _stored_formats.append("%ds" % len(encoded)) + else: + _stored_formats.append(fmt) + _recreation_codes = [ self._extract_recreation_code(item) for item in sequence ] @@ -359,7 +374,7 @@ def __init__(self, sequence=None, *, name=None): self._format_packing_metainfo, self.shm.buf, self._offset_packing_formats, - *(v.encode(_enc) for v in _formats) + *(v.encode(_enc) for v in _stored_formats) ) struct.pack_into( self._format_back_transform_codes, @@ -459,6 +474,7 @@ def __setitem__(self, position, value): if not isinstance(value, (str, bytes)): new_format = self._types_mapping[type(value)] + pack_format = new_format encoded_value = value else: allocated_length = self._allocated_offsets[position + 1] - item_offset @@ -467,19 +483,17 @@ def __setitem__(self, position, value): if isinstance(value, str) else value) if len(encoded_value) > allocated_length: raise ValueError("bytes/str item exceeds available storage") - if current_format[-1] == "s": - new_format = current_format - else: - new_format = self._types_mapping[str] % ( - allocated_length, - ) + # Allocated-length format for struct.pack_into (fills the slot). + pack_format = "%ds" % allocated_length + # Actual-length format stored in metadata (for exact retrieval). + new_format = "%ds" % len(encoded_value) self._set_packing_format_and_transform( position, new_format, value ) - struct.pack_into(new_format, self.shm.buf, offset, encoded_value) + struct.pack_into(pack_format, self.shm.buf, offset, encoded_value) def __reduce__(self): return partial(self.__class__, name=self.shm.name), () diff --git a/Lib/test/_test_multiprocessing.py b/Lib/test/_test_multiprocessing.py index cc07062eee6f98..85020988580807 100644 --- a/Lib/test/_test_multiprocessing.py +++ b/Lib/test/_test_multiprocessing.py @@ -4757,7 +4757,7 @@ def test_shared_memory_ShareableList_basics(self): self.assertEqual(current_format, sl._get_packing_format(0)) # Verify attributes are readable. - self.assertEqual(sl.format, '8s8sdqxxxxxx?xxxxxxxx?q') + self.assertEqual(sl.format, '5s5sdqxxxxxx?xxxxxxxx?q') # Exercise len(). self.assertEqual(len(sl), 7) @@ -4785,7 +4785,7 @@ def test_shared_memory_ShareableList_basics(self): self.assertEqual(sl[3], 42) sl[4] = 'some' # Change type at a given position. self.assertEqual(sl[4], 'some') - self.assertEqual(sl.format, '8s8sdq8sxxxxxxx?q') + self.assertEqual(sl.format, '5s5sdq4sxxxxxxx?q') with self.assertRaisesRegex(ValueError, "exceeds available storage"): sl[4] = 'far too many' @@ -4887,6 +4887,58 @@ def test_shared_memory_ShareableList_pickling_dead_object(self): with self.assertRaises(FileNotFoundError): pickle.loads(serialized_sl) + def test_shared_memory_ShareableList_trailing_nulls(self): + # gh-106939: ShareableList should preserve trailing null bytes + # in bytes and str values. + sl = shared_memory.ShareableList([ + b'\x03\x02\x01\x00\x00\x00', + '?\x00', + b'\x00\x00\x00', + b'', + b'no nulls', + ]) + self.addCleanup(sl.shm.unlink) + self.addCleanup(sl.shm.close) + + self.assertEqual(sl[0], b'\x03\x02\x01\x00\x00\x00') + self.assertEqual(sl[1], '?\x00') + self.assertEqual(sl[2], b'\x00\x00\x00') + self.assertEqual(sl[3], b'') + self.assertEqual(sl[4], b'no nulls') + + sl2 = shared_memory.ShareableList(name=sl.shm.name) + self.addCleanup(sl2.shm.close) + self.assertEqual(sl2[0], b'\x03\x02\x01\x00\x00\x00') + self.assertEqual(sl2[1], '?\x00') + self.assertEqual(sl2[2], b'\x00\x00\x00') + self.assertEqual(sl2[3], b'') + self.assertEqual(sl2[4], b'no nulls') + + def test_shared_memory_ShareableList_multibyte_utf8(self): + # gh-145261: ShareableList should correctly handle multi-byte + # UTF-8 strings without corruption or spillage. + sl = shared_memory.ShareableList([ + 'ascii', # 1-byte per char (5 bytes) + 'café', # 2-byte char: é (5 bytes) + '中文测试', # 3-byte per char (12 bytes) + '𐀀𐀁', # 4-byte per char (8 bytes) + ]) + self.addCleanup(sl.shm.unlink) + self.addCleanup(sl.shm.close) + + self.assertEqual(sl[0], 'ascii') + self.assertEqual(sl[1], 'café') + self.assertEqual(sl[2], '中文测试') + self.assertEqual(sl[3], '𐀀𐀁') + + # Verify cross-process access via name-based attachment. + sl2 = shared_memory.ShareableList(name=sl.shm.name) + self.addCleanup(sl2.shm.close) + self.assertEqual(sl2[0], 'ascii') + self.assertEqual(sl2[1], 'café') + self.assertEqual(sl2[2], '中文测试') + self.assertEqual(sl2[3], '𐀀𐀁') + def test_shared_memory_cleaned_after_process_termination(self): cmd = '''if 1: import os, time, sys diff --git a/Misc/NEWS.d/next/Library/2026-03-04-06-43-24.gh-issue-106939.FZlMljA1.rst b/Misc/NEWS.d/next/Library/2026-03-04-06-43-24.gh-issue-106939.FZlMljA1.rst new file mode 100644 index 00000000000000..6e9fc36acc5f38 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-03-04-06-43-24.gh-issue-106939.FZlMljA1.rst @@ -0,0 +1,4 @@ +Fix :class:`~multiprocessing.shared_memory.ShareableList` corrupting +multi-byte UTF-8 strings due to using character count instead of byte count +for slot allocation, and stripping legitimate trailing null bytes from +:class:`bytes` and :class:`str` values.