gh-106939, gh-145261: Fix ShareableList data corruption

jakelodwick · jakelodwick · commit 844114d9eaa0 · 2026-03-03T22:53:04.000-08:00
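Store actual byte lengths in the per-item format metadata instead of
allocated slot sizes, so retrieval extracts exactly the stored data
rather than stripping trailing null padding. Size str slots by encoded
byte count instead of character count to prevent multi-byte UTF-8
overflow. As a consequence, ShareableList.format now reports actual
lengths (e.g. '5s') rather than padded slot sizes (e.g. '8s').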
diff --git a/Doc/library/multiprocessing.shared_memory.rst b/Doc/library/multiprocessing.shared_memory.rst
@@ -311,28 +311,34 @@ finishes execution.
    existing :class:`!ShareableList`, specify its shared memory block's unique
    name while leaving *sequence* set to ``None``.
 
+   .. versionchanged:: 3.15
+      Trailing nul bytes and characters (``\x00``) in :class:`bytes` and
+      :class:`str` values are now preserved, and multi-byte UTF-8 strings are
+      no longer corrupted. See :gh:`106939` and :gh:`145261`.
+
    .. note::
 
-      A known issue exists for :class:`bytes` and :class:`str` values.
-      If they end with ``\x00`` nul bytes or characters, those may be
-      *silently stripped* when fetching them by index from the
-      :class:`!ShareableList`. This ``.rstrip(b'\x00')`` behavior is
-      considered a bug and may go away in the future. See :gh:`106939`.
+      In Python 3.14 and earlier, :class:`bytes` and :class:`str` values
+      ending with ``\x00`` nul bytes had those bytes *silently stripped*
+      when fetched by index from the :class:`!ShareableList`.  Multi-byte
+      UTF-8 strings could also be corrupted due to incorrect slot sizing.
 
-   For applications where rstripping of trailing nulls is a problem,
-   work around it by always unconditionally appending an extra non-0
-   byte to the end of such values when storing and unconditionally
-   removing it when fetching:
+   For applications that must also run on Python 3.14 and earlier, where
+   stripping of trailing nulls is a problem, work around it by
+   unconditionally appending an extra non-0 byte to the end of such values
+   when storing and unconditionally removing it when fetching:
 
    .. doctest::
 
        >>> from multiprocessing import shared_memory
-       >>> nul_bug_demo = shared_memory.ShareableList(['?\x00', b'\x03\x02\x01\x00\x00\x00'])
-       >>> nul_bug_demo[0]
-       '?'
-       >>> nul_bug_demo[1]
-       b'\x03\x02\x01'
-       >>> nul_bug_demo.shm.unlink()
+       >>> # Python 3.15+: trailing nulls are preserved
+       >>> sl = shared_memory.ShareableList(['?\x00', b'\x03\x02\x01\x00\x00\x00'])
+       >>> sl[0]
+       '?\x00'
+       >>> sl[1]
+       b'\x03\x02\x01\x00\x00\x00'
+       >>> sl.shm.unlink()
+       >>> # Workaround for Python 3.14 and earlier:
        >>> padded = shared_memory.ShareableList(['?\x00\x07', b'\x03\x02\x01\x00\x00\x00\x07'])
        >>> padded[0][:-1]
        '?\x00'
diff --git a/Lib/multiprocessing/shared_memory.py b/Lib/multiprocessing/shared_memory.py
@@ -286,9 +286,9 @@ class ShareableList:
     _alignment = 8
     _back_transforms_mapping = {
         0: lambda value: value,                   # int, float, bool
-        1: lambda value: value.rstrip(b'\x00').decode(_encoding),  # str
-        2: lambda value: value.rstrip(b'\x00'),   # bytes
-        3: lambda _value: None,                   # None
+        1: lambda value: value.decode(_encoding),  # str
+        2: lambda value: value,                    # bytes
+        3: lambda _value: None,                    # None
     }
 
     @staticmethod
@@ -312,7 +312,13 @@ def __init__(self, sequence=None, *, name=None):
                 self._types_mapping[type(item)]
                     if not isinstance(item, (str, bytes))
                     else self._types_mapping[type(item)] % (
-                        self._alignment * (len(item) // self._alignment + 1),
+                        self._alignment * (
+                            len(
+                                item.encode(_encoding)
+                                if isinstance(item, str)
+                                else item
+                            ) // self._alignment + 1
+                        ),
                     )
                 for item in sequence
             ]
@@ -326,6 +332,15 @@ def __init__(self, sequence=None, *, name=None):
             for fmt in _formats:
                 offset += self._alignment if fmt[-1] != "s" else int(fmt[:-1])
                 self._allocated_offsets.append(offset)
+            _stored_formats = []
+            for item, fmt in zip(sequence, _formats):
+                if isinstance(item, (str, bytes)):
+                    encoded = (item.encode(_encoding)
+                               if isinstance(item, str) else item)
+                    _stored_formats.append("%ds" % len(encoded))
+                else:
+                    _stored_formats.append(fmt)
+
             _recreation_codes = [
                 self._extract_recreation_code(item) for item in sequence
             ]
@@ -359,7 +374,7 @@ def __init__(self, sequence=None, *, name=None):
                 self._format_packing_metainfo,
                 self.shm.buf,
                 self._offset_packing_formats,
-                *(v.encode(_enc) for v in _formats)
+                *(v.encode(_enc) for v in _stored_formats)
             )
             struct.pack_into(
                 self._format_back_transform_codes,
@@ -459,6 +474,7 @@ def __setitem__(self, position, value):
 
         if not isinstance(value, (str, bytes)):
             new_format = self._types_mapping[type(value)]
+            pack_format = new_format
             encoded_value = value
         else:
             allocated_length = self._allocated_offsets[position + 1] - item_offset
@@ -467,19 +483,17 @@ def __setitem__(self, position, value):
                              if isinstance(value, str) else value)
             if len(encoded_value) > allocated_length:
                 raise ValueError("bytes/str item exceeds available storage")
-            if current_format[-1] == "s":
-                new_format = current_format
-            else:
-                new_format = self._types_mapping[str] % (
-                    allocated_length,
-                )
+            # Allocated-length format for struct.pack_into (fills the slot).
+            pack_format = "%ds" % allocated_length
+            # Actual-length format stored in metadata (for exact retrieval).
+            new_format = "%ds" % len(encoded_value)
 
         self._set_packing_format_and_transform(
             position,
             new_format,
             value
         )
-        struct.pack_into(new_format, self.shm.buf, offset, encoded_value)
+        struct.pack_into(pack_format, self.shm.buf, offset, encoded_value)
 
     def __reduce__(self):
         return partial(self.__class__, name=self.shm.name), ()
diff --git a/Lib/test/_test_multiprocessing.py b/Lib/test/_test_multiprocessing.py
@@ -4757,7 +4757,7 @@ def test_shared_memory_ShareableList_basics(self):
         self.assertEqual(current_format, sl._get_packing_format(0))
 
         # Verify attributes are readable.
-        self.assertEqual(sl.format, '8s8sdqxxxxxx?xxxxxxxx?q')
+        self.assertEqual(sl.format, '5s5sdqxxxxxx?xxxxxxxx?q')
 
         # Exercise len().
         self.assertEqual(len(sl), 7)
@@ -4785,7 +4785,7 @@ def test_shared_memory_ShareableList_basics(self):
         self.assertEqual(sl[3], 42)
         sl[4] = 'some'  # Change type at a given position.
         self.assertEqual(sl[4], 'some')
-        self.assertEqual(sl.format, '8s8sdq8sxxxxxxx?q')
+        self.assertEqual(sl.format, '5s5sdq4sxxxxxxx?q')
         with self.assertRaisesRegex(ValueError,
                                     "exceeds available storage"):
             sl[4] = 'far too many'
@@ -4887,6 +4887,58 @@ def test_shared_memory_ShareableList_pickling_dead_object(self):
                 with self.assertRaises(FileNotFoundError):
                     pickle.loads(serialized_sl)
 
+    def test_shared_memory_ShareableList_trailing_nulls(self):
+        # gh-106939: ShareableList should preserve trailing null bytes
+        # in bytes and str values.
+        sl = shared_memory.ShareableList([
+            b'\x03\x02\x01\x00\x00\x00',
+            '?\x00',
+            b'\x00\x00\x00',
+            b'',
+            b'no nulls',
+        ])
+        self.addCleanup(sl.shm.unlink)
+        self.addCleanup(sl.shm.close)
+
+        self.assertEqual(sl[0], b'\x03\x02\x01\x00\x00\x00')
+        self.assertEqual(sl[1], '?\x00')
+        self.assertEqual(sl[2], b'\x00\x00\x00')
+        self.assertEqual(sl[3], b'')
+        self.assertEqual(sl[4], b'no nulls')
+
+        sl2 = shared_memory.ShareableList(name=sl.shm.name)
+        self.addCleanup(sl2.shm.close)
+        self.assertEqual(sl2[0], b'\x03\x02\x01\x00\x00\x00')
+        self.assertEqual(sl2[1], '?\x00')
+        self.assertEqual(sl2[2], b'\x00\x00\x00')
+        self.assertEqual(sl2[3], b'')
+        self.assertEqual(sl2[4], b'no nulls')
+
+    def test_shared_memory_ShareableList_multibyte_utf8(self):
+        # gh-145261: ShareableList should correctly handle multi-byte
+        # UTF-8 strings without corruption or spillage.
+        sl = shared_memory.ShareableList([
+            'ascii',        # 1-byte per char (5 bytes)
+            'café',         # 2-byte char: é (5 bytes)
+            '中文测试',     # 3-byte per char (12 bytes)
+            '𐀀𐀁',         # 4-byte per char (8 bytes)
+        ])
+        self.addCleanup(sl.shm.unlink)
+        self.addCleanup(sl.shm.close)
+
+        self.assertEqual(sl[0], 'ascii')
+        self.assertEqual(sl[1], 'café')
+        self.assertEqual(sl[2], '中文测试')
+        self.assertEqual(sl[3], '𐀀𐀁')
+
+        # Verify access through a second instance attached by name.
+        sl2 = shared_memory.ShareableList(name=sl.shm.name)
+        self.addCleanup(sl2.shm.close)
+        self.assertEqual(sl2[0], 'ascii')
+        self.assertEqual(sl2[1], 'café')
+        self.assertEqual(sl2[2], '中文测试')
+        self.assertEqual(sl2[3], '𐀀𐀁')
+
     def test_shared_memory_cleaned_after_process_termination(self):
         cmd = '''if 1:
             import os, time, sys
diff --git a/Misc/NEWS.d/next/Library/2026-03-04-06-43-24.gh-issue-106939.FZlMljA1.rst b/Misc/NEWS.d/next/Library/2026-03-04-06-43-24.gh-issue-106939.FZlMljA1.rst
@@ -0,0 +1,4 @@
+Fix :class:`~multiprocessing.shared_memory.ShareableList` corrupting
+multi-byte UTF-8 strings due to using character count instead of byte count
+for slot allocation, and stripping legitimate trailing null bytes from
+:class:`bytes` and :class:`str` values.