Skip to content

Commit 844114d

Browse files
committed
gh-106939, gh-145261: Fix ShareableList data corruption
Store actual byte lengths in format metadata instead of allocated slot sizes, so retrieval extracts exact data without relying on null-termination. Use byte count instead of character count for str slot allocation to prevent multi-byte UTF-8 overflow.
1 parent dc12d19 commit 844114d

File tree

4 files changed

+105
-29
lines changed

4 files changed

+105
-29
lines changed

Doc/library/multiprocessing.shared_memory.rst

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -311,28 +311,34 @@ finishes execution.
311311
existing :class:`!ShareableList`, specify its shared memory block's unique
312312
name while leaving *sequence* set to ``None``.
313313

314+
.. versionchanged:: 3.15
315+
Trailing null bytes (``\x00``) in :class:`bytes` and :class:`str` values
316+
are now preserved correctly, and multi-byte UTF-8 strings are no longer
317+
corrupted. See :gh:`106939` and :gh:`145261`.
318+
314319
.. note::
315320

316-
A known issue exists for :class:`bytes` and :class:`str` values.
317-
If they end with ``\x00`` nul bytes or characters, those may be
318-
*silently stripped* when fetching them by index from the
319-
:class:`!ShareableList`. This ``.rstrip(b'\x00')`` behavior is
320-
considered a bug and may go away in the future. See :gh:`106939`.
321+
In Python 3.14 and earlier, :class:`bytes` and :class:`str` values
322+
ending with ``\x00`` nul bytes had those bytes *silently stripped*
323+
when fetched by index from the :class:`!ShareableList`. Multi-byte
324+
UTF-8 strings could also be corrupted due to incorrect slot sizing.
321325

322-
For applications where rstripping of trailing nulls is a problem,
323-
work around it by always unconditionally appending an extra non-0
324-
byte to the end of such values when storing and unconditionally
325-
removing it when fetching:
326+
For applications that must also run on Python 3.14 and earlier, where
327+
rstripping of trailing nulls is a problem, work around it by
328+
unconditionally appending an extra non-0 byte to the end of such values
329+
when storing and unconditionally removing it when fetching:
326330

327331
.. doctest::
328332

329333
>>> from multiprocessing import shared_memory
330-
>>> nul_bug_demo = shared_memory.ShareableList(['?\x00', b'\x03\x02\x01\x00\x00\x00'])
331-
>>> nul_bug_demo[0]
332-
'?'
333-
>>> nul_bug_demo[1]
334-
b'\x03\x02\x01'
335-
>>> nul_bug_demo.shm.unlink()
334+
>>> # Python 3.15+: trailing nulls are preserved
335+
>>> sl = shared_memory.ShareableList(['?\x00', b'\x03\x02\x01\x00\x00\x00'])
336+
>>> sl[0]
337+
'?\x00'
338+
>>> sl[1]
339+
b'\x03\x02\x01\x00\x00\x00'
340+
>>> sl.shm.unlink()
341+
>>> # Workaround for Python 3.14 and earlier:
336342
>>> padded = shared_memory.ShareableList(['?\x00\x07', b'\x03\x02\x01\x00\x00\x00\x07'])
337343
>>> padded[0][:-1]
338344
'?\x00'

Lib/multiprocessing/shared_memory.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -286,9 +286,9 @@ class ShareableList:
286286
_alignment = 8
287287
_back_transforms_mapping = {
288288
0: lambda value: value, # int, float, bool
289-
1: lambda value: value.rstrip(b'\x00').decode(_encoding), # str
290-
2: lambda value: value.rstrip(b'\x00'), # bytes
291-
3: lambda _value: None, # None
289+
1: lambda value: value.decode(_encoding), # str
290+
2: lambda value: value, # bytes
291+
3: lambda _value: None, # None
292292
}
293293

294294
@staticmethod
@@ -312,7 +312,13 @@ def __init__(self, sequence=None, *, name=None):
312312
self._types_mapping[type(item)]
313313
if not isinstance(item, (str, bytes))
314314
else self._types_mapping[type(item)] % (
315-
self._alignment * (len(item) // self._alignment + 1),
315+
self._alignment * (
316+
len(
317+
item.encode(_encoding)
318+
if isinstance(item, str)
319+
else item
320+
) // self._alignment + 1
321+
),
316322
)
317323
for item in sequence
318324
]
@@ -326,6 +332,15 @@ def __init__(self, sequence=None, *, name=None):
326332
for fmt in _formats:
327333
offset += self._alignment if fmt[-1] != "s" else int(fmt[:-1])
328334
self._allocated_offsets.append(offset)
335+
_stored_formats = []
336+
for item, fmt in zip(sequence, _formats):
337+
if isinstance(item, (str, bytes)):
338+
encoded = (item.encode(_encoding)
339+
if isinstance(item, str) else item)
340+
_stored_formats.append("%ds" % len(encoded))
341+
else:
342+
_stored_formats.append(fmt)
343+
329344
_recreation_codes = [
330345
self._extract_recreation_code(item) for item in sequence
331346
]
@@ -359,7 +374,7 @@ def __init__(self, sequence=None, *, name=None):
359374
self._format_packing_metainfo,
360375
self.shm.buf,
361376
self._offset_packing_formats,
362-
*(v.encode(_enc) for v in _formats)
377+
*(v.encode(_enc) for v in _stored_formats)
363378
)
364379
struct.pack_into(
365380
self._format_back_transform_codes,
@@ -459,6 +474,7 @@ def __setitem__(self, position, value):
459474

460475
if not isinstance(value, (str, bytes)):
461476
new_format = self._types_mapping[type(value)]
477+
pack_format = new_format
462478
encoded_value = value
463479
else:
464480
allocated_length = self._allocated_offsets[position + 1] - item_offset
@@ -467,19 +483,17 @@ def __setitem__(self, position, value):
467483
if isinstance(value, str) else value)
468484
if len(encoded_value) > allocated_length:
469485
raise ValueError("bytes/str item exceeds available storage")
470-
if current_format[-1] == "s":
471-
new_format = current_format
472-
else:
473-
new_format = self._types_mapping[str] % (
474-
allocated_length,
475-
)
486+
# Allocated-length format for struct.pack_into (fills the slot).
487+
pack_format = "%ds" % allocated_length
488+
# Actual-length format stored in metadata (for exact retrieval).
489+
new_format = "%ds" % len(encoded_value)
476490

477491
self._set_packing_format_and_transform(
478492
position,
479493
new_format,
480494
value
481495
)
482-
struct.pack_into(new_format, self.shm.buf, offset, encoded_value)
496+
struct.pack_into(pack_format, self.shm.buf, offset, encoded_value)
483497

484498
def __reduce__(self):
485499
return partial(self.__class__, name=self.shm.name), ()

Lib/test/_test_multiprocessing.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4757,7 +4757,7 @@ def test_shared_memory_ShareableList_basics(self):
47574757
self.assertEqual(current_format, sl._get_packing_format(0))
47584758

47594759
# Verify attributes are readable.
4760-
self.assertEqual(sl.format, '8s8sdqxxxxxx?xxxxxxxx?q')
4760+
self.assertEqual(sl.format, '5s5sdqxxxxxx?xxxxxxxx?q')
47614761

47624762
# Exercise len().
47634763
self.assertEqual(len(sl), 7)
@@ -4785,7 +4785,7 @@ def test_shared_memory_ShareableList_basics(self):
47854785
self.assertEqual(sl[3], 42)
47864786
sl[4] = 'some' # Change type at a given position.
47874787
self.assertEqual(sl[4], 'some')
4788-
self.assertEqual(sl.format, '8s8sdq8sxxxxxxx?q')
4788+
self.assertEqual(sl.format, '5s5sdq4sxxxxxxx?q')
47894789
with self.assertRaisesRegex(ValueError,
47904790
"exceeds available storage"):
47914791
sl[4] = 'far too many'
@@ -4887,6 +4887,58 @@ def test_shared_memory_ShareableList_pickling_dead_object(self):
48874887
with self.assertRaises(FileNotFoundError):
48884888
pickle.loads(serialized_sl)
48894889

4890+
def test_shared_memory_ShareableList_trailing_nulls(self):
4891+
# gh-106939: ShareableList should preserve trailing null bytes
4892+
# in bytes and str values.
4893+
sl = shared_memory.ShareableList([
4894+
b'\x03\x02\x01\x00\x00\x00',
4895+
'?\x00',
4896+
b'\x00\x00\x00',
4897+
b'',
4898+
b'no nulls',
4899+
])
4900+
self.addCleanup(sl.shm.unlink)
4901+
self.addCleanup(sl.shm.close)
4902+
4903+
self.assertEqual(sl[0], b'\x03\x02\x01\x00\x00\x00')
4904+
self.assertEqual(sl[1], '?\x00')
4905+
self.assertEqual(sl[2], b'\x00\x00\x00')
4906+
self.assertEqual(sl[3], b'')
4907+
self.assertEqual(sl[4], b'no nulls')
4908+
4909+
sl2 = shared_memory.ShareableList(name=sl.shm.name)
4910+
self.addCleanup(sl2.shm.close)
4911+
self.assertEqual(sl2[0], b'\x03\x02\x01\x00\x00\x00')
4912+
self.assertEqual(sl2[1], '?\x00')
4913+
self.assertEqual(sl2[2], b'\x00\x00\x00')
4914+
self.assertEqual(sl2[3], b'')
4915+
self.assertEqual(sl2[4], b'no nulls')
4916+
4917+
def test_shared_memory_ShareableList_multibyte_utf8(self):
4918+
# gh-145261: ShareableList should correctly handle multi-byte
4919+
# UTF-8 strings without corruption or spillage.
4920+
sl = shared_memory.ShareableList([
4921+
'ascii', # 1-byte per char (5 bytes)
4922+
'café', # 2-byte char: é (5 bytes)
4923+
'中文测试', # 3-byte per char (12 bytes)
4924+
'𐀀𐀁', # 4-byte per char (8 bytes)
4925+
])
4926+
self.addCleanup(sl.shm.unlink)
4927+
self.addCleanup(sl.shm.close)
4928+
4929+
self.assertEqual(sl[0], 'ascii')
4930+
self.assertEqual(sl[1], 'café')
4931+
self.assertEqual(sl[2], '中文测试')
4932+
self.assertEqual(sl[3], '𐀀𐀁')
4933+
4934+
# Verify access through a second handle attached by name (as another process would).
4935+
sl2 = shared_memory.ShareableList(name=sl.shm.name)
4936+
self.addCleanup(sl2.shm.close)
4937+
self.assertEqual(sl2[0], 'ascii')
4938+
self.assertEqual(sl2[1], 'café')
4939+
self.assertEqual(sl2[2], '中文测试')
4940+
self.assertEqual(sl2[3], '𐀀𐀁')
4941+
48904942
def test_shared_memory_cleaned_after_process_termination(self):
48914943
cmd = '''if 1:
48924944
import os, time, sys
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix :class:`~multiprocessing.shared_memory.ShareableList` corrupting
2+
multi-byte UTF-8 strings due to using character count instead of byte count
3+
for slot allocation, and stripping legitimate trailing null bytes from
4+
:class:`bytes` and :class:`str` values.

0 commit comments

Comments
 (0)