diff --git a/dlio_benchmark/reader/_local_fs_iterable_mixin.py b/dlio_benchmark/reader/_local_fs_iterable_mixin.py index e681130e..b222b0c6 100644 --- a/dlio_benchmark/reader/_local_fs_iterable_mixin.py +++ b/dlio_benchmark/reader/_local_fs_iterable_mixin.py @@ -29,11 +29,11 @@ TWO PREFETCH MODES ================== -storage_library: +Buffered (default — neither uri_scheme nor storage_library selects direct) ThreadPoolExecutor(64) + Python open() + buffered read. Simple, portable, uses OS page cache. -storage_library: "direct" +O_DIRECT (uri_scheme: "direct" OR storage_library: "direct") s3dlio.get_many() with direct:// URIs. Uses O_DIRECT (Linux) — bypasses page cache entirely, 4 KiB-aligned I/O via Tokio async tasks in the s3dlio Rust runtime. GIL is released for the @@ -42,6 +42,11 @@ the page cache rather than the device, understating storage latency and saturating DRAM bandwidth instead of device bandwidth. + Either knob triggers this path. ``uri_scheme: "direct"`` is the canonical + signal (matches ``storage_type=direct_fs`` validation, which requires + ``storage_library=s3dlio``); ``storage_library: "direct"`` is the legacy + single-knob form, kept for backward compatibility. See storage#567. + USAGE PATTERN ============= Subclass from BOTH the format-specific parent AND this mixin:: @@ -82,18 +87,31 @@ class _LocalFSIterableMixin: call ``_localfs_init()`` from the subclass ``__init__`` after ``super().__init__()``. - Set ``storage_library: direct`` in storage_options to use s3dlio's O_DIRECT - path (bypasses page cache — essential for accurate NVMe benchmarking). - Default (no storage_library, or ``posix``) uses buffered Python open(). + The O_DIRECT path (s3dlio with ``direct://`` URIs — bypasses page cache, + essential for accurate NVMe benchmarking) is selected when EITHER: + + - ``storage_options.uri_scheme == "direct"`` — the canonical signal, + matching how ``storage_type=direct_fs`` is configured upstream + (mlcommons/storage's ``--o-direct`` sets uri_scheme=direct and + storage_library=s3dlio per ``utils/config.py`` direct_fs validation). + - ``storage_options.storage_library == "direct"`` — the legacy + single-knob form, kept for backward compatibility. + + Anything else falls back to buffered Python ``open()``. """ def _localfs_init(self) -> None: """ Initialise mixin state. - Reads ``storage_options.storage_library`` from ConfigArguments: - - ``"direct"`` → s3dlio O_DIRECT path (``direct://`` URIs, Tokio, GIL-free) - - anything else → buffered Python ThreadPoolExecutor path + Selects the O_DIRECT prefetch path when EITHER + ``storage_options.uri_scheme == "direct"`` OR + ``storage_options.storage_library == "direct"``. The uri_scheme + gate exists so configs produced by ``storage_type=direct_fs`` + (which validates ``storage_library=s3dlio``, NOT ``direct``) + still reach the s3dlio O_DIRECT engine. Without it, those configs + hand ``direct://...`` URIs to plain ``open()`` and crash with + FileNotFoundError — see mlcommons/storage#567. Sets: - ``self._local_cache`` (dict: filename → int byte count) @@ -108,14 +126,16 @@ def _localfs_init(self) -> None: opts = getattr(self._args, "storage_options", {}) or {} lib = opts.get("storage_library", "") - self._use_direct: bool = (lib == "direct") + scheme = opts.get("uri_scheme", "") + self._use_direct: bool = (scheme == "direct") or (lib == "direct") if self._use_direct: try: import s3dlio as _s3dlio # noqa: F401 except ImportError as exc: raise ImportError( - f"{self.__class__.__name__}: storage_library='direct' requires " + f"{self.__class__.__name__}: O_DIRECT mode " + f"(uri_scheme='direct' or storage_library='direct') requires " "the s3dlio package. Install with: pip install s3dlio" ) from exc diff --git a/tests/test_direct_fs_iterable_mixin_gate.py b/tests/test_direct_fs_iterable_mixin_gate.py new file mode 100644 index 00000000..32d02a3c --- /dev/null +++ b/tests/test_direct_fs_iterable_mixin_gate.py @@ -0,0 +1,135 @@ +""" +Regression tests for the O_DIRECT prefetch-path gate in +``_LocalFSIterableMixin._localfs_init``. + +Background — mlcommons/storage#567 +================================== +Before this fix, the mixin only selected the s3dlio O_DIRECT path when +``storage_options.storage_library == "direct"``. But ``storage_type=direct_fs`` +(reached from mlpstorage's ``--o-direct``) is *required* by +``utils/config.py`` to set ``storage_library == "s3dlio"``. With the old +``lib == "direct"`` gate that meant ``_use_direct`` was always ``False`` for +``direct_fs`` runs — the mixin then handed ``direct://…`` URIs to plain +``open()`` and crashed with ``FileNotFoundError`` on the warmup batch for +every local NPZ / NPY / JPEG workload (UNet3D, ResNet, …). + +The fix accepts EITHER signal: + - ``uri_scheme == "direct"`` (canonical — matches direct_fs validation) + - ``storage_library == "direct"`` (legacy single-knob form) + +These tests lock both signals so the gate can't silently drift back to the +single-knob check that broke direct_fs. +""" +import pytest +from unittest.mock import MagicMock + +from dlio_benchmark.reader._local_fs_iterable_mixin import _LocalFSIterableMixin + + +def _make_instance(storage_options): + """Build a bare mixin instance whose ``_args.storage_options`` returns + the given dict. We bypass ``__init__`` on purpose — ``_localfs_init`` is + the only method under test, and the full reader stack pulls in mpi4py + + torch + hydra which is overkill for a sentinel-comparison unit test. + """ + instance = _LocalFSIterableMixin.__new__(_LocalFSIterableMixin) + instance._args = MagicMock() + instance._args.storage_options = storage_options + return instance + + +class TestUseDirectGateForDirectFsConfig: + """storage#567: configs emitted by ``storage_type=direct_fs`` MUST take + the O_DIRECT path.""" + + def test_direct_fs_shape_selects_direct(self): + """The exact config shape mlpstorage's ``--o-direct`` produces: + storage_library=s3dlio + uri_scheme=direct. Pre-fix this was the + broken case — ``_use_direct`` was False and the mixin handed + ``direct://…`` to ``open()``.""" + inst = _make_instance({"storage_library": "s3dlio", "uri_scheme": "direct"}) + inst._localfs_init() + assert inst._use_direct is True, ( + "storage#567 regression: storage_library=s3dlio + uri_scheme=direct " + "MUST select the O_DIRECT path. _use_direct=False means the " + "buffered open() path will be invoked on direct:// URIs." + ) + + def test_uri_scheme_direct_alone_selects_direct(self): + """uri_scheme is sufficient on its own — storage_library may be + absent or set to any non-'direct' value.""" + inst = _make_instance({"uri_scheme": "direct"}) + inst._localfs_init() + assert inst._use_direct is True + + +class TestUseDirectGateLegacyForm: + """Backward compatibility: the legacy single-knob form must keep + working.""" + + def test_storage_library_direct_alone_selects_direct(self): + inst = _make_instance({"storage_library": "direct"}) + inst._localfs_init() + assert inst._use_direct is True + + def test_both_knobs_set_to_direct_selects_direct(self): + inst = _make_instance({"storage_library": "direct", "uri_scheme": "direct"}) + inst._localfs_init() + assert inst._use_direct is True + + +class TestUseDirectGateBufferedFallback: + """The buffered path must remain the default for non-direct configs.""" + + def test_empty_options_selects_buffered(self): + inst = _make_instance({}) + inst._localfs_init() + assert inst._use_direct is False + + def test_none_options_selects_buffered(self): + """getattr returning None must not blow up; treat as buffered.""" + instance = _LocalFSIterableMixin.__new__(_LocalFSIterableMixin) + instance._args = MagicMock() + instance._args.storage_options = None + instance._localfs_init() + assert instance._use_direct is False + + def test_posix_storage_library_selects_buffered(self): + inst = _make_instance({"storage_library": "posix"}) + inst._localfs_init() + assert inst._use_direct is False + + def test_s3dlio_with_s3_uri_scheme_selects_buffered(self): + """storage_library=s3dlio is also used for real S3 (uri_scheme=s3). + That must NOT trigger the local O_DIRECT path.""" + inst = _make_instance({"storage_library": "s3dlio", "uri_scheme": "s3"}) + inst._localfs_init() + assert inst._use_direct is False, ( + "storage_library=s3dlio alone must not imply local O_DIRECT — " + "the same library serves remote S3 with uri_scheme=s3." + ) + + def test_s3dlio_with_file_uri_scheme_selects_buffered(self): + """file:// is a separate s3dlio mode (buffered local FS via s3dlio); + not the O_DIRECT path.""" + inst = _make_instance({"storage_library": "s3dlio", "uri_scheme": "file"}) + inst._localfs_init() + assert inst._use_direct is False + + +class TestUseDirectInitsCounters: + """The init must always seed the bookkeeping attributes the rest of + the mixin reads, regardless of which path is selected.""" + + @pytest.mark.parametrize("opts", [ + {"storage_library": "s3dlio", "uri_scheme": "direct"}, + {"storage_library": "direct"}, + {}, + {"storage_library": "posix"}, + ]) + def test_counters_seeded(self, opts): + inst = _make_instance(opts) + inst._localfs_init() + assert inst._local_cache == {} + assert inst._total_bytes_read == 0 + assert inst._total_objects_read == 0