1 change: 1 addition & 0 deletions doc/changes/dev/13548.bugfix.rst
@@ -0,0 +1 @@
Fix bug with reading large CNT files by `Teon Brooks`_.
10 changes: 9 additions & 1 deletion mne/annotations.py
@@ -1366,7 +1366,12 @@ def _write_annotations_txt(fname, annot):

@fill_doc
def read_annotations(
fname, sfreq="auto", uint16_codec=None, encoding="utf8", ignore_marker_types=False
fname,
sfreq="auto",
uint16_codec=None,
encoding="utf8",
ignore_marker_types=False,
data_format="auto",
) -> Annotations:
r"""Read annotations from a file.

@@ -1400,6 +1405,8 @@ def read_annotations(
ignore_marker_types : bool
If ``True``, ignore marker types in BrainVision files (and only use their
descriptions). Defaults to ``False``.
data_format : str
Only used by CNT files, see :func:`mne.io.read_raw_cnt` for details.

Returns
-------
@@ -1444,6 +1451,7 @@ def read_annotations(
kwargs = {
".vmrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
".amrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
".cnt": {"data_format": data_format},
".dat": {"sfreq": sfreq},
".cdt": {"sfreq": sfreq},
".cef": {"sfreq": sfreq},
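The new `data_format` keyword is only consumed for `.cnt` files, where `read_annotations` forwards it to the CNT reader through the kwargs table above. A minimal usage sketch, assuming a hypothetical oversized CNT recording (the file path is made up for illustration):

```python
import mne

# For CNT files larger than 2 GB the byte width of the samples cannot be
# inferred automatically, so it is stated explicitly (hypothetical path).
annot = mne.read_annotations("large_recording.cnt", data_format="int32")
```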
122 changes: 94 additions & 28 deletions mne/io/cnt/_utils.py
@@ -10,7 +10,15 @@

import numpy as np

from ...utils import warn
from ...utils import _check_option, logger, warn

# Offsets from SETUP structure in http://paulbourke.net/dataformats/eeg/
_NCHANNELS_OFFSET = 370
_NSAMPLES_OFFSET = 864
_RATE_OFFSET = 376
_EVENTTABLEPOS_OFFSET = 886
_DATA_OFFSET = 900 # Size of the 'SETUP' header.
_CH_SIZE = 75 # Size of each channel in bytes


def _read_teeg(f, teeg_offset):
@@ -105,8 +113,8 @@ def _session_date_2_meas_date(session_date, date_format):
return (int_part, frac_part)


def _compute_robust_event_table_position(fid, data_format="int32"):
"""Compute `event_table_position`.
def _compute_robust_sizes(*, fid, data_format):
"""Compute n_channels, n_samples, n_bytes, and event_table_position.

When recording, event_table_position is computed (as an accumulation). If the
file recording is large then this value overflows and ends up pointing
@@ -115,36 +123,94 @@ def _compute_robust_event_table_position(fid, data_format="int32"):
If the file is smaller than 2G the value in the SETUP is returned.
Otherwise, the address of the table position is computed from:
n_samples, n_channels, and the number of bytes per sample.
"""
SETUP_NCHANNELS_OFFSET = 370
SETUP_NSAMPLES_OFFSET = 864
SETUP_EVENTTABLEPOS_OFFSET = 886

fid_origin = fid.tell() # save the state

if fid.seek(0, SEEK_END) < 2e9:
fid.seek(SETUP_EVENTTABLEPOS_OFFSET)
(event_table_pos,) = np.frombuffer(fid.read(4), dtype="<i4")

else:
Reference: https://paulbourke.net/dataformats/eeg/
Header has a field for number of samples, but it does not seem to be
too reliable.
"""
_check_option("data_format", data_format, ["auto", "int16", "int32"])
# Read the number of channels and samples from the header
fid.seek(_NCHANNELS_OFFSET)
n_channels = int(np.fromfile(fid, dtype="<u2", count=1).item())
logger.debug("Number of channels: %d", n_channels)
fid.seek(_NSAMPLES_OFFSET)
n_samples = int(np.frombuffer(fid.read(4), dtype="<i4").item()) # may be unreliable
logger.debug("Header number of samples: %d", n_samples)
file_size = fid.seek(0, SEEK_END)
workaround = "pass data_format='int16' or 'int32' explicitly"
samples_offset = _DATA_OFFSET + _CH_SIZE * n_channels
if file_size < 2e9:
# Our most reliable way to get the number of samples is to compute it
logger.debug("File size < 2GB, using header values")
fid.seek(_EVENTTABLEPOS_OFFSET)
event_offset = int(np.frombuffer(fid.read(4), dtype="<i4").item())
logger.debug("Event table offset from header: %d", event_offset)
if event_offset > file_size:
problem = (
f"Event table offset from header ({event_offset}) is larger than file "
f"size ({file_size})"
)
if data_format == "auto":
raise RuntimeError(
f"{problem}, cannot automatically compute data format, {workaround}"
)
warn(
f"Event table offset from header ({event_offset}) is larger than file "
f"size ({file_size}), recomputing event table offset."
)
n_bytes = 2 if data_format == "int16" else 4
event_offset = samples_offset + n_samples * n_channels * n_bytes
n_data_bytes = event_offset - samples_offset
if data_format == "auto":
n_bytes_per_samp, rem = divmod(n_data_bytes, n_channels)
why = ""
n_bytes = 2
if rem != 0:
why = (
f"number of data bytes {n_data_bytes} is not evenly divisible by "
f"{n_channels=}"
)
elif n_samples == 0:
why = "number of read samples is 0"
else:
n_bytes, rem = divmod(n_bytes_per_samp, n_samples)
if rem != 0 or n_bytes not in [2, 4]:
why = (
f"number of bytes per sample {n_bytes_per_samp} is not evenly "
f"divisible by {n_samples=} or does not result in 2 or 4 bytes "
f"per sample ({n_bytes=})"
)
logger.debug("Inferred data format with %d bytes per sample", n_bytes)
if why:
raise RuntimeError(
"Could not automatically compute number of bytes per sample as the "
f"{why}. set data_format manually."
)
else:
n_bytes = 2 if data_format == "int16" else 4
logger.debug(
"Using %d bytes per sample from data_format=%s", n_bytes, data_format
)
n_samples, rem = divmod(n_data_bytes, (n_channels * n_bytes))
logger.debug("Computed number of samples: %d", n_samples)
if rem != 0:
warn(
"Inconsistent file information detected, number of data bytes "
f"({n_data_bytes}) not evenly divisible by number of channels "
f"({n_channels}) times number of bytes ({n_bytes})"
)
else:
logger.debug("File size >= 2GB, computing event table offset")
if data_format == "auto":
raise RuntimeError(
"Using `data_format='auto' for a CNT file larger"
" than 2Gb is not granted to work. Please pass"
" 'int16' or 'int32'.` (assuming int32)"
" than 2Gb is not supported, explicitly pass data_format as "
"'int16' or 'int32'"
)

n_bytes = 2 if data_format == "int16" else 4

fid.seek(SETUP_NSAMPLES_OFFSET)
(n_samples,) = np.frombuffer(fid.read(4), dtype="<i4")

fid.seek(SETUP_NCHANNELS_OFFSET)
(n_channels,) = np.frombuffer(fid.read(2), dtype="<u2")

event_table_pos = (
900 + 75 * int(n_channels) + n_bytes * int(n_channels) * int(n_samples)
event_offset = (
_DATA_OFFSET + _CH_SIZE * n_channels + n_bytes * n_channels * n_samples
)
logger.debug("Computed event table offset: %d", event_offset)

fid.seek(fid_origin) # restore the state
return event_table_pos
return n_channels, n_samples, n_bytes, event_offset
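For reference, a standalone sketch of the bookkeeping `_compute_robust_sizes` performs, using the SETUP offsets defined above; the channel and sample counts are illustrative, not taken from a real file:

```python
# Illustrative values: 64 channels, 1e6 samples, int32 data.
_DATA_OFFSET, _CH_SIZE = 900, 75
n_channels, n_samples, n_bytes = 64, 1_000_000, 4

# Forward direction (file >= 2 GB): recompute where the event table starts,
# since the 32-bit offset stored in the SETUP header may have overflowed.
samples_offset = _DATA_OFFSET + _CH_SIZE * n_channels
event_offset = samples_offset + n_samples * n_channels * n_bytes

# Reverse direction (file < 2 GB, data_format="auto"): infer bytes per sample
# from the distance between the start of the data block and the event table.
n_data_bytes = event_offset - samples_offset
n_bytes_inferred = n_data_bytes // n_channels // n_samples
assert n_bytes_inferred in (2, 4)  # int16 or int32
```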