1 change: 1 addition & 0 deletions doc/changes/dev/13548.bugfix.rst
@@ -0,0 +1 @@
Fix bug with reading large CNT files by `Teon Brooks`_.
10 changes: 9 additions & 1 deletion mne/annotations.py
@@ -1366,7 +1366,12 @@ def _write_annotations_txt(fname, annot):

@fill_doc
def read_annotations(
fname, sfreq="auto", uint16_codec=None, encoding="utf8", ignore_marker_types=False
fname,
sfreq="auto",
uint16_codec=None,
encoding="utf8",
ignore_marker_types=False,
data_format="auto",
) -> Annotations:
r"""Read annotations from a file.

@@ -1400,6 +1405,8 @@ def read_annotations(
ignore_marker_types : bool
If ``True``, ignore marker types in BrainVision files (and only use their
descriptions). Defaults to ``False``.
data_format : str
Only used by CNT files, see :func:`mne.io.read_raw_cnt` for details.

Returns
-------
@@ -1444,6 +1451,7 @@ def read_annotations(
kwargs = {
".vmrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
".amrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
".cnt": {"data_format": data_format},
".dat": {"sfreq": sfreq},
".cdt": {"sfreq": sfreq},
".cef": {"sfreq": sfreq},
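The new `data_format` keyword is only consumed for `.cnt` files, where `read_annotations` forwards it to the CNT reader through the kwargs table above. A minimal usage sketch, assuming a hypothetical oversized CNT recording (the file path is made up for illustration):

```python
import mne

# For CNT files larger than 2 GB the byte width of the samples cannot be
# inferred automatically, so it is stated explicitly (hypothetical path).
annot = mne.read_annotations("large_recording.cnt", data_format="int32")
```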
122 changes: 94 additions & 28 deletions mne/io/cnt/_utils.py
@@ -10,7 +10,15 @@

import numpy as np

from ...utils import warn
from ...utils import _check_option, logger, warn

# Offsets from SETUP structure in http://paulbourke.net/dataformats/eeg/
_NCHANNELS_OFFSET = 370
_NSAMPLES_OFFSET = 864
_RATE_OFFSET = 376
_EVENTTABLEPOS_OFFSET = 886
_DATA_OFFSET = 900 # Size of the 'SETUP' header.
_CH_SIZE = 75 # Size of each channel in bytes


def _read_teeg(f, teeg_offset):
@@ -105,8 +113,8 @@ def _session_date_2_meas_date(session_date, date_format):
return (int_part, frac_part)


def _compute_robust_event_table_position(fid, data_format="int32"):
"""Compute `event_table_position`.
def _compute_robust_sizes(*, fid, data_format):
"""Compute n_channels, n_samples, n_bytes, and event_table_position.

When recording, event_table_position is computed (as an accumulation). If the
file recording is large then this value overflows and ends up pointing
@@ -115,36 +123,94 @@ def _compute_robust_event_table_position(fid, data_format="int32"):
If the file is smaller than 2G the value in the SETUP is returned.
Otherwise, the address of the table position is computed from:
n_samples, n_channels, and the number of bytes per sample.
"""
SETUP_NCHANNELS_OFFSET = 370
SETUP_NSAMPLES_OFFSET = 864
SETUP_EVENTTABLEPOS_OFFSET = 886

fid_origin = fid.tell() # save the state

if fid.seek(0, SEEK_END) < 2e9:
fid.seek(SETUP_EVENTTABLEPOS_OFFSET)
(event_table_pos,) = np.frombuffer(fid.read(4), dtype="<i4")

else:
Reference: https://paulbourke.net/dataformats/eeg/
Header has a field for number of samples, but it does not seem to be
too reliable.
"""
_check_option("data_format", data_format, ["auto", "int16", "int32"])
# Read the number of channels and samples from the header
fid.seek(_NCHANNELS_OFFSET)
n_channels = int(np.fromfile(fid, dtype="<u2", count=1).item())
logger.debug("Number of channels: %d", n_channels)
fid.seek(_NSAMPLES_OFFSET)
n_samples = int(np.frombuffer(fid.read(4), dtype="<i4").item()) # may be unreliable
logger.debug("Header number of samples: %d", n_samples)
file_size = fid.seek(0, SEEK_END)
workaround = "pass data_format='int16' or 'int32' explicitly"
samples_offset = _DATA_OFFSET + _CH_SIZE * n_channels
if file_size < 2e9:
# Our most reliable way to get the number of samples is to compute it
logger.debug("File size < 2GB, using header values")
fid.seek(_EVENTTABLEPOS_OFFSET)
event_offset = int(np.frombuffer(fid.read(4), dtype="<i4").item())
logger.debug("Event table offset from header: %d", event_offset)
if event_offset > file_size:
problem = (
f"Event table offset from header ({event_offset}) is larger than file "
f"size ({file_size})"
)
if data_format == "auto":
raise RuntimeError(
f"{problem}, cannot automatically compute data format, {workaround}"
)
warn(
f"Event table offset from header ({event_offset}) is larger than file "
f"size ({file_size}), recomputing event table offset."
)
n_bytes = 2 if data_format == "int16" else 4
event_offset = samples_offset + n_samples * n_channels * n_bytes
n_data_bytes = event_offset - samples_offset
if data_format == "auto":
n_bytes_per_samp, rem = divmod(n_data_bytes, n_channels)
why = ""
n_bytes = 2
if rem != 0:
why = (
f"number of data bytes {n_data_bytes} is not evenly divisible by "
f"{n_channels=}"
)
elif n_samples == 0:
why = "number of read samples is 0"
else:
n_bytes, rem = divmod(n_bytes_per_samp, n_samples)
if rem != 0 or n_bytes not in [2, 4]:
why = (
f"number of bytes per sample {n_bytes_per_samp} is not evenly "
f"divisible by {n_samples=} or does not result in 2 or 4 bytes "
f"per sample ({n_bytes=})"
)
logger.debug("Inferred data format with %d bytes per sample", n_bytes)
if why:
raise RuntimeError(
"Could not automatically compute number of bytes per sample as the "
f"{why}. set data_format manually."
)
else:
n_bytes = 2 if data_format == "int16" else 4
logger.debug(
"Using %d bytes per sample from data_format=%s", n_bytes, data_format
)
n_samples, rem = divmod(n_data_bytes, (n_channels * n_bytes))
logger.debug("Computed number of samples: %d", n_samples)
if rem != 0:
warn(
"Inconsistent file information detected, number of data bytes "
f"({n_data_bytes}) not evenly divisible by number of channels "
f"({n_channels}) times number of bytes ({n_bytes})"
)
else:
logger.debug("File size >= 2GB, computing event table offset")
if data_format == "auto":
raise RuntimeError(
"Using `data_format='auto' for a CNT file larger"
" than 2Gb is not granted to work. Please pass"
" 'int16' or 'int32'.` (assuming int32)"
" than 2Gb is not supported, explicitly pass data_format as "
"'int16' or 'int32'"
)

n_bytes = 2 if data_format == "int16" else 4

fid.seek(SETUP_NSAMPLES_OFFSET)
(n_samples,) = np.frombuffer(fid.read(4), dtype="<i4")

fid.seek(SETUP_NCHANNELS_OFFSET)
(n_channels,) = np.frombuffer(fid.read(2), dtype="<u2")

event_table_pos = (
900 + 75 * int(n_channels) + n_bytes * int(n_channels) * int(n_samples)
event_offset = (
_DATA_OFFSET + _CH_SIZE * n_channels + n_bytes * n_channels * n_samples
)
logger.debug("Computed event table offset: %d", event_offset)

fid.seek(fid_origin) # restore the state
return event_table_pos
return n_channels, n_samples, n_bytes, event_offset
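For reference, a standalone sketch of the bookkeeping `_compute_robust_sizes` performs, using the SETUP offsets defined above; the channel and sample counts are illustrative, not taken from a real file:

```python
# Illustrative values: 64 channels, 1e6 samples, int32 data.
_DATA_OFFSET, _CH_SIZE = 900, 75
n_channels, n_samples, n_bytes = 64, 1_000_000, 4

# Forward direction (file >= 2 GB): recompute where the event table starts,
# since the 32-bit offset stored in the SETUP header may have overflowed.
samples_offset = _DATA_OFFSET + _CH_SIZE * n_channels
event_offset = samples_offset + n_samples * n_channels * n_bytes

# Reverse direction (file < 2 GB, data_format="auto"): infer bytes per sample
# from the distance between the start of the data block and the event table.
n_data_bytes = event_offset - samples_offset
n_bytes_inferred = n_data_bytes // n_channels // n_samples
assert n_bytes_inferred in (2, 4)  # int16 or int32
```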