diff --git a/doc/changes/dev/13548.bugfix.rst b/doc/changes/dev/13548.bugfix.rst new file mode 100644 index 00000000000..01c0d2eafcc --- /dev/null +++ b/doc/changes/dev/13548.bugfix.rst @@ -0,0 +1 @@ +Fix bug with reading large CNT files by `Teon Brooks`_. diff --git a/mne/annotations.py b/mne/annotations.py index e298e80918c..6fee30b7b93 100644 --- a/mne/annotations.py +++ b/mne/annotations.py @@ -1366,7 +1366,12 @@ def _write_annotations_txt(fname, annot): @fill_doc def read_annotations( - fname, sfreq="auto", uint16_codec=None, encoding="utf8", ignore_marker_types=False + fname, + sfreq="auto", + uint16_codec=None, + encoding="utf8", + ignore_marker_types=False, + data_format="auto", ) -> Annotations: r"""Read annotations from a file. @@ -1400,6 +1405,8 @@ def read_annotations( ignore_marker_types : bool If ``True``, ignore marker types in BrainVision files (and only use their descriptions). Defaults to ``False``. + data_format : str + Only used by CNT files, see :func:`mne.io.read_raw_cnt` for details. Returns ------- @@ -1444,6 +1451,7 @@ def read_annotations( kwargs = { ".vmrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types}, ".amrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types}, + ".cnt": {"data_format": data_format}, ".dat": {"sfreq": sfreq}, ".cdt": {"sfreq": sfreq}, ".cef": {"sfreq": sfreq}, diff --git a/mne/io/cnt/_utils.py b/mne/io/cnt/_utils.py index cf2d45cb1ef..590cd4ed6fe 100644 --- a/mne/io/cnt/_utils.py +++ b/mne/io/cnt/_utils.py @@ -10,7 +10,15 @@ import numpy as np -from ...utils import warn +from ...utils import _check_option, logger, warn + +# Offsets from SETUP structure in http://paulbourke.net/dataformats/eeg/ +_NCHANNELS_OFFSET = 370 +_NSAMPLES_OFFSET = 864 +_RATE_OFFSET = 376 +_EVENTTABLEPOS_OFFSET = 886 +_DATA_OFFSET = 900 # Size of the 'SETUP' header. +_CH_SIZE = 75 # Size of each channel in bytes def _read_teeg(f, teeg_offset): @@ -105,8 +113,8 @@ def _session_date_2_meas_date(session_date, date_format): return (int_part, frac_part) -def _compute_robust_event_table_position(fid, data_format="int32"): - """Compute `event_table_position`. +def _compute_robust_sizes(*, fid, data_format): + """Compute n_channels, n_samples, n_bytes, and event_table_position. When recording event_table_position is computed (as accomulation). If the file recording is large then this value overflows and ends up pointing @@ -115,36 +123,94 @@ def _compute_robust_event_table_position(fid, data_format="int32"): If the file is smaller than 2G the value in the SETUP is returned. Otherwise, the address of the table position is computed from: n_samples, n_channels, and the bytes size. - """ - SETUP_NCHANNELS_OFFSET = 370 - SETUP_NSAMPLES_OFFSET = 864 - SETUP_EVENTTABLEPOS_OFFSET = 886 - - fid_origin = fid.tell() # save the state - if fid.seek(0, SEEK_END) < 2e9: - fid.seek(SETUP_EVENTTABLEPOS_OFFSET) - (event_table_pos,) = np.frombuffer(fid.read(4), dtype=" file_size: + problem = ( + f"Event table offset from header ({event_offset}) is larger than file " + f"size ({file_size})" + ) + if data_format == "auto": + raise RuntimeError( + f"{problem}, cannot automatically compute data format, {workaround}" + ) + warn( + f"Event table offset from header ({event_offset}) is larger than file " + f"size ({file_size}), recomputing event table offset." + ) + n_bytes = 2 if data_format == "int16" else 4 + event_offset = samples_offset + n_samples * n_channels * n_bytes + n_data_bytes = event_offset - samples_offset if data_format == "auto": + n_bytes_per_samp, rem = divmod(n_data_bytes, n_channels) + why = "" + n_bytes = 2 + if rem != 0: + why = ( + f"number of data bytes {n_data_bytes} is not evenly divisible by " + f"{n_channels=}" + ) + elif n_samples == 0: + why = "number of read samples is 0" + else: + n_bytes, rem = divmod(n_bytes_per_samp, n_samples) + if rem != 0 or n_bytes not in [2, 4]: + why = ( + f"number of bytes per sample {n_bytes_per_samp} is not evenly " + f"divisible by {n_samples=} or does not result in 2 or 4 bytes " + f"per sample ({n_bytes=})" + ) + logger.debug("Inferred data format with %d bytes per sample", n_bytes) + if why: + raise RuntimeError( + "Could not automatically compute number of bytes per sample as the " + f"{why}. set data_format manually." + ) + else: + n_bytes = 2 if data_format == "int16" else 4 + logger.debug( + "Using %d bytes per sample from data_format=%s", n_bytes, data_format + ) + n_samples, rem = divmod(n_data_bytes, (n_channels * n_bytes)) + logger.debug("Computed number of samples: %d", n_samples) + if rem != 0: warn( + "Inconsistent file information detected, number of data bytes " + f"({n_data_bytes}) not evenly divisible by number of channels " + f"({n_channels}) times number of bytes ({n_bytes})" + ) + else: + logger.debug("File size >= 2GB, computing event table offset") + if data_format == "auto": + raise RuntimeError( "Using `data_format='auto' for a CNT file larger" - " than 2Gb is not granted to work. Please pass" - " 'int16' or 'int32'.` (assuming int32)" + " than 2Gb is not supported, explicitly pass data_format as " + "'int16' or 'int32'" ) - n_bytes = 2 if data_format == "int16" else 4 - - fid.seek(SETUP_NSAMPLES_OFFSET) - (n_samples,) = np.frombuffer(fid.read(4), dtype="= 0] fid.seek(438) - lowpass_toggle = np.fromfile(fid, "i1", count=1).item() - highpass_toggle = np.fromfile(fid, "i1", count=1).item() - - # Header has a field for number of samples, but it does not seem to be - # too reliable. That's why we have option for setting n_bytes manually. - fid.seek(864) - n_samples = np.fromfile(fid, dtype=" n_samples: - n_bytes = 4 - n_samples = n_samples_header - warn( - "Annotations are outside data range. " - "Changing data format to 'int32'." - ) - else: - n_bytes = data_size // (n_samples * n_channels) - else: - n_bytes = 2 if data_format == "int16" else 4 - n_samples = data_size // (n_bytes * n_channels) + cnt_info["continuous_seconds"] = float( + np.fromfile(fid, dtype=" 1: @@ -382,37 +333,37 @@ def _get_cnt_info(input_fname, eog, ecg, emg, misc, data_format, date_format, he _validate_type(header, str, "header") _check_option("header", header, ("auto", "new", "old")) for ch_idx in range(n_channels): # ELECTLOC fields - fid.seek(data_offset + 75 * ch_idx) + fid.seek(_DATA_OFFSET + _CH_SIZE * ch_idx) ch_name = read_str(fid, 10) ch_names.append(ch_name) # Some files have bad channels marked differently in the header. if header in ("new", "auto"): - fid.seek(data_offset + 75 * ch_idx + 14) + fid.seek(_DATA_OFFSET + _CH_SIZE * ch_idx + 14) if np.fromfile(fid, dtype="u1", count=1).item(): bads.append(ch_name) if header in ("old", "auto"): - fid.seek(data_offset + 75 * ch_idx + 4) + fid.seek(_DATA_OFFSET + _CH_SIZE * ch_idx + 4) if np.fromfile(fid, dtype="u1", count=1).item(): bads.append(ch_name) - fid.seek(data_offset + 75 * ch_idx + 19) + fid.seek(_DATA_OFFSET + _CH_SIZE * ch_idx + 19) xy = np.fromfile(fid, dtype="f4", count=2) xy[1] *= -1 # invert y-axis pos.append(xy) - fid.seek(data_offset + 75 * ch_idx + 47) + fid.seek(_DATA_OFFSET + _CH_SIZE * ch_idx + 47) # Baselines are subtracted before scaling the data. baselines.append(np.fromfile(fid, dtype="i2", count=1).item()) - fid.seek(data_offset + 75 * ch_idx + 59) + fid.seek(_DATA_OFFSET + _CH_SIZE * ch_idx + 59) sensitivity = np.fromfile(fid, dtype="f4", count=1).item() - fid.seek(data_offset + 75 * ch_idx + 71) + fid.seek(_DATA_OFFSET + _CH_SIZE * ch_idx + 71) cal = np.fromfile(fid, dtype="f4", count=1).item() cals.append(cal * sensitivity * 1e-6 / 204.8) info = _empty_info(sfreq) - if lowpass_toggle == 1: + if lowpass_toggle and highcutoff > 0: info["lowpass"] = highcutoff - if highpass_toggle == 1: + if highpass_toggle and lowcutoff > 0: info["highpass"] = lowcutoff subject_info = { "hand": hand, @@ -520,6 +471,7 @@ class RawCNT(BaseRaw): mne.io.Raw : Documentation of attributes and methods. """ + @verbose def __init__( self, input_fname, @@ -540,7 +492,9 @@ def __init__( else: _date_format = "%m/%d/%y %H:%M:%S" - input_fname = path.abspath(input_fname) + input_fname = _check_fname( + input_fname, overwrite="read", must_exist=True, name="input_fname" + ) try: info, cnt_info = _get_cnt_info( input_fname, eog, ecg, emg, misc, data_format, _date_format, header @@ -565,7 +519,9 @@ def __init__( data_format = "int32" if cnt_info["n_bytes"] == 4 else "int16" self.set_annotations( - _read_annotations_cnt(input_fname, data_format=data_format) + _read_annotations_cnt( + input_fname, data_format=data_format, verbose=_verbose_safe_false() + ) ) def _read_segment_file(self, data, idx, fi, start, stop, cals, mult): @@ -594,7 +550,9 @@ def _read_segment_file(self, data, idx, fi, start, stop, cals, mult): block_size = min(data_left, block_size) s_offset = start % channel_offset with open(self.filenames[fi], "rb", buffering=0) as fid: - fid.seek(900 + f_channels * (75 + (start - s_offset) * n_bytes)) + fid.seek( + _DATA_OFFSET + f_channels * (_CH_SIZE + (start - s_offset) * n_bytes) + ) for sample_start in np.arange(0, data_left, block_size) // f_channels: # Earlier comment says n_samples is unreliable, but I think it # is because it needed to be changed to unsigned int diff --git a/mne/io/cnt/tests/test_cnt.py b/mne/io/cnt/tests/test_cnt.py index f98253b1317..d41cbfcd122 100644 --- a/mne/io/cnt/tests/test_cnt.py +++ b/mne/io/cnt/tests/test_cnt.py @@ -19,14 +19,23 @@ _no_parse = pytest.warns(RuntimeWarning, match="Could not parse") +inconsistent = pytest.warns(RuntimeWarning, match="Inconsistent file information") +outside = pytest.warns(RuntimeWarning, match="outside the data") +omitted = pytest.warns(RuntimeWarning, match="Omitted 4 annot") @testing.requires_testing_data def test_old_data(): """Test reading raw cnt files.""" - with _no_parse, pytest.warns(RuntimeWarning, match="number of bytes"): + with _no_parse, pytest.raises(RuntimeError, match="number of bytes"): + read_raw_cnt(input_fname=fname, eog="auto", misc=["NA1", "LEFT_EAR"]) + with _no_parse: raw = _test_raw_reader( - read_raw_cnt, input_fname=fname, eog="auto", misc=["NA1", "LEFT_EAR"] + read_raw_cnt, + input_fname=fname, + eog="auto", + misc=["NA1", "LEFT_EAR"], + data_format="int16", ) # make sure we use annotations event if we synthesized stim @@ -43,8 +52,10 @@ def test_old_data(): @testing.requires_testing_data def test_new_data(): """Test reading raw cnt files with different header.""" - with pytest.warns(RuntimeWarning): - raw = read_raw_cnt(input_fname=fname_bad_spans, header="new") + with inconsistent, outside, omitted: + raw = read_raw_cnt( + input_fname=fname_bad_spans, header="new", data_format="int32" + ) assert raw.info["bads"] == ["F8"] # test bads @@ -52,18 +63,21 @@ def test_new_data(): @testing.requires_testing_data def test_auto_data(): """Test reading raw cnt files with automatic header.""" - first = pytest.warns(RuntimeWarning, match="Could not define the number of bytes.*") - second = pytest.warns(RuntimeWarning, match="Annotations are outside") - third = pytest.warns(RuntimeWarning, match="Omitted 6 annot") - with first, second, third: - raw = read_raw_cnt(input_fname=fname_bad_spans) + with inconsistent, outside, omitted: + raw = read_raw_cnt( + input_fname=fname_bad_spans, data_format="int16", verbose="debug" + ) # Test that responses are read properly assert "KeyPad Response 1" in raw.annotations.description assert raw.info["bads"] == ["F8"] - with _no_parse, pytest.warns(RuntimeWarning, match="number of bytes"): + with _no_parse: raw = _test_raw_reader( - read_raw_cnt, input_fname=fname, eog="auto", misc=["NA1", "LEFT_EAR"] + read_raw_cnt, + input_fname=fname, + eog="auto", + misc=["NA1", "LEFT_EAR"], + data_format="int16", ) # make sure we use annotations event if we synthesized stim @@ -80,33 +94,35 @@ def test_auto_data(): @testing.requires_testing_data def test_compare_events_and_annotations(): """Test comparing annotations and events.""" - with _no_parse, pytest.warns(RuntimeWarning, match="Could not define the num"): - raw = read_raw_cnt(fname) + with _no_parse: + raw = read_raw_cnt(fname, data_format="int16") events = np.array( [[333, 0, 7], [1010, 0, 7], [1664, 0, 109], [2324, 0, 7], [2984, 0, 109]] ) - annot = read_annotations(fname) + annot = read_annotations(fname, data_format="int16") assert len(annot) == 6 assert_array_equal(annot.onset[:-1], events[:, 0] / raw.info["sfreq"]) assert "STI 014" not in raw.info["ch_names"] @testing.requires_testing_data -@pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_reading_bytes(): """Test reading raw cnt files with different header.""" - raw_16 = read_raw_cnt(fname, preload=True) - raw_32 = read_raw_cnt(fname_bad_spans, preload=True) + with _no_parse: + raw_16 = read_raw_cnt(fname, preload=True, data_format="int16") + with inconsistent, outside, omitted: + raw_32 = read_raw_cnt(fname_bad_spans, preload=True, data_format="int32") # Verify that the number of bytes read is correct assert len(raw_16) == 3070 - assert len(raw_32) == 90000 + assert len(raw_32) == 143765 # TODO: Used to be 90000! Need to eyeball @testing.requires_testing_data def test_bad_spans(): """Test reading raw cnt files with bad spans.""" - annot = read_annotations(fname_bad_spans) + with inconsistent: + annot = read_annotations(fname_bad_spans, data_format="int32") temp = "\t".join(annot.description) assert "BAD" in temp diff --git a/mne/io/fieldtrip/tests/helpers.py b/mne/io/fieldtrip/tests/helpers.py index 0a0f6671d6b..59df5c61108 100644 --- a/mne/io/fieldtrip/tests/helpers.py +++ b/mne/io/fieldtrip/tests/helpers.py @@ -49,7 +49,7 @@ system_to_reader_fn_dict = { "neuromag306": mne.io.read_raw_fif, - "CNT": partial(mne.io.read_raw_cnt), + "CNT": partial(mne.io.read_raw_cnt, data_format="int16"), "CTF": partial(mne.io.read_raw_ctf, clean_names=True), "BTI": partial( mne.io.read_raw_bti,