From 6b356868eb5f010398948ee6290d04fb094d2c8c Mon Sep 17 00:00:00 2001 From: Taksh Date: Fri, 10 Apr 2026 14:44:26 +0530 Subject: [PATCH] Apply subjects filter to chunked reads in sqlite/import.py When the MIMIC-IV CSV being imported exceeds THRESHOLD_SIZE (50MB), the loader switches to a chunked pd.read_csv loop, but the chunked path calls process_dataframe(chunk) without passing 'subjects'. The small-file path just above passes subjects=subjects, and process_dataframe only filters to the requested subject_ids when that argument is supplied, so --limit is silently ignored for any table large enough to be chunked (e.g. chartevents, labevents). The resulting database ends up containing every subject in those tables instead of the limited subset the user asked for. Pass subjects=subjects in the chunked call so --limit is honored for large tables as well. --- mimic-iv/buildmimic/sqlite/import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mimic-iv/buildmimic/sqlite/import.py b/mimic-iv/buildmimic/sqlite/import.py index 1cb8eb0d..68d43647 100644 --- a/mimic-iv/buildmimic/sqlite/import.py +++ b/mimic-iv/buildmimic/sqlite/import.py @@ -165,7 +165,7 @@ def main(): else: # If the file is too large, let's do the work in chunks for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False, dtype=mimic_dtypes): - chunk = process_dataframe(chunk) + chunk = process_dataframe(chunk, subjects=subjects) chunk.to_sql(tablename, connection, if_exists="append", index=False) row_counts[tablename] += len(chunk) print("done!")