From 6b356868eb5f010398948ee6290d04fb094d2c8c Mon Sep 17 00:00:00 2001
From: Taksh <takshkothari09@gmail.com>
Date: Fri, 10 Apr 2026 14:44:26 +0530
Subject: [PATCH] Apply subjects filter to chunked reads in sqlite/import.py

When the MIMIC-IV CSV being imported exceeds THRESHOLD_SIZE (50MB), the
loader switches to a chunked pd.read_csv loop, but the chunked path
calls process_dataframe(chunk) without passing 'subjects'. The
small-file path just above passes subjects=subjects, and
process_dataframe only filters to the requested subject_ids when that
argument is supplied, so --limit is silently ignored for any table
large enough to be chunked (e.g. chartevents, labevents). The resulting
database ends up containing every subject in those tables instead of
the limited subset the user asked for.

Pass subjects=subjects in the chunked call so --limit is honored for
large tables as well.
---
 mimic-iv/buildmimic/sqlite/import.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mimic-iv/buildmimic/sqlite/import.py b/mimic-iv/buildmimic/sqlite/import.py
index 1cb8eb0d..68d43647 100644
--- a/mimic-iv/buildmimic/sqlite/import.py
+++ b/mimic-iv/buildmimic/sqlite/import.py
@@ -165,7 +165,7 @@ def main():
             else:
                 # If the file is too large, let's do the work in chunks
                 for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False, dtype=mimic_dtypes):
-                    chunk = process_dataframe(chunk)
+                    chunk = process_dataframe(chunk, subjects=subjects)
                     chunk.to_sql(tablename, connection, if_exists="append", index=False)
                     row_counts[tablename] += len(chunk)
             print("done!")