PennChopMicrobiomeProgram · Ulthran · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026
diff --git a/seqBackupLib/backup.py b/seqBackupLib/backup.py
@@ -47,6 +47,7 @@ def backup_fastq(
     sample_sheet_fp: Path,
     has_index: bool,
     min_file_size: int,
+    allow_check_failures: bool = False,
 ):
 
     R1 = IlluminaFastq(gzip.open(forward_reads, mode="rt"))
@@ -58,25 +59,47 @@ def backup_fastq(
     illumina_fastqs = [IlluminaFastq(gzip.open(fp, mode="rt")) for fp in RI_fps]
     r1 = illumina_fastqs[0]
 
-    if not all([ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]):
+    fp_vs_content_results = [ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]
+    if not all(fp_vs_content_results):
         [ifq.check_fp_vs_content(verbose=True) for ifq in illumina_fastqs]
-        raise ValueError(
+        message = (
             "The file path and header information don't match",
-            [str(ifq) for ifq in illumina_fastqs if not ifq.check_fp_vs_content()[0]],
+            [
+                str(ifq)
+                for ifq, ok in zip(illumina_fastqs, fp_vs_content_results)
+                if not ok
+            ],
         )
-    if not all([ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]):
-        raise ValueError(
-            "File seems suspiciously small. Please check if you have the correct file or lower the minimum file size threshold",
-            [ifq.check_file_size(min_file_size) for ifq in illumina_fastqs],
+        if allow_check_failures:
+            warnings.warn(f"{message[0]}: {message[1]}")
+        else:
+            raise ValueError(*message)
+    file_size_results = [ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]
+    if not all(file_size_results):
+        message = (
+            "File seems suspiciously small. Please check if you have the correct file or"
+            " lower the minimum file size threshold",
+            file_size_results,
         )
+        if allow_check_failures:
+            warnings.warn(f"{message[0]}: {message[1]}")
+        else:
+            raise ValueError(*message)
     if not all([ifq.check_index_read_exists() for ifq in illumina_fastqs]):
         warnings.warn(
             "No barcodes in headers. Were the fastq files generated properly?"
         )
 
     # parse the info from the headers in EACH file and check they are consistent within each other
-    if not all([fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs]):
-        raise ValueError("The files are not from the same run.")
+    same_run_results = [
+        fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs
+    ]
+    if not all(same_run_results):
+        message = "The files are not from the same run."
+        if allow_check_failures:
+            warnings.warn(message)
+        else:
+            raise ValueError(message)
 
     ## Archiving steps
 
@@ -144,13 +167,19 @@ def main(argv=None):
         default=DEFAULT_MIN_FILE_SIZE,
         help="Minimum file size to register in bytes",
     )
+    parser.add_argument(
+        "--allow-check-failures",
+        action="store_true",
+        help="Continue archiving even if validation checks fail",
+    )
     args = parser.parse_args(argv)
     return backup_fastq(
         args.forward_reads,
         args.destination_dir,
         args.sample_sheet,
         not args.no_index,
         args.min_file_size,
+        args.allow_check_failures,
     )
 
     # maybe also ask for single or double reads
diff --git a/test/test_backup.py b/test/test_backup.py
@@ -1,11 +1,14 @@
 import pytest
 from pathlib import Path
-from seqBackupLib.backup import (
-    backup_fastq,
-    build_fp_to_archive,
-    return_md5,
-    main,
-)
+import gzip
+from seqBackupLib.backup import backup_fastq, build_fp_to_archive, return_md5, main
+
+
+def _write_fastq(fp: Path, header: str) -> None:  # Test helper: write one gzipped four-line FASTQ record to fp.
+    sequence = "N" * 10  # ten placeholder bases
+    content = f"{header}\n{sequence}\n+\n{'#' * len(sequence)}\n"  # header, bases, "+" separator, one '#' quality char per base
+    with gzip.open(fp, "wt") as handle:  # "wt" = gzip-compressed text mode
+        handle.write(content)
 
 
 def test_build_fp_to_archive():
@@ -132,3 +135,47 @@ def test_main_returns_archive_path(tmp_path, full_miseq_dir):
     expected_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001"
     assert out_dir == expected_dir
     assert expected_dir.is_dir()
+
+
+def test_allow_check_failures_continues_archive(tmp_path):  # backup_fastq raises by default, but warns and still archives with allow_check_failures=True.
+    run_dir = tmp_path / "240101_M01234_0001_ABCDEFGX"  # synthetic run directory under pytest's tmp_path
+    run_dir.mkdir(parents=True, exist_ok=True)
+    sample_sheet_fp = run_dir / "sample_sheet.csv"
+    sample_sheet_fp.write_text(  # minimal sample sheet with one sample row
+        "[Header]\nIEMFileVersion,4\n[Data]\nSample_ID,Sample_Name\nS1,S1\n"
+    )
+
+    header = "@M01234:1:ZZZZZZ:1:1101:10000:10000 1:N:0:ATCACG"  # NOTE(review): presumably "ZZZZZZ" disagrees with "ABCDEFGX" in the directory name and trips the path-vs-header check; confirm against IlluminaFastq.check_fp_vs_content
+    for name in [  # write the same single-record FASTQ for R1, R2, I1 and I2
+        "Undetermined_S0_L001_R1_001.fastq.gz",
+        "Undetermined_S0_L001_R2_001.fastq.gz",
+        "Undetermined_S0_L001_I1_001.fastq.gz",
+        "Undetermined_S0_L001_I2_001.fastq.gz",
+    ]:
+        _write_fastq(run_dir / name, header)
+
+    raw = tmp_path / "raw_reads"  # destination directory passed to backup_fastq
+    raw.mkdir(parents=True, exist_ok=True)
+
+    with pytest.raises(ValueError, match="header information don't match"):  # default (allow_check_failures omitted): the mismatch must raise
+        backup_fastq(
+            run_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
+            raw,
+            sample_sheet_fp,
+            True,  # has_index
+            1,  # min_file_size in bytes, low enough that the tiny fixtures pass the size check
+        )
+
+    with pytest.warns(UserWarning, match="header information don't match"):  # same inputs with the flag set: a warning instead of an exception
+        out_dir = backup_fastq(
+            run_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
+            raw,
+            sample_sheet_fp,
+            True,  # has_index
+            1,  # min_file_size in bytes
+            allow_check_failures=True,
+        )
+
+    assert out_dir.is_dir()  # the call returned an existing directory, so archiving went ahead
+    md5_fp = out_dir / f"{out_dir.name}.md5"  # checksum file named after the returned directory
+    assert md5_fp.is_file()