diff --git a/seqBackupLib/backup.py b/seqBackupLib/backup.py
index f9745a0..55c4b6c 100644
--- a/seqBackupLib/backup.py
+++ b/seqBackupLib/backup.py
@@ -47,6 +47,7 @@ def backup_fastq(
     sample_sheet_fp: Path,
     has_index: bool,
     min_file_size: int,
+    allow_check_failures: bool = False,
 ):
     R1 = IlluminaFastq(gzip.open(forward_reads, mode="rt"))
 
@@ -58,25 +59,47 @@
     illumina_fastqs = [IlluminaFastq(gzip.open(fp, mode="rt")) for fp in RI_fps]
     r1 = illumina_fastqs[0]
 
-    if not all([ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]):
+    fp_vs_content_results = [ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]
+    if not all(fp_vs_content_results):
         [ifq.check_fp_vs_content(verbose=True) for ifq in illumina_fastqs]
-        raise ValueError(
+        message = (
             "The file path and header information don't match",
-            [str(ifq) for ifq in illumina_fastqs if not ifq.check_fp_vs_content()[0]],
+            [
+                str(ifq)
+                for ifq, ok in zip(illumina_fastqs, fp_vs_content_results)
+                if not ok
+            ],
         )
-    if not all([ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]):
-        raise ValueError(
-            "File seems suspiciously small. Please check if you have the correct file or lower the minimum file size threshold",
-            [ifq.check_file_size(min_file_size) for ifq in illumina_fastqs],
+        if allow_check_failures:
+            warnings.warn(f"{message[0]}: {message[1]}")
+        else:
+            raise ValueError(*message)
+    file_size_results = [ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]
+    if not all(file_size_results):
+        message = (
+            "File seems suspiciously small. Please check if you have the correct file or"
+            " lower the minimum file size threshold",
+            file_size_results,
         )
+        if allow_check_failures:
+            warnings.warn(f"{message[0]}: {message[1]}")
+        else:
+            raise ValueError(*message)
 
     if not all([ifq.check_index_read_exists() for ifq in illumina_fastqs]):
         warnings.warn(
             "No barcodes in headers. Were the fastq files generated properly?"
         )
     # parse the info from the headers in EACH file and check they are consistent within each other
-    if not all([fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs]):
-        raise ValueError("The files are not from the same run.")
+    same_run_results = [
+        fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs
+    ]
+    if not all(same_run_results):
+        message = "The files are not from the same run."
+        if allow_check_failures:
+            warnings.warn(message)
+        else:
+            raise ValueError(message)
 
     ## Archiving steps
 
@@ -144,6 +167,11 @@ def main(argv=None):
         default=DEFAULT_MIN_FILE_SIZE,
         help="Minimum file size to register in bytes",
     )
+    parser.add_argument(
+        "--allow-check-failures",
+        action="store_true",
+        help="Continue archiving even if validation checks fail",
+    )
     args = parser.parse_args(argv)
     return backup_fastq(
         args.forward_reads,
@@ -151,6 +179,7 @@
         args.sample_sheet,
         not args.no_index,
         args.min_file_size,
+        args.allow_check_failures,
     )
     # maybe also ask for single or double reads
 
diff --git a/test/test_backup.py b/test/test_backup.py
index b571898..8ec9846 100644
--- a/test/test_backup.py
+++ b/test/test_backup.py
@@ -1,11 +1,14 @@
 import pytest
 from pathlib import Path
-from seqBackupLib.backup import (
-    backup_fastq,
-    build_fp_to_archive,
-    return_md5,
-    main,
-)
+import gzip
+from seqBackupLib.backup import backup_fastq, build_fp_to_archive, return_md5, main
+
+
+def _write_fastq(fp: Path, header: str) -> None:
+    sequence = "N" * 10
+    content = f"{header}\n{sequence}\n+\n{'#' * len(sequence)}\n"
+    with gzip.open(fp, "wt") as handle:
+        handle.write(content)
 
 
 def test_build_fp_to_archive():
@@ -132,3 +135,47 @@ def test_main_returns_archive_path(tmp_path, full_miseq_dir):
     expected_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001"
     assert out_dir == expected_dir
     assert expected_dir.is_dir()
+
+
+def test_allow_check_failures_continues_archive(tmp_path):
+    run_dir = tmp_path / "240101_M01234_0001_ABCDEFGX"
+    run_dir.mkdir(parents=True, exist_ok=True)
+    sample_sheet_fp = run_dir / "sample_sheet.csv"
+    sample_sheet_fp.write_text(
+        "[Header]\nIEMFileVersion,4\n[Data]\nSample_ID,Sample_Name\nS1,S1\n"
+    )
+
+    header = "@M01234:1:ZZZZZZ:1:1101:10000:10000 1:N:0:ATCACG"
+    for name in [
+        "Undetermined_S0_L001_R1_001.fastq.gz",
+        "Undetermined_S0_L001_R2_001.fastq.gz",
+        "Undetermined_S0_L001_I1_001.fastq.gz",
+        "Undetermined_S0_L001_I2_001.fastq.gz",
+    ]:
+        _write_fastq(run_dir / name, header)
+
+    raw = tmp_path / "raw_reads"
+    raw.mkdir(parents=True, exist_ok=True)
+
+    with pytest.raises(ValueError, match="header information don't match"):
+        backup_fastq(
+            run_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
+            raw,
+            sample_sheet_fp,
+            True,
+            1,
+        )
+
+    with pytest.warns(UserWarning, match="header information don't match"):
+        out_dir = backup_fastq(
+            run_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
+            raw,
+            sample_sheet_fp,
+            True,
+            1,
+            allow_check_failures=True,
+        )
+
+    assert out_dir.is_dir()
+    md5_fp = out_dir / f"{out_dir.name}.md5"
+    assert md5_fp.is_file()