Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 38 additions & 9 deletions seqBackupLib/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def backup_fastq(
sample_sheet_fp: Path,
has_index: bool,
min_file_size: int,
allow_check_failures: bool = False,
):

R1 = IlluminaFastq(gzip.open(forward_reads, mode="rt"))
Expand All @@ -58,25 +59,47 @@ def backup_fastq(
illumina_fastqs = [IlluminaFastq(gzip.open(fp, mode="rt")) for fp in RI_fps]
r1 = illumina_fastqs[0]

if not all([ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]):
fp_vs_content_results = [ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]
if not all(fp_vs_content_results):
[ifq.check_fp_vs_content(verbose=True) for ifq in illumina_fastqs]
raise ValueError(
message = (
"The file path and header information don't match",
[str(ifq) for ifq in illumina_fastqs if not ifq.check_fp_vs_content()[0]],
[
str(ifq)
for ifq, ok in zip(illumina_fastqs, fp_vs_content_results)
if not ok
],
)
if not all([ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]):
raise ValueError(
"File seems suspiciously small. Please check if you have the correct file or lower the minimum file size threshold",
[ifq.check_file_size(min_file_size) for ifq in illumina_fastqs],
if allow_check_failures:
warnings.warn(f"{message[0]}: {message[1]}")
else:
raise ValueError(*message)
file_size_results = [ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]
if not all(file_size_results):
message = (
"File seems suspiciously small. Please check if you have the correct file or"
" lower the minimum file size threshold",
file_size_results,
)
if allow_check_failures:
warnings.warn(f"{message[0]}: {message[1]}")
else:
raise ValueError(*message)
if not all([ifq.check_index_read_exists() for ifq in illumina_fastqs]):
warnings.warn(
"No barcodes in headers. Were the fastq files generated properly?"
)

# Parse the info from the headers in EACH file and check that they are consistent with each other
if not all([fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs]):
raise ValueError("The files are not from the same run.")
same_run_results = [
fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs
]
if not all(same_run_results):
message = "The files are not from the same run."
if allow_check_failures:
warnings.warn(message)
else:
raise ValueError(message)

## Archiving steps

Expand Down Expand Up @@ -144,13 +167,19 @@ def main(argv=None):
default=DEFAULT_MIN_FILE_SIZE,
help="Minimum file size to register in bytes",
)
parser.add_argument(
"--allow-check-failures",
action="store_true",
help="Continue archiving even if validation checks fail",
)
args = parser.parse_args(argv)
return backup_fastq(
args.forward_reads,
args.destination_dir,
args.sample_sheet,
not args.no_index,
args.min_file_size,
args.allow_check_failures,
)

# maybe also ask for single or double reads
59 changes: 53 additions & 6 deletions test/test_backup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import pytest
from pathlib import Path
from seqBackupLib.backup import (
backup_fastq,
build_fp_to_archive,
return_md5,
main,
)
import gzip
from seqBackupLib.backup import backup_fastq, build_fp_to_archive, return_md5, main


def _write_fastq(fp: Path, header: str) -> None:
    """Write a minimal, single-record gzipped FASTQ file (test fixture helper).

    The record is made of ``header``, a 10-character all-``N`` sequence line,
    a ``+`` separator line, and a quality line of ``#`` characters with the
    same length as the sequence.

    Args:
        fp: Destination path of the gzip file; it is created or overwritten.
        header: First line of the record, written verbatim (no validation).
    """
    sequence = "N" * 10
    quality = "#" * len(sequence)
    record = f"{header}\n{sequence}\n+\n{quality}\n"
    # Pin the encoding and disable newline translation so the bytes on disk
    # do not depend on the locale or on os.linesep of the machine running
    # the tests.
    with gzip.open(fp, "wt", encoding="utf-8", newline="\n") as handle:
        handle.write(record)


def test_build_fp_to_archive():
Expand Down Expand Up @@ -132,3 +135,47 @@ def test_main_returns_archive_path(tmp_path, full_miseq_dir):
expected_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001"
assert out_dir == expected_dir
assert expected_dir.is_dir()


def test_allow_check_failures_continues_archive(tmp_path):
    """Check that ``allow_check_failures`` downgrades a failed check to a warning.

    Four gzipped FASTQ files (R1, R2, I1, I2) sharing one header are written
    into a run directory together with a small sample sheet. The test expects
    the default call to raise ``ValueError`` mentioning the path/header
    mismatch, and the call with ``allow_check_failures=True`` to emit a
    ``UserWarning`` with the same text, return an existing directory, and
    leave an ``<archive name>.md5`` file inside it.
    """
    # Destination root that backup_fastq is asked to archive into.
    archive_root = tmp_path / "raw_reads"
    archive_root.mkdir(parents=True, exist_ok=True)

    # Source run directory with a sample sheet and the four read files.
    source_dir = tmp_path / "240101_M01234_0001_ABCDEFGX"
    source_dir.mkdir(parents=True, exist_ok=True)

    sheet_fp = source_dir / "sample_sheet.csv"
    sheet_fp.write_text(
        "[Header]\nIEMFileVersion,4\n[Data]\nSample_ID,Sample_Name\nS1,S1\n"
    )

    # NOTE(review): this header is presumably chosen so that it disagrees with
    # the run directory name; the assertions below rely on that mismatch.
    shared_header = "@M01234:1:ZZZZZZ:1:1101:10000:10000 1:N:0:ATCACG"
    fastq_names = (
        "Undetermined_S0_L001_R1_001.fastq.gz",
        "Undetermined_S0_L001_R2_001.fastq.gz",
        "Undetermined_S0_L001_I1_001.fastq.gz",
        "Undetermined_S0_L001_I2_001.fastq.gz",
    )
    for fastq_name in fastq_names:
        _write_fastq(source_dir / fastq_name, shared_header)

    # Positional arguments shared by both calls:
    # forward reads, destination dir, sample sheet, has_index, min_file_size.
    positional_args = (
        source_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
        archive_root,
        sheet_fp,
        True,
        1,
    )

    # Default behaviour: the failed check aborts with an exception.
    with pytest.raises(ValueError, match="header information don't match"):
        backup_fastq(*positional_args)

    # Opt-in behaviour: the same failure is only reported as a warning.
    with pytest.warns(UserWarning, match="header information don't match"):
        archived_dir = backup_fastq(*positional_args, allow_check_failures=True)

    assert archived_dir.is_dir()
    checksum_fp = archived_dir / f"{archived_dir.name}.md5"
    assert checksum_fp.is_file()