From 17712cec2e5ad1a688c9131c55f77d7eae101da1 Mon Sep 17 00:00:00 2001 From: Charlie Date: Wed, 20 Aug 2025 14:28:52 -0400 Subject: [PATCH 1/3] Handle lane-less Undetermined fastq files --- seqBackupLib/backup.py | 16 ++++++++++++---- seqBackupLib/illumina.py | 17 +++++++++++++---- test/test_backup.py | 31 +++++++++++++++++++++++++++++++ test/test_illumina.py | 10 ++++++++++ 4 files changed, 66 insertions(+), 8 deletions(-) diff --git a/seqBackupLib/backup.py b/seqBackupLib/backup.py index 920f206..0b4da75 100644 --- a/seqBackupLib/backup.py +++ b/seqBackupLib/backup.py @@ -22,8 +22,11 @@ def build_fp_to_archive(fp: Path, has_index: bool, lane: str) -> list[Path]: if has_index: label.extend(["I1", "I2"]) - rexp = "".join(["(L00", lane, "_)(R1)(_001.fastq.gz)$"]) - modified_fp = [re.sub(rexp, "".join(["\\1", lab, "\\3"]), fp.name) for lab in label] + if "_L" in fp.name: + rexp = "".join(["(L00", lane, "_)(R1)(_001.fastq.gz)$"]) + modified_fp = [re.sub(rexp, "".join(["\\1", lab, "\\3"]), fp.name) for lab in label] + else: + modified_fp = [fp.name.replace("R1", lab) for lab in label] return [fp] + [fp.parent / n for n in modified_fp] @@ -87,16 +90,21 @@ def backup_fastq( # move the files to the archive location and remove permission permission = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH + md5s = [] for fp in RI_fps: - output_fp = write_dir / fp.name + if "_L" in fp.name: + dest_name = fp.name + else: + dest_name = fp.name.replace("_S0_", f"_S0_L{r1.lane.zfill(3)}_") + output_fp = write_dir / dest_name shutil.copyfile(fp, output_fp) output_fp.chmod(permission) + md5s.append((dest_name, return_md5(fp))) # copy the sample sheet to destination folder shutil.copyfile(sample_sheet_fp, write_dir / sample_sheet_fp.name) # write md5sums to a file - md5s = [(fp.name, return_md5(fp)) for fp in RI_fps] md5_out_fp = write_dir / ".".join([r1.build_archive_dir(), "md5"]) with open(md5_out_fp, "w") as md5_out: [md5_out.write("\t".join(md5) + "\n") for md5 in md5s] diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index b816262..aa02b6b 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -106,10 +106,19 @@ def _parse_fastq_file(self) -> dict[str, str]: matches = re.match( "Undetermined_S0_L00([1-8])_([RI])([12])_001.fastq.gz", self.filepath.name ) - keys2 = ("lane", "read_or_index", "read") - vals2 = dict((k, v) for k, v in zip(keys2, matches.groups())) - - return vals2 + if matches: + keys2 = ("lane", "read_or_index", "read") + return dict((k, v) for k, v in zip(keys2, matches.groups())) + + matches = re.match("Undetermined_S0_([RI])([12])_001.fastq.gz", self.filepath.name) + if matches: + return { + "lane": self.fastq_info["lane"], + "read_or_index": matches.group(1), + "read": matches.group(2), + } + + raise ValueError(f"Unexpected FASTQ file name: {self.filepath.name}") @property def lane(self) -> str: diff --git a/test/test_backup.py b/test/test_backup.py index 4645a71..c3d9ee3 100644 --- a/test/test_backup.py +++ b/test/test_backup.py @@ -35,6 +35,14 @@ def test_build_fp_to_archive(): with pytest.raises(IOError): build_fp_to_archive(Path("Undetermined_S0_L001_R2_001.fastq.gz"), True, "1") + archive = build_fp_to_archive(Path("Undetermined_S0_R1_001.fastq.gz"), True, "1") + assert archive == [ + Path("Undetermined_S0_R1_001.fastq.gz"), + Path("Undetermined_S0_R2_001.fastq.gz"), + Path("Undetermined_S0_I1_001.fastq.gz"), + Path("Undetermined_S0_I2_001.fastq.gz"), + ] + def test_return_md5(tmp_path): test_file = tmp_path / "test.txt" @@ -73,3 +81,26 @@ def test_backup_fastq(tmp_path, full_miseq_dir): True, 100, ) + + +def test_backup_fastq_without_lane(tmp_path, full_miseq_dir): + raw = tmp_path / "raw_reads" + raw.mkdir(parents=True, exist_ok=True) + sample_sheet_fp = full_miseq_dir / "sample_sheet.csv" + + for lab in ["R1", "R2", "I1", "I2"]: + (full_miseq_dir / f"Undetermined_S0_L001_{lab}_001.fastq.gz").rename( + full_miseq_dir / f"Undetermined_S0_{lab}_001.fastq.gz" + ) + + backup_fastq( + full_miseq_dir / "Undetermined_S0_R1_001.fastq.gz", + raw, + sample_sheet_fp, + True, + 100, + ) + + out_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001" + assert (out_dir / "Undetermined_S0_L001_R1_001.fastq.gz").is_file() + assert (out_dir / "Undetermined_S0_L001_R2_001.fastq.gz").is_file() diff --git a/test/test_illumina.py b/test/test_illumina.py index 0abe92a..ada9adc 100644 --- a/test/test_illumina.py +++ b/test/test_illumina.py @@ -48,3 +48,13 @@ def test_illumina_dir(machine_type, request): fp = request.getfixturevalue(fixture_name) d = IlluminaDir(fp.name) + + +def test_illumina_fastq_without_lane(novaseq_dir): + original = novaseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz" + renamed = novaseq_dir / "Undetermined_S0_R1_001.fastq.gz" + original.rename(renamed) + with gzip.open(renamed, "rt") as f: + r1 = IlluminaFastq(f) + assert r1.check_fp_vs_content()[0] + assert r1.build_archive_dir().endswith("L001") From 41e351a7aa863b75fbed06b5c86a15f94c465d81 Mon Sep 17 00:00:00 2001 From: Charlie Date: Wed, 20 Aug 2025 14:55:05 -0400 Subject: [PATCH 2/3] refactor fastq regex parsing --- seqBackupLib/illumina.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index aa02b6b..2415bbf 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -103,22 +103,21 @@ def _parse_header(self) -> dict[str, str]: def _parse_fastq_file(self) -> dict[str, str]: # Extract file name info - matches = re.match( - "Undetermined_S0_L00([1-8])_([RI])([12])_001.fastq.gz", self.filepath.name - ) - if matches: + filename = self.filepath.name + if matches := re.match( + "Undetermined_S0_L00([1-8])_([RI])([12])_001.fastq.gz", filename + ): keys2 = ("lane", "read_or_index", "read") - return dict((k, v) for k, v in zip(keys2, matches.groups())) + return dict(zip(keys2, matches.groups())) - matches = re.match("Undetermined_S0_([RI])([12])_001.fastq.gz", self.filepath.name) - if matches: + if matches := re.match("Undetermined_S0_([RI])([12])_001.fastq.gz", filename): return { "lane": self.fastq_info["lane"], "read_or_index": matches.group(1), "read": matches.group(2), } - raise ValueError(f"Unexpected FASTQ file name: {self.filepath.name}") + raise ValueError(f"Unexpected FASTQ file name: {filename}") @property def lane(self) -> str: From 37e9540866580774913e96474dabc1db311f3bdb Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 20 Aug 2025 14:56:38 -0400 Subject: [PATCH 3/3] Reformat --- seqBackupLib/backup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/seqBackupLib/backup.py b/seqBackupLib/backup.py index 0b4da75..5ab5d81 100644 --- a/seqBackupLib/backup.py +++ b/seqBackupLib/backup.py @@ -24,7 +24,9 @@ def build_fp_to_archive(fp: Path, has_index: bool, lane: str) -> list[Path]: if "_L" in fp.name: rexp = "".join(["(L00", lane, "_)(R1)(_001.fastq.gz)$"]) - modified_fp = [re.sub(rexp, "".join(["\\1", lab, "\\3"]), fp.name) for lab in label] + modified_fp = [ + re.sub(rexp, "".join(["\\1", lab, "\\3"]), fp.name) for lab in label + ] else: modified_fp = [fp.name.replace("R1", lab) for lab in label] return [fp] + [fp.parent / n for n in modified_fp]