From 05969b99d4b96e54aec48cc94658d4b51cb43a4d Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 14 May 2025 14:12:07 -0400 Subject: [PATCH 1/2] Separate IlluminaDir logic --- seqBackupLib/illumina.py | 119 +++++++++++++++++++++------------------ test/test_illumina.py | 6 +- 2 files changed, 67 insertions(+), 58 deletions(-) diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index bc0a913..450315c 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -3,51 +3,24 @@ from pathlib import Path -class IlluminaFastq: - MACHINE_TYPES = { - "VH": "Illumina-NextSeq", - "D": "Illumina-HiSeq", - "M": "Illumina-MiSeq", - "A": "Illumina-NovaSeq", - "NB": "Illumina-MiniSeq", - "LH": "Illumina-NovaSeqX", - } +MACHINE_TYPES = { + "VH": "Illumina-NextSeq", + "D": "Illumina-HiSeq", + "M": "Illumina-MiSeq", + "A": "Illumina-NovaSeq", + "NB": "Illumina-MiniSeq", + "LH": "Illumina-NovaSeqX", +} - def __init__(self, f: TextIOWrapper): - self.file = f - self.fastq_info = self._parse_header() - self.folder_info = self._parse_folder() - def __str__(self): - return "_".join( - [ - self.fastq_info["instrument"], - self.fastq_info["run_number"], - self.fastq_info["flowcell_id"], - self.fastq_info["lane"], - ] - ) +def extract_instrument_code(instrument: str) -> str: + return "".join(filter(lambda x: not x.isdigit(), instrument)) - def is_same_run(self, other: "IlluminaFastq") -> bool: - keys = ["run_number", "instrument", "flowcell_id"] - return all(self.fastq_info[k] == other.fastq_info[k] for k in keys) - def _parse_header(self) -> dict[str, str]: - line = next(self.file).strip() - if not line.startswith("@"): - raise ValueError("Not a FASTQ header line") - # Remove first character, @ - line = line[1:] - word1, _, word2 = line.partition(" ") - - keys1 = ("instrument", "run_number", "flowcell_id", "lane") - vals1 = dict((k, v) for k, v in zip(keys1, word1.split(":"))) - - keys2 = ("read", "is_filtered", "control_number", "index_reads") - vals2 = dict((k, v) for k, v in zip(keys2, word2.split(":"))) - - vals1.update(vals2) - return vals1 +class IlluminaDir: + def __init__(self, run_name: str): + self.run_name = run_name + self.folder_info = self._parse_folder() def _parse_folder(self) -> dict[str, str]: # Extract directory name info @@ -62,8 +35,9 @@ def _parse_folder(self) -> dict[str, str]: raise ValueError(f"Invalid date format in run name: {date}") instrument = parts[1] - if self._extract_instrument_code(instrument) not in self.MACHINE_TYPES: + if extract_instrument_code(instrument) not in MACHINE_TYPES: raise ValueError(f"Invalid instrument code in run name: {instrument}") + self.machine_type = MACHINE_TYPES[extract_instrument_code(instrument)] run_number = parts[2] if not run_number.isdigit(): @@ -89,6 +63,48 @@ def _parse_folder(self) -> dict[str, str]: ): vals1["flowcell_id"] = vals1["flowcell_id"][1:] + return vals1 + + +class IlluminaFastq: + def __init__(self, f: TextIOWrapper): + self.file = f + self.fastq_info = self._parse_header() + self.folder_info = IlluminaDir(self.run_name).folder_info + self.folder_info.update(self._parse_fastq_file()) + + def __str__(self): + return "_".join( + [ + self.fastq_info["instrument"], + self.fastq_info["run_number"], + self.fastq_info["flowcell_id"], + self.fastq_info["lane"], + ] + ) + + def is_same_run(self, other: "IlluminaFastq") -> bool: + keys = ["run_number", "instrument", "flowcell_id"] + return all(self.fastq_info[k] == other.fastq_info[k] for k in keys) + + def _parse_header(self) -> dict[str, str]: + line = next(self.file).strip() + if not line.startswith("@"): + raise ValueError("Not a FASTQ header line") + # Remove first character, @ + line = line[1:] + word1, _, word2 = line.partition(" ") + + keys1 = ("instrument", "run_number", "flowcell_id", "lane") + vals1 = dict((k, v) for k, v in zip(keys1, word1.split(":"))) + + keys2 = ("read", "is_filtered", "control_number", "index_reads") + vals2 = dict((k, v) for k, v in zip(keys2, word2.split(":"))) + + vals1.update(vals2) + return vals1 + + def _parse_fastq_file(self) -> dict[str, str]: # Extract file name info matches = re.match( "Undetermined_S0_L00([1-8])_([RI])([12])_001.fastq.gz", self.filepath.name @@ -96,18 +112,7 @@ def _parse_folder(self) -> dict[str, str]: keys2 = ("lane", "read_or_index", "read") vals2 = dict((k, v) for k, v in zip(keys2, matches.groups())) - vals1.update(vals2) - return vals1 - - @staticmethod - def _extract_instrument_code(instrument: str) -> str: - return "".join(filter(lambda x: not x.isdigit(), instrument)) - - @property - def machine_type(self): - return self.MACHINE_TYPES[ - self._extract_instrument_code(self.fastq_info["instrument"]) - ] + return vals2 @property def lane(self) -> str: @@ -117,6 +122,10 @@ def lane(self) -> str: def filepath(self) -> Path: return Path(self.file.name) + @property + def machine_type(self) -> str: + return MACHINE_TYPES[extract_instrument_code(self.fastq_info["instrument"])] + @property def run_name(self) -> str: for part in self.filepath.parts: @@ -124,7 +133,7 @@ def run_name(self) -> str: if ( len(segments) >= 4 and segments[0].isdigit() - and self._extract_instrument_code(segments[1]) in self.MACHINE_TYPES + and extract_instrument_code(segments[1]) in MACHINE_TYPES and segments[2].isdigit() ): return part diff --git a/test/test_illumina.py b/test/test_illumina.py index 6aa7939..10257a7 100644 --- a/test/test_illumina.py +++ b/test/test_illumina.py @@ -2,7 +2,7 @@ import pytest from pathlib import Path from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE -from seqBackupLib.illumina import IlluminaFastq +from seqBackupLib.illumina import IlluminaFastq, MACHINE_TYPES machine_fixtures = { @@ -15,7 +15,7 @@ } -@pytest.mark.parametrize("machine_type", IlluminaFastq.MACHINE_TYPES.keys()) +@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys()) def test_illumina_fastq(machine_type, request): fixture_name = machine_fixtures.get(machine_type) if not fixture_name: @@ -29,7 +29,7 @@ def test_illumina_fastq(machine_type, request): r1 = IlluminaFastq(f) print("FASTQ info: ", r1.fastq_info, "\nFolder info: ", r1.folder_info) - assert r1.machine_type == IlluminaFastq.MACHINE_TYPES[machine_type] + assert r1.machine_type == MACHINE_TYPES[machine_type] assert r1.check_fp_vs_content()[0], r1.check_fp_vs_content() assert not r1.check_file_size(DEFAULT_MIN_FILE_SIZE) assert r1.check_file_size(100) From 069e1bdeb8d7299656cb21c1cbb46c927f4de96b Mon Sep 17 00:00:00 2001 From: Ulthran Date: Wed, 14 May 2025 14:20:39 -0400 Subject: [PATCH 2/2] Add testing --- test/test_illumina.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/test_illumina.py b/test/test_illumina.py index 10257a7..7dfa36a 100644 --- a/test/test_illumina.py +++ b/test/test_illumina.py @@ -2,7 +2,7 @@ import pytest from pathlib import Path from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE -from seqBackupLib.illumina import IlluminaFastq, MACHINE_TYPES +from seqBackupLib.illumina import IlluminaDir, IlluminaFastq, MACHINE_TYPES machine_fixtures = { @@ -34,3 +34,16 @@ def test_illumina_fastq(machine_type, request): assert not r1.check_file_size(DEFAULT_MIN_FILE_SIZE) assert r1.check_file_size(100) assert r1.check_index_read_exists() + + +@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys()) +def test_illumina_dir(machine_type, request): + fixture_name = machine_fixtures.get(machine_type) + if not fixture_name: + raise ValueError( + f"All supported machine types must be tested. Missing: {machine_type}" + ) + + fp = request.getfixturevalue(fixture_name) + + d = IlluminaDir(fp.name)