From 05969b99d4b96e54aec48cc94658d4b51cb43a4d Mon Sep 17 00:00:00 2001
From: Ulthran <ctbushman@gmail.com>
Date: Wed, 14 May 2025 14:12:07 -0400
Subject: [PATCH 1/2] Separate IlluminaDir logic

---
 seqBackupLib/illumina.py | 119 +++++++++++++++++++++------------------
 test/test_illumina.py    |   6 +-
 2 files changed, 67 insertions(+), 58 deletions(-)

diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py
index bc0a913..450315c 100644
--- a/seqBackupLib/illumina.py
+++ b/seqBackupLib/illumina.py
@@ -3,51 +3,24 @@
 from pathlib import Path
 
 
-class IlluminaFastq:
-    MACHINE_TYPES = {
-        "VH": "Illumina-NextSeq",
-        "D": "Illumina-HiSeq",
-        "M": "Illumina-MiSeq",
-        "A": "Illumina-NovaSeq",
-        "NB": "Illumina-MiniSeq",
-        "LH": "Illumina-NovaSeqX",
-    }
+MACHINE_TYPES = {
+    "VH": "Illumina-NextSeq",
+    "D": "Illumina-HiSeq",
+    "M": "Illumina-MiSeq",
+    "A": "Illumina-NovaSeq",
+    "NB": "Illumina-MiniSeq",
+    "LH": "Illumina-NovaSeqX",
+}
 
-    def __init__(self, f: TextIOWrapper):
-        self.file = f
-        self.fastq_info = self._parse_header()
-        self.folder_info = self._parse_folder()
 
-    def __str__(self):
-        return "_".join(
-            [
-                self.fastq_info["instrument"],
-                self.fastq_info["run_number"],
-                self.fastq_info["flowcell_id"],
-                self.fastq_info["lane"],
-            ]
-        )
+def extract_instrument_code(instrument: str) -> str:
+    return "".join(ch for ch in instrument if not ch.isdigit())  # strip every digit
 
-    def is_same_run(self, other: "IlluminaFastq") -> bool:
-        keys = ["run_number", "instrument", "flowcell_id"]
-        return all(self.fastq_info[k] == other.fastq_info[k] for k in keys)
 
-    def _parse_header(self) -> dict[str, str]:
-        line = next(self.file).strip()
-        if not line.startswith("@"):
-            raise ValueError("Not a FASTQ header line")
-        # Remove first character, @
-        line = line[1:]
-        word1, _, word2 = line.partition(" ")
-
-        keys1 = ("instrument", "run_number", "flowcell_id", "lane")
-        vals1 = dict((k, v) for k, v in zip(keys1, word1.split(":")))
-
-        keys2 = ("read", "is_filtered", "control_number", "index_reads")
-        vals2 = dict((k, v) for k, v in zip(keys2, word2.split(":")))
-
-        vals1.update(vals2)
-        return vals1
+class IlluminaDir:
+    def __init__(self, run_name: str):
+        self.run_name = run_name
+        self.folder_info = self._parse_folder()
 
     def _parse_folder(self) -> dict[str, str]:
         # Extract directory name info
@@ -62,8 +35,9 @@ def _parse_folder(self) -> dict[str, str]:
             raise ValueError(f"Invalid date format in run name: {date}")
 
         instrument = parts[1]
-        if self._extract_instrument_code(instrument) not in self.MACHINE_TYPES:
+        if extract_instrument_code(instrument) not in MACHINE_TYPES:
             raise ValueError(f"Invalid instrument code in run name: {instrument}")
+        self.machine_type = MACHINE_TYPES[extract_instrument_code(instrument)]
 
         run_number = parts[2]
         if not run_number.isdigit():
@@ -89,6 +63,48 @@ def _parse_folder(self) -> dict[str, str]:
         ):
             vals1["flowcell_id"] = vals1["flowcell_id"][1:]
 
+        return vals1
+
+
+class IlluminaFastq:
+    def __init__(self, f: TextIOWrapper):
+        self.file = f
+        self.fastq_info = self._parse_header()
+        self.folder_info = IlluminaDir(self.run_name).folder_info
+        self.folder_info.update(self._parse_fastq_file())
+
+    def __str__(self):
+        return "_".join(
+            [
+                self.fastq_info["instrument"],
+                self.fastq_info["run_number"],
+                self.fastq_info["flowcell_id"],
+                self.fastq_info["lane"],
+            ]
+        )
+
+    def is_same_run(self, other: "IlluminaFastq") -> bool:
+        keys = ["run_number", "instrument", "flowcell_id"]
+        return all(self.fastq_info[k] == other.fastq_info[k] for k in keys)
+
+    def _parse_header(self) -> dict[str, str]:
+        line = next(self.file).strip()
+        if not line.startswith("@"):
+            raise ValueError("Not a FASTQ header line")
+        # Remove first character, @
+        line = line[1:]
+        word1, _, word2 = line.partition(" ")
+
+        keys1 = ("instrument", "run_number", "flowcell_id", "lane")
+        vals1 = dict((k, v) for k, v in zip(keys1, word1.split(":")))
+
+        keys2 = ("read", "is_filtered", "control_number", "index_reads")
+        vals2 = dict((k, v) for k, v in zip(keys2, word2.split(":")))
+
+        vals1.update(vals2)
+        return vals1
+
+    def _parse_fastq_file(self) -> dict[str, str]:
         # Extract file name info
         matches = re.match(
             "Undetermined_S0_L00([1-8])_([RI])([12])_001.fastq.gz", self.filepath.name
@@ -96,18 +112,7 @@ def _parse_folder(self) -> dict[str, str]:
         keys2 = ("lane", "read_or_index", "read")
         vals2 = dict((k, v) for k, v in zip(keys2, matches.groups()))
 
-        vals1.update(vals2)
-        return vals1
-
-    @staticmethod
-    def _extract_instrument_code(instrument: str) -> str:
-        return "".join(filter(lambda x: not x.isdigit(), instrument))
-
-    @property
-    def machine_type(self):
-        return self.MACHINE_TYPES[
-            self._extract_instrument_code(self.fastq_info["instrument"])
-        ]
+        return vals2
 
     @property
     def lane(self) -> str:
@@ -117,6 +122,10 @@ def lane(self) -> str:
     def filepath(self) -> Path:
         return Path(self.file.name)
 
+    @property
+    def machine_type(self) -> str:
+        return MACHINE_TYPES[extract_instrument_code(self.fastq_info["instrument"])]
+
     @property
     def run_name(self) -> str:
         for part in self.filepath.parts:
@@ -124,7 +133,7 @@ def run_name(self) -> str:
             if (
                 len(segments) >= 4
                 and segments[0].isdigit()
-                and self._extract_instrument_code(segments[1]) in self.MACHINE_TYPES
+                and extract_instrument_code(segments[1]) in MACHINE_TYPES
                 and segments[2].isdigit()
             ):
                 return part
diff --git a/test/test_illumina.py b/test/test_illumina.py
index 6aa7939..10257a7 100644
--- a/test/test_illumina.py
+++ b/test/test_illumina.py
@@ -2,7 +2,7 @@
 import pytest
 from pathlib import Path
 from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE
-from seqBackupLib.illumina import IlluminaFastq
+from seqBackupLib.illumina import IlluminaFastq, MACHINE_TYPES
 
 
 machine_fixtures = {
@@ -15,7 +15,7 @@
 }
 
 
-@pytest.mark.parametrize("machine_type", IlluminaFastq.MACHINE_TYPES.keys())
+@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
 def test_illumina_fastq(machine_type, request):
     fixture_name = machine_fixtures.get(machine_type)
     if not fixture_name:
@@ -29,7 +29,7 @@ def test_illumina_fastq(machine_type, request):
         r1 = IlluminaFastq(f)
 
     print("FASTQ info: ", r1.fastq_info, "\nFolder info: ", r1.folder_info)
-    assert r1.machine_type == IlluminaFastq.MACHINE_TYPES[machine_type]
+    assert r1.machine_type == MACHINE_TYPES[machine_type]
     assert r1.check_fp_vs_content()[0], r1.check_fp_vs_content()
     assert not r1.check_file_size(DEFAULT_MIN_FILE_SIZE)
     assert r1.check_file_size(100)

From 069e1bdeb8d7299656cb21c1cbb46c927f4de96b Mon Sep 17 00:00:00 2001
From: Ulthran <ctbushman@gmail.com>
Date: Wed, 14 May 2025 14:20:39 -0400
Subject: [PATCH 2/2] Add testing

---
 test/test_illumina.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/test/test_illumina.py b/test/test_illumina.py
index 10257a7..7dfa36a 100644
--- a/test/test_illumina.py
+++ b/test/test_illumina.py
@@ -2,7 +2,7 @@
 import pytest
 from pathlib import Path
 from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE
-from seqBackupLib.illumina import IlluminaFastq, MACHINE_TYPES
+from seqBackupLib.illumina import IlluminaDir, IlluminaFastq, MACHINE_TYPES
 
 
 machine_fixtures = {
@@ -34,3 +34,16 @@ def test_illumina_fastq(machine_type, request):
     assert not r1.check_file_size(DEFAULT_MIN_FILE_SIZE)
     assert r1.check_file_size(100)
     assert r1.check_index_read_exists()
+
+
+@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
+def test_illumina_dir(machine_type, request):
+    """Each fixture's run name parses into the expected machine type."""
+    fixture_name = machine_fixtures.get(machine_type)
+    if not fixture_name:
+        raise ValueError(
+            f"All supported machine types must be tested. Missing: {machine_type}"
+        )
+
+    fp = request.getfixturevalue(fixture_name)
+    assert IlluminaDir(fp.name).machine_type == MACHINE_TYPES[machine_type]