Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ homepage = "https://github.com/PennChopMicrobiomeProgram"

[project.scripts]
backup_illumina = "seqBackupLib.backup:main"
verify_backup = "seqBackupLib.backup:verify_main"

[tool.setuptools.packages.find]
where = ["."]
Expand Down
61 changes: 59 additions & 2 deletions seqBackupLib/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,10 @@ def backup_fastq(

### All the checks are done and the files are safe to archive!

# move the files to the archive location and remove permission
permission = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
# move the files to the archive location and set readable permissions
# keep the files writable by the owner to allow intentional updates or tests
# that simulate corruption
permission = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH
md5s = []
for fp in RI_fps:
if "_L" in fp.name:
Expand All @@ -114,6 +116,48 @@ def backup_fastq(
return write_dir


def verify_archive(archive_dir: Path) -> bool:
    """Verify the md5 checksums recorded for an archived run.

    Reads the tab-separated ``filename<TAB>md5`` manifest (``*.md5``) found in
    *archive_dir* and recomputes the md5 of every listed file. All problems are
    collected before raising so the operator sees the full picture in one pass.

    Parameters
    ----------
    archive_dir : Path
        Archive directory containing the md5 manifest and the archived reads.

    Returns
    -------
    bool
        True when every listed file exists and matches its recorded hash.

    Raises
    ------
    FileNotFoundError
        If no ``*.md5`` manifest exists in *archive_dir*.
    ValueError
        If a manifest line is malformed, or if any listed file is missing or
        has a mismatched hash.
    """
    md5_files = list(archive_dir.glob("*.md5"))
    if not md5_files:
        raise FileNotFoundError(f"No md5 file found in {archive_dir}")

    if len(md5_files) > 1:
        warnings.warn(
            f"Multiple md5 files found in {archive_dir}. Using {md5_files[0].name}."
        )

    md5_fp = md5_files[0]
    missing_files = []
    mismatched_hashes = []

    with open(md5_fp) as md5_file:
        for line in md5_file:
            line = line.strip()
            if not line:
                # Skip empty or whitespace-only lines rather than failing
                continue
            expected = line.split("\t")
            if len(expected) != 2:
                raise ValueError(
                    f"Invalid md5 line in {md5_fp}: expected 2 tab-separated "
                    f"values, got {len(expected)} in line: {line}"
                )
            filename, expected_md5 = expected
            file_fp = archive_dir / filename

            if not file_fp.is_file():
                missing_files.append(filename)
                continue

            computed_md5 = return_md5(file_fp)
            if computed_md5 != expected_md5:
                mismatched_hashes.append((filename, expected_md5, computed_md5))

    if missing_files or mismatched_hashes:
        raise ValueError(
            "MD5 verification failed",
            {
                "missing_files": missing_files,
                "mismatched_hashes": mismatched_hashes,
            },
        )

    return True


def main(argv=None):
parser = argparse.ArgumentParser(description="Backs up fastq files")

Expand Down Expand Up @@ -154,3 +198,16 @@ def main(argv=None):
)

# maybe also ask for single or double reads


def verify_main(argv=None):
    """CLI entry point: verify the md5 checksums of an archived run.

    Arguments are read from *argv*, or from ``sys.argv`` when None.
    Returns the result of :func:`verify_archive` for the given directory.
    """
    arg_parser = argparse.ArgumentParser(
        description="Verify md5 sums for an archived run"
    )
    arg_parser.add_argument(
        "--archive-dir",
        required=True,
        type=Path,
        help="Archive directory containing the md5 checksum file and reads.",
    )
    parsed = arg_parser.parse_args(argv)

    return verify_archive(parsed.archive_dir)
38 changes: 38 additions & 0 deletions test/test_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
build_fp_to_archive,
return_md5,
main,
verify_archive,
)


Expand Down Expand Up @@ -132,3 +133,40 @@ def test_main_returns_archive_path(tmp_path, full_miseq_dir):
expected_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001"
assert out_dir == expected_dir
assert expected_dir.is_dir()


def test_verify_archive(tmp_path, full_miseq_dir):
    """A freshly created archive passes md5 verification."""
    raw_dir = tmp_path / "raw_reads"
    raw_dir.mkdir(parents=True, exist_ok=True)

    out_dir = backup_fastq(
        full_miseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
        raw_dir,
        full_miseq_dir / "sample_sheet.csv",
        True,
        100,
    )

    assert verify_archive(out_dir)


def test_verify_archive_detects_changes(tmp_path, full_miseq_dir):
    """Corrupting an archived file makes verification raise ValueError."""
    raw_dir = tmp_path / "raw_reads"
    raw_dir.mkdir(parents=True, exist_ok=True)

    out_dir = backup_fastq(
        full_miseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
        raw_dir,
        full_miseq_dir / "sample_sheet.csv",
        True,
        100,
    )

    # Append bytes to simulate on-disk corruption of the archived read file.
    corrupted_fp = out_dir / "Undetermined_S0_L001_R1_001.fastq.gz"
    with open(corrupted_fp, "ab") as handle:
        handle.write(b"corruption")

    with pytest.raises(ValueError):
        verify_archive(out_dir)