From 31bd200f55794b470b6b1e4eddc0525baa994048 Mon Sep 17 00:00:00 2001 From: Charlie Date: Tue, 18 Nov 2025 11:21:32 -0500 Subject: [PATCH 1/2] Add archive verification command --- pyproject.toml | 1 + seqBackupLib/backup.py | 55 ++++++++++++++++++++++++++++++++++++++++++ test/test_backup.py | 38 +++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 16c687a..320c359 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ homepage = "https://github.com/PennChopMicrobiomeProgram" [project.scripts] backup_illumina = "seqBackupLib.backup:main" +verify_backup = "seqBackupLib.backup:verify_main" [tool.setuptools.packages.find] where = ["."] diff --git a/seqBackupLib/backup.py b/seqBackupLib/backup.py index f9745a0..9880416 100644 --- a/seqBackupLib/backup.py +++ b/seqBackupLib/backup.py @@ -114,6 +114,48 @@ def backup_fastq( return write_dir +def verify_archive(archive_dir: Path) -> bool: + md5_files = list(archive_dir.glob("*.md5")) + if not md5_files: + raise FileNotFoundError(f"No md5 file found in {archive_dir}") + + if len(md5_files) > 1: + warnings.warn( + f"Multiple md5 files found in {archive_dir}. Using {md5_files[0].name}." + ) + + md5_fp = md5_files[0] + missing_files = [] + mismatched_hashes = [] + + with open(md5_fp) as md5_file: + for line in md5_file: + expected = line.strip().split("\t") + if len(expected) != 2: + raise ValueError(f"Invalid md5 line in {md5_fp}: {line}") + filename, expected_md5 = expected + file_fp = archive_dir / filename + + if not file_fp.is_file(): + missing_files.append(filename) + continue + + computed_md5 = return_md5(file_fp) + if computed_md5 != expected_md5: + mismatched_hashes.append((filename, expected_md5, computed_md5)) + + if missing_files or mismatched_hashes: + raise ValueError( + "MD5 verification failed", + { + "missing_files": missing_files, + "mismatched_hashes": mismatched_hashes, + }, + ) + + return True + + def main(argv=None): parser = argparse.ArgumentParser(description="Backs up fastq files") @@ -154,3 +196,16 @@ def main(argv=None): ) # maybe also ask for single or double reads + + +def verify_main(argv=None): + parser = argparse.ArgumentParser(description="Verify md5 sums for an archived run") + parser.add_argument( + "--archive-dir", + required=True, + type=Path, + help="Archive directory containing the md5 checksum file and reads.", + ) + args = parser.parse_args(argv) + + return verify_archive(args.archive_dir) diff --git a/test/test_backup.py b/test/test_backup.py index b571898..b5b9e75 100644 --- a/test/test_backup.py +++ b/test/test_backup.py @@ -5,6 +5,7 @@ build_fp_to_archive, return_md5, main, + verify_archive, ) @@ -132,3 +133,40 @@ def test_main_returns_archive_path(tmp_path, full_miseq_dir): expected_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001" assert out_dir == expected_dir assert expected_dir.is_dir() + + +def test_verify_archive(tmp_path, full_miseq_dir): + raw = tmp_path / "raw_reads" + raw.mkdir(parents=True, exist_ok=True) + sample_sheet_fp = full_miseq_dir / "sample_sheet.csv" + + archive_dir = backup_fastq( + full_miseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz", + raw, + sample_sheet_fp, + True, + 100, + ) + + assert verify_archive(archive_dir) + + +def test_verify_archive_detects_changes(tmp_path, full_miseq_dir): + raw = tmp_path / "raw_reads" + raw.mkdir(parents=True, exist_ok=True) + sample_sheet_fp = full_miseq_dir / "sample_sheet.csv" + + archive_dir = backup_fastq( + full_miseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz", + raw, + sample_sheet_fp, + True, + 100, + ) + + target_fp = archive_dir / "Undetermined_S0_L001_R1_001.fastq.gz" + with open(target_fp, "ab") as f: + f.write(b"corruption") + + with pytest.raises(ValueError): + verify_archive(archive_dir) From d20271fb9d1e3547c0f8a00b5b4c419d5e5ccbb0 Mon Sep 17 00:00:00 2001 From: Charlie Date: Tue, 18 Nov 2025 11:24:16 -0500 Subject: [PATCH 2/2] Make archived files owner-writable --- seqBackupLib/backup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/seqBackupLib/backup.py b/seqBackupLib/backup.py index 9880416..20b2c38 100644 --- a/seqBackupLib/backup.py +++ b/seqBackupLib/backup.py @@ -90,8 +90,10 @@ def backup_fastq( ### All the checks are done and the files are safe to archive! - # move the files to the archive location and remove permission - permission = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH + # move the files to the archive location and set readable permissions + # keep the files writable by the owner to allow intentional updates or tests + # that simulate corruption + permission = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH md5s = [] for fp in RI_fps: if "_L" in fp.name: