Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ homepage = "https://github.com/PennChopMicrobiomeProgram"

[project.scripts]
backup_illumina = "seqBackupLib.backup:main"
verify_backup = "seqBackupLib.backup:verify_main"

[tool.setuptools.packages.find]
where = ["."]
Expand Down
61 changes: 59 additions & 2 deletions seqBackupLib/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,10 @@ def backup_fastq(

### All the checks are done and the files are safe to archive!

# move the files to the archive location and remove permission
permission = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
# move the files to the archive location and set readable permissions
# keep the files writable by the owner to allow intentional updates or tests
# that simulate corruption
permission = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH
md5s = []
for fp in RI_fps:
if "_L" in fp.name:
Expand All @@ -114,6 +116,48 @@ def backup_fastq(
return write_dir


def verify_archive(archive_dir: Path) -> bool:
    """Verify the md5 checksums recorded for an archived run.

    Reads the tab-separated ``filename<TAB>md5`` manifest (``*.md5``) found in
    *archive_dir* and recomputes the md5 of every listed file. All problems are
    collected before raising so the operator sees the full picture in one pass.

    Parameters
    ----------
    archive_dir : Path
        Archive directory containing the md5 manifest and the archived reads.

    Returns
    -------
    bool
        True when every listed file exists and matches its recorded hash.

    Raises
    ------
    FileNotFoundError
        If no ``*.md5`` manifest exists in *archive_dir*.
    ValueError
        If a manifest line is malformed, or if any listed file is missing or
        has a mismatched hash.
    """
    md5_files = list(archive_dir.glob("*.md5"))
    if not md5_files:
        raise FileNotFoundError(f"No md5 file found in {archive_dir}")

    if len(md5_files) > 1:
        warnings.warn(
            f"Multiple md5 files found in {archive_dir}. Using {md5_files[0].name}."
        )

    md5_fp = md5_files[0]
    missing_files = []
    mismatched_hashes = []

    with open(md5_fp) as md5_file:
        for line in md5_file:
            line = line.strip()
            if not line:
                # Skip empty or whitespace-only lines rather than failing
                continue
            expected = line.split("\t")
            if len(expected) != 2:
                raise ValueError(
                    f"Invalid md5 line in {md5_fp}: expected 2 tab-separated "
                    f"values, got {len(expected)} in line: {line}"
                )
            filename, expected_md5 = expected
            file_fp = archive_dir / filename

            if not file_fp.is_file():
                missing_files.append(filename)
                continue

            computed_md5 = return_md5(file_fp)
            if computed_md5 != expected_md5:
                mismatched_hashes.append((filename, expected_md5, computed_md5))

    if missing_files or mismatched_hashes:
        raise ValueError(
            "MD5 verification failed",
            {
                "missing_files": missing_files,
                "mismatched_hashes": mismatched_hashes,
            },
        )

    return True


def main(argv=None):
parser = argparse.ArgumentParser(description="Backs up fastq files")

Expand Down Expand Up @@ -154,3 +198,16 @@ def main(argv=None):
)

# maybe also ask for single or double reads


def verify_main(argv=None):
    """CLI entry point: verify the md5 checksums of an archived run.

    Arguments are read from *argv*, or from ``sys.argv`` when None.
    Returns the result of :func:`verify_archive` for the given directory.
    """
    arg_parser = argparse.ArgumentParser(
        description="Verify md5 sums for an archived run"
    )
    arg_parser.add_argument(
        "--archive-dir",
        required=True,
        type=Path,
        help="Archive directory containing the md5 checksum file and reads.",
    )
    parsed = arg_parser.parse_args(argv)

    return verify_archive(parsed.archive_dir)
38 changes: 38 additions & 0 deletions test/test_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
build_fp_to_archive,
return_md5,
main,
verify_archive,
)


Expand Down Expand Up @@ -132,3 +133,40 @@ def test_main_returns_archive_path(tmp_path, full_miseq_dir):
expected_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001"
assert out_dir == expected_dir
assert expected_dir.is_dir()


def test_verify_archive(tmp_path, full_miseq_dir):
    """A freshly created archive passes md5 verification."""
    raw_dir = tmp_path / "raw_reads"
    raw_dir.mkdir(parents=True, exist_ok=True)

    out_dir = backup_fastq(
        full_miseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
        raw_dir,
        full_miseq_dir / "sample_sheet.csv",
        True,
        100,
    )

    assert verify_archive(out_dir)


def test_verify_archive_detects_changes(tmp_path, full_miseq_dir):
    """Corrupting an archived file makes verification raise ValueError."""
    raw_dir = tmp_path / "raw_reads"
    raw_dir.mkdir(parents=True, exist_ok=True)

    out_dir = backup_fastq(
        full_miseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
        raw_dir,
        full_miseq_dir / "sample_sheet.csv",
        True,
        100,
    )

    # Append bytes to simulate on-disk corruption of the archived read file.
    corrupted_fp = out_dir / "Undetermined_S0_L001_R1_001.fastq.gz"
    with open(corrupted_fp, "ab") as handle:
        handle.write(b"corruption")

    with pytest.raises(ValueError):
        verify_archive(out_dir)