diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 0000000..36fed4b --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,14 @@ +name: Tests + +on: + push: + branches: [ main, master ] + pull_request: + branches: [ main, master ] + + workflow_dispatch: + +jobs: + run-tests: + uses: ./.github/workflows/test.yml + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..71f87ec --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,50 @@ +name: Tests + +on: + workflow_call: + + workflow_dispatch: + +jobs: + tests: + name: Run Tests + strategy: + fail-fast: false + matrix: + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + runs-on: "ubuntu-latest" + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install pytest + python -m pip install . + + - name: Run tests + run: pytest -s -vvvv -l --tb=long test + + lint: + name: Lint Code Base + runs-on: ubuntu-latest + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: 3.12 + + - name: Install Dependencies + run: pip install black + + - name: Lint Code Base + run: | + black --check . \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1f48344..3e37603 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,27 @@ +# Editors +.vscode/ +.idea/ + +# Vagrant +.vagrant/ + +# Mac/OSX +.DS_Store + +# Windows +Thumbs.db + +# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] -*~ +*$py.class # C extensions *.so # Distribution / packaging .Python -env/ build/ develop-eggs/ dist/ @@ -20,9 +33,11 @@ lib64/ parts/ sdist/ var/ +wheels/ *.egg-info/ .installed.cfg *.egg +MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -37,12 +52,15 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ +.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml -*,cover +*.cover +.hypothesis/ +.pytest_cache/ # Translations *.mo @@ -50,6 +68,15 @@ coverage.xml # Django stuff: *.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy # Sphinx documentation docs/_build/ @@ -57,6 +84,42 @@ docs/_build/ # PyBuilder target/ -# data files -*.fasta -*.fastq \ No newline at end of file +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json \ No newline at end of file diff --git a/README.md b/README.md index a243b07..d1fe92b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,30 @@ # seqBackup -`python -m unittests` to run tests. +Logic for parsing Illumina headers and folders as well as for archiving reads. 
-## TODO: +## Dev --make backup_illumina.py able to take relative paths for --forward-reads argument +``` +git clone https://github.com/PennChopMicrobiomeProgram/seqBackup.git +cd seqBackup/ +python -m venv env +source env/bin/activate +pip install -e . +pip install black pytest +``` + +Before committing, make sure everything is well formatted and working: + +``` +black . +pytest test/ +git commit ... +``` + +### Adding a new machine type + +To add a new machine type, add the new machine code to the `MACHINE_TYPES` map in `seqBackupLib/illumina.py`. In some cases, you may have to add machine-specific parsing in `_parse_header` or `_parse_folder`. In `test/test_illumina.py`, we have a mechanism for requiring tests for each supported machine type. Add the new machine type to the `machine_fixtures` map and then create the fixture that it points to in `test/conftest.py`. Follow the pattern laid out by the other fixtures and try to make the test data as realistic as possible. + +### Incorporating a new version + +This software is the "source of truth" for Illumina file handling logic. Other software in our ecosystem depends on this logic, including the sample registry and the automation pipeline. When you update this software, you will then need to update the installed versions wherever it is deployed as a dependency. We don't bother with official GitHub releases and instead just point directly at the `master` branch, so usually it is a matter of running `pip install git+https://github.com/PennChopMicrobiomeProgram/seqBackup.git@master` from the host machine. \ No newline at end of file diff --git a/scripts/backup_illumina.py b/scripts/backup_illumina.py index b1687c5..7fcb1c7 100644 --- a/scripts/backup_illumina.py +++ b/scripts/backup_illumina.py @@ -1,3 +1,4 @@ #!/usr/bin/env python from seqBackupLib.backup import main + main() diff --git a/seqBackupLib/backup.py b/seqBackupLib/backup.py index fe790bd..920f206 100644 --- a/seqBackupLib/backup.py +++ b/seqBackupLib/backup.py @@ -6,49 +6,68 @@ import gzip import hashlib import warnings - +from pathlib import Path from seqBackupLib.illumina import IlluminaFastq -def build_fp_to_archive(file_name, has_index, lane): - if re.search("R1_001.fastq", file_name) is None: - raise IOError("The file doesn't look like an R1 file: {}".format(file_name)) +DEFAULT_MIN_FILE_SIZE = 500000000  # 500MB + + +def build_fp_to_archive(fp: Path, has_index: bool, lane: str) -> list[Path]: + + if re.search("R1_001.fastq", fp.name) is None: + raise IOError("The file doesn't look like an R1 file: {}".format(fp)) label = ["R2"] if has_index: label.extend(["I1", "I2"]) rexp = "".join(["(L00", lane, "_)(R1)(_001.fastq.gz)$"]) - modified_fp = [re.sub(rexp, "".join(["\\1", lab, "\\3"]), file_name) for lab in label] - return [file_name] + modified_fp + modified_fp = [re.sub(rexp, "".join(["\\1", lab, "\\3"]), fp.name) for lab in label] + return [fp] + [fp.parent / n for n in modified_fp] + -def return_md5(fname): +def return_md5(fp: Path) -> str: # from https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file hash_md5 = hashlib.md5() - with open(fname, "rb") as f: + with open(fp, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): hash_md5.update(chunk) return hash_md5.hexdigest() -def backup_fastq(forward_reads, dest_dir, sample_sheet_fp, has_index, min_file_size): - - R1 = IlluminaFastq(gzip.open(forward_reads, mode = 'rt')) - # build the strings for the required files - file_names_RI = build_fp_to_archive(forward_reads, has_index, R1.lane) +def backup_fastq(
+ forward_reads: Path, + dest_dir: Path, + sample_sheet_fp: Path, + has_index: bool, + min_file_size: int, +): + + R1 = IlluminaFastq(gzip.open(forward_reads, mode="rt")) + + # build the strings for the required files + RI_fps = build_fp_to_archive(forward_reads, has_index, R1.lane) # create the Illumina objects and check the files - illumina_fastqs = [] - for fp in file_names_RI: - illumina_temp = IlluminaFastq(gzip.open(fp, mode = 'rt')) - if not illumina_temp.check_fp_vs_content()[0]: - print(illumina_temp.check_fp_vs_content()[1:]) - raise ValueError("The file path and header information don't match") - if not illumina_temp.check_file_size(min_file_size): - raise ValueError("File {0} seems suspiciously small. Plese check if you have the correct file or lower the minimum file size threshold".format(fp)) - if not illumina_temp.check_index_read_exists(): - warnings.warn("No barcodes in headers. Were the fastq files generated properly?: {0}".format(fp)) - illumina_fastqs.append(illumina_temp) + illumina_fastqs = [IlluminaFastq(gzip.open(fp, mode="rt")) for fp in RI_fps] + r1 = illumina_fastqs[0] + + if not all([ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]): + [ifq.check_fp_vs_content(verbose=True) for ifq in illumina_fastqs] + raise ValueError( + "The file path and header information don't match", + [str(ifq) for ifq in illumina_fastqs if not ifq.check_fp_vs_content()[0]], + ) + if not all([ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]): + raise ValueError( + "File seems suspiciously small. Please check if you have the correct file or lower the minimum file size threshold", + [ifq.check_file_size(min_file_size) for ifq in illumina_fastqs], + ) + if not all([ifq.check_index_read_exists() for ifq in illumina_fastqs]): + warnings.warn( + "No barcodes in headers. Were the fastq files generated properly?" + ) # parse the info from the headers in EACH file and check they are consistent within each other if not all([fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs]): @@ -57,59 +76,72 @@ def backup_fastq(forward_reads, dest_dir, sample_sheet_fp, has_index, min_file_s ## Archiving steps # make sure the sample sheet exists - if not os.path.isfile(sample_sheet_fp): - raise IOError("Sample sheet does not exist: {}".format(sample_sheet_fp)) + if not sample_sheet_fp.is_file(): + raise IOError("Sample sheet does not exist", str(sample_sheet_fp)) # create the folder to write to - write_dir = os.path.join(dest_dir, illumina_temp.build_archive_dir()) - - # create the folder. If it exists exit - if os.path.isdir(write_dir): - raise IOError("The folder already exists: {}".format(write_dir)) - os.mkdir(write_dir) + write_dir = dest_dir / r1.build_archive_dir() + write_dir.mkdir(parents=True, exist_ok=False) ### All the checks are done and the files are safe to archive! 
# move the files to the archive location and remove permission permission = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH - for fp in file_names_RI: - shutil.copyfile(fp, os.path.join(write_dir, os.path.basename(fp))) - os.chmod(os.path.join(write_dir, os.path.basename(fp)), permission) #this doesn't work on isilon + for fp in RI_fps: + output_fp = write_dir / fp.name + shutil.copyfile(fp, output_fp) + output_fp.chmod(permission) # copy the sample sheet to destination folder - shutil.copyfile(sample_sheet_fp, os.path.join(write_dir, os.path.basename(sample_sheet_fp))) + shutil.copyfile(sample_sheet_fp, write_dir / sample_sheet_fp.name) # write md5sums to a file - md5s = [(os.path.basename(fp), return_md5(fp)) for fp in file_names_RI] - md5out_fp = os.path.join(write_dir, ".".join([illumina_temp.build_archive_dir(), "md5"])) - with open(md5out_fp, "w") as md5_out: + md5s = [(fp.name, return_md5(fp)) for fp in RI_fps] + md5_out_fp = write_dir / ".".join([r1.build_archive_dir(), "md5"]) + with open(md5_out_fp, "w") as md5_out: [md5_out.write("\t".join(md5) + "\n") for md5 in md5s] + def main(argv=None): parser = argparse.ArgumentParser(description="Backs up fastq files") parser.add_argument( - "--forward-reads", required=True, - type=str, - help="R1.fastq") + "--forward-reads", required=True, type=Path, help="Gzipped R1 fastq file" + ) parser.add_argument( - "--destination-dir", required=True, - type=str, - help="Destination folder to copy the files to.") + "--destination-dir", + required=True, + type=Path, + help="Destination folder to copy the files to.", + ) parser.add_argument( - "--sample-sheet", required=True, - type=str, - help="The sample sheet associated with the run.") + "--sample-sheet", + required=True, + type=Path, + help="The sample sheet associated with the run.", + ) parser.add_argument( - "--has-index", required=False, - type=bool, default=True, - help="Are index reads generated") + "--has-index", + required=False, + type=bool, + default=True, + help="Are index reads generated", + ) parser.add_argument( - "--min-file-size", required=False, - type=int, default=500000000, - help="Minimum file size to register in bytes") + "--min-file-size", + required=False, + type=int, + default=DEFAULT_MIN_FILE_SIZE, + help="Minimum file size to register in bytes", + ) args = parser.parse_args(argv) - backup_fastq(args.forward_reads, args.destination_dir, args.sample_sheet, args.has_index, args.min_file_size) + backup_fastq( + args.forward_reads, + args.destination_dir, + args.sample_sheet, + args.has_index, + args.min_file_size, + ) # maybe also ask for single or double reads diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index ec57b19..bc0a913 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -1,38 +1,45 @@ -import gzip -import os.path import re -import warnings +from io import TextIOWrapper +from pathlib import Path +class IlluminaFastq: + MACHINE_TYPES = { + "VH": "Illumina-NextSeq", + "D": "Illumina-HiSeq", + "M": "Illumina-MiSeq", + "A": "Illumina-NovaSeq", + "NB": "Illumina-MiniSeq", + "LH": "Illumina-NovaSeqX", + } -class IlluminaFastq(object): - machine_types = {"V": "Illumina-NextSeq", "D": "Illumina-HiSeq", "M": "Illumina-MiSeq", "A": "Illumina-NovaSeq","N": "Illumina-MiniSeq"} - - def __init__(self, f): + def __init__(self, f: TextIOWrapper): self.file = f self.fastq_info = self._parse_header() self.folder_info = self._parse_folder() - + def __str__(self): - return "_".join([self.fastq_info["instrument"], - self.fastq_info["run_number"], - 
self.fastq_info["flowcell_id"], - self.fastq_info["lane"]]) - - def is_same_run(self, other): - run_check = self.fastq_info["run_number"] == other.fastq_info["run_number"] - instrument_check = self.fastq_info["instrument"] == other.fastq_info["instrument"] - flowcell_check = self.fastq_info["flowcell_id"] == other.fastq_info["flowcell_id"] - return (run_check and instrument_check and flowcell_check) - - def _parse_header(self): + return "_".join( + [ + self.fastq_info["instrument"], + self.fastq_info["run_number"], + self.fastq_info["flowcell_id"], + self.fastq_info["lane"], + ] + ) + + def is_same_run(self, other: "IlluminaFastq") -> bool: + keys = ["run_number", "instrument", "flowcell_id"] + return all(self.fastq_info[k] == other.fastq_info[k] for k in keys) + + def _parse_header(self) -> dict[str, str]: line = next(self.file).strip() if not line.startswith("@"): raise ValueError("Not a FASTQ header line") # Remove first character, @ line = line[1:] word1, _, word2 = line.partition(" ") - + keys1 = ("instrument", "run_number", "flowcell_id", "lane") vals1 = dict((k, v) for k, v in zip(keys1, word1.split(":"))) @@ -42,65 +49,167 @@ def _parse_header(self): vals1.update(vals2) return vals1 - def _parse_folder(self): - matches = re.match("(\\d{6})_([DMANV]B?H?\\d{5,6})_0*(\\d{1,4})_(.*)", self.run_name) - keys1 = ("date", "instrument", "run_number", "flowcell_id") - vals1 = dict((k, v) for k, v in zip(keys1, matches.groups())) - - if self.machine_type == "Illumina-HiSeq" or self.machine_type == "Illumina-NovaSeq" or self.machine_type == "Illumina-MiniSeq": - vals1["flowcell_id"] = vals1["flowcell_id"][1:] - - matches = re.match("Undetermined_S0_L00([1-8])_([RI])([12])_001.fastq.gz", os.path.basename(self.filepath)) + def _parse_folder(self) -> dict[str, str]: + # Extract directory name info + parts = self.run_name.split("_") + + date = parts[0] + if len(date) == 8: + self.date = f"{date[0:4]}-{date[4:6]}-{date[6:8]}" + elif len(date) == 6: + self.date = f"20{date[0:2]}-{date[2:4]}-{date[4:6]}" + else: + raise ValueError(f"Invalid date format in run name: {date}") + + instrument = parts[1] + if self._extract_instrument_code(instrument) not in self.MACHINE_TYPES: + raise ValueError(f"Invalid instrument code in run name: {instrument}") + + run_number = parts[2] + if not run_number.isdigit(): + raise ValueError(f"Invalid run number in run name: {run_number}") + + flowcell_id = parts[3] + + if len(parts) > 4: + raise ValueError(f"Unexpected extra parts in run name: {parts[4:]}") + + vals1 = { + "date": date, + "instrument": instrument, + "run_number": str(int(run_number)), + "flowcell_id": flowcell_id, + } + + if ( + self.machine_type == "Illumina-HiSeq" + or self.machine_type == "Illumina-NovaSeq" + or self.machine_type == "Illumina-MiniSeq" + or self.machine_type == "Illumina-NovaSeqX" + ): + vals1["flowcell_id"] = vals1["flowcell_id"][1:] + + # Extract file name info + matches = re.match( + "Undetermined_S0_L00([1-8])_([RI])([12])_001.fastq.gz", self.filepath.name + ) keys2 = ("lane", "read_or_index", "read") vals2 = dict((k, v) for k, v in zip(keys2, matches.groups())) - + vals1.update(vals2) return vals1 - @property - def machine_type(self): - instrument_code = self.fastq_info["instrument"][0] - return self.machine_types[instrument_code] + @staticmethod + def _extract_instrument_code(instrument: str) -> str: + return "".join(filter(lambda x: not x.isdigit(), instrument)) @property - def date(self): - year = self.run_name[0:2] - month = self.run_name[2:4] - day = self.run_name[4:6] - return 
"20{0}-{1}-{2}".format(year, month, day) + def machine_type(self): + return self.MACHINE_TYPES[ + self._extract_instrument_code(self.fastq_info["instrument"]) + ] @property - def lane(self): + def lane(self) -> str: return self.fastq_info["lane"] @property - def filepath(self): - return self.file.name - + def filepath(self) -> Path: + return Path(self.file.name) @property - def run_name(self): - dir_split = self.filepath.split(os.sep) - #return(dir_split[-2]) - matches = [re.match("\\d{6}_[DMANV]B?H?\\d{5,6}_\\d{1,4}_[\\dA-Z]{9}", d) for d in dir_split] - matches = [dir_split[i] for i, m in enumerate(matches) if m] - if len(matches) != 1: - raise ValueError("Could not find run name in directory: {0}".format(self.filepath)) - return matches[0] - - def build_archive_dir(self): - return '_'.join([self.run_name, 'L{:0>3}'.format(self.lane)]) - - def check_fp_vs_content(self): + def run_name(self) -> str: + for part in self.filepath.parts: + segments = part.split("_") + if ( + len(segments) >= 4 + and segments[0].isdigit() + and self._extract_instrument_code(segments[1]) in self.MACHINE_TYPES + and segments[2].isdigit() + ): + return part + raise ValueError(f"Run name not found in path: {self.filepath}") + + def build_archive_dir(self) -> str: + return "_".join([self.run_name, "L{:0>3}".format(self.lane)]) + + def check_fp_vs_content(self, verbose: bool = False) -> list[bool]: run_check = self.fastq_info["run_number"] == self.folder_info["run_number"] - instrument_check = self.fastq_info["instrument"] == self.folder_info["instrument"] - flowcell_check = self.fastq_info["flowcell_id"] == self.folder_info["flowcell_id"] + instrument_check = ( + self.fastq_info["instrument"] == self.folder_info["instrument"] + ) + flowcell_check = ( + self.fastq_info["flowcell_id"] == self.folder_info["flowcell_id"] + ) lane_check = self.lane == self.folder_info["lane"] read_check = self.fastq_info["read"] == self.folder_info["read"] - return ([run_check and instrument_check and flowcell_check and lane_check and read_check, run_check, instrument_check, flowcell_check, lane_check, read_check, self.fastq_info["flowcell_id"], self.folder_info["flowcell_id"]]) - - def check_file_size(self, min_file_size): - return os.path.getsize(self.filepath) > min_file_size - def check_index_read_exists(self): + if verbose: + ( + print( + "Fastq run number: ", + self.fastq_info["run_number"], + "Folder run number: ", + self.folder_info["run_number"], + ) + if not run_check + else None + ) + ( + print( + "Fastq instrument: ", + self.fastq_info["instrument"], + "Folder instrument: ", + self.folder_info["instrument"], + ) + if not instrument_check + else None + ) + ( + print( + "Fastq flowcell id: ", + self.fastq_info["flowcell_id"], + "Folder flowcell id: ", + self.folder_info["flowcell_id"], + ) + if not flowcell_check + else None + ) + ( + print( + "Fastq lane: ", self.lane, "Folder lane: ", self.folder_info["lane"] + ) + if not lane_check + else None + ) + ( + print( + "Fastq read: ", + self.fastq_info["read"], + "Folder read: ", + self.folder_info["read"], + ) + if not read_check + else None + ) + + return [ + run_check + and instrument_check + and flowcell_check + and lane_check + and read_check, + run_check, + instrument_check, + flowcell_check, + lane_check, + read_check, + self.fastq_info["flowcell_id"], + self.folder_info["flowcell_id"], + ] + + def check_file_size(self, min_file_size) -> bool: + return self.filepath.stat().st_size > min_file_size + + def check_index_read_exists(self) -> bool: return 
len(self.fastq_info["index_reads"]) > 2 diff --git a/setup.py b/setup.py index 66a3ca4..00df46d 100644 --- a/setup.py +++ b/setup.py @@ -3,16 +3,16 @@ from distutils.core import setup # Get version number from package -exec(open('seqBackupLib/version.py').read()) +exec(open("seqBackupLib/version.py").read()) setup( - name='seqBackup', + name="seqBackup", version=__version__, - description='Set of rules to organize our fastq storage on the server.', - author='Ceylan Tanes', - author_email='ctanes@gmail.com', - url='https://github.com/PennChopMicrobiomeProgram', - packages=['seqBackupLib'], - scripts=['scripts/backup_illumina.py']#, - #install_requires=["pandas", "biopython"] - ) + description="Set of rules to organize our fastq storage on the server.", + author="Ceylan Tanes", + author_email="ctanes@gmail.com", + url="https://github.com/PennChopMicrobiomeProgram", + packages=["seqBackupLib"], + scripts=["scripts/backup_illumina.py"], + python_requires=">=3.9", +) diff --git a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_I1_001.fastq.gz b/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_I1_001.fastq.gz deleted file mode 100644 index 401f4df..0000000 Binary files a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_I1_001.fastq.gz and /dev/null differ diff --git a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_I2_001.fastq.gz b/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_I2_001.fastq.gz deleted file mode 100644 index 89cb3a6..0000000 Binary files a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_I2_001.fastq.gz and /dev/null differ diff --git a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R1_001.fastq.gz b/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R1_001.fastq.gz deleted file mode 100644 index 6bc44e9..0000000 Binary files a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R1_001.fastq.gz and /dev/null differ diff --git a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R2_001.fastq.gz b/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R2_001.fastq.gz deleted file mode 100644 index 52fde44..0000000 Binary files a/test/170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R2_001.fastq.gz and /dev/null differ diff --git a/test/170323_M04734_0028_000000000-B2MVT/test_sample_sheet.txt b/test/170323_M04734_0028_000000000-B2MVT/test_sample_sheet.txt deleted file mode 100644 index 4af3342..0000000 --- a/test/170323_M04734_0028_000000000-B2MVT/test_sample_sheet.txt +++ /dev/null @@ -1,5 +0,0 @@ -SampleID SampleType study_group -S1 Oral swab healthy -S2 Oral swab active -S3 Feces healthy -S4 Feces active diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..2918247 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,237 @@ +import gzip +import pytest +from pathlib import Path + + +def setup_illumina_dir(fp: Path, r1: str, r1_lines: list[str]) -> Path: + fp.mkdir(parents=True, exist_ok=True) + + r1_fp = fp / r1 + with gzip.open(r1_fp, "wt") as f: + f.writelines(r1_lines) + + (fp / r1.replace("R1", "R2")).touch() + (fp / r1.replace("R1", "I1")).touch() + (fp / r1.replace("R1", "I2")).touch() + (fp / "sample_sheet.csv").touch() + + return fp + + +@pytest.fixture +def novaseq_dir(tmp_path) -> Path: + return setup_illumina_dir( + tmp_path / "250218_A00901_1295_BHTKCGDRX5", + "Undetermined_S0_L001_R1_001.fastq.gz", + [ + "@A00901:1295:HTKCGDRX5:1:2101:1054:1000 1:N:0:NAGTGTTAGG+CGGAACTAGC\n", + 
"GTAAAAAGCTAGATTTTCGCGATTTACCAGACGAACTANTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", + "+\n", + "FFFFF,FFF:FFFFFF,FFFFFFFFFFFFFFFFFFFFF#,###############################################################################################################\n", + "@A00901:1295:HTKCGDRX5:1:2101:1090:1000 1:N:0:AATTCTTGGA+AAGTTGACAA\n", + "GCTGCAATATGCGCCAACAAAACCGGTGGATAAAAAGGTTTCGTAATATAGTCATCNCNGNCNTNTNCNANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTTCTCCGC\n", + "+\n", + ",FFFFFFFF:FFFFFFFFFFFF::FFFFFFFFF,FFFFFFF:F::FFFFFFFFFF:#F#F#F#F#:#F#F#######################################################################FFFFFFFFFF\n", + ], + ) + + +@pytest.fixture +def hiseq_dir(tmp_path) -> Path: + return setup_illumina_dir( + tmp_path / "201118_D00728_0139_ACD5C3ANXX", + "Undetermined_S0_L001_R1_001.fastq.gz", + [ + "@D00728:139:CD5C3ANXX:1:1101:1228:2123 1:N:0:ATCTCAGG+CCTAGAGT\n", + "NTGCGCAGGGGGACCTGCACCGGCATCCCCTGTACCGGCGGGGCGCTCAGGCTGAATGCGCCGTCCTGCATCAGTACCGACTCCGGCTCGATGGCTTTATCCTGTCTCTTATACACATCTCCGAGC\n", + "+\n", + "#:<>AEBGGGGGGGGGDDG=CGGGGF=FGGGGGGGGGGGGGGGGF Path: + return setup_illumina_dir( + tmp_path / "20250429_LH00732_0028_A22YJWWLT3", + "Undetermined_S0_L001_R1_001.fastq.gz", + [ + "@LH00732:28:22YJWWLT3:1:1101:1213:1080 1:N:0:CCTCCGTCCA+CACCGATGTG\n", + "ACGT\n", + "+\n", + "IIII\n", + ], + ) + + +@pytest.fixture +def miseq_dir(tmp_path) -> Path: + return setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L001_R1_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:1:1101:16223:1348 1:N:0:TTTTTTTTTTTT+TTCTTTTTCCTT\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3A333BB33B311100BA000BBBE122B110A//AA/>//>///>///<<-9-99--99---999--999@999-9999@>---9------9-9\n", + "@M03543:443:000000000-DTHBL:1:1101:15497:1351 1:N:0:TTTTTTTTTTTT+TTCTTTTTCCTC\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3B333BB33B311100BB000BBCD122B110A//AA/>//>///>////<---9------9-9\n", + ], + ) + + +@pytest.fixture +def miniseq_dir(tmp_path) -> Path: + return setup_illumina_dir( + tmp_path / "210612_NB551353_0107_AHWJFCAFX2", + "Undetermined_S0_L001_R1_001.fastq.gz", + [ + "@NB551353:107:HWJFCAFX2:1:11101:1486:1048 1:N:0:TAATTAGCGT+NNNTTAACCA\n", + "GAAATNGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTA\n", + "+\n", + "AAAAA#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEE\n", + "@NB551353:107:HWJFCAFX2:1:11101:6713:1048 1:N:0:GAAGACTAGA+NNNTTCTAGT\n", + "GGCTANATCTGAGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGT\n", + "+\n", + "AAAAA#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE\n", + ], + ) + + +@pytest.fixture +def nextseq_dir(tmp_path) -> Path: + return setup_illumina_dir( + tmp_path / "250101_VH12345_0022_222C2NYNX", + "Undetermined_S0_L001_R1_001.fastq.gz", + [ + "@VH12345:22:222C2NYNX:1:1101:18286:1000 1:N:0:GGCACTAAGG+GTTGACCTGA\n", + "ACGT\n", + "+\n", + "IIII\n", + ], + ) + + +@pytest.fixture +def full_miseq_dir(tmp_path) -> 
Path: + # Lane 1 + setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L001_R1_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:1:1101:16223:1348 1:N:0:TTTTTTTTTTTT+TTCTTTTTCCTT\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3A333BB33B311100BA000BBBE122B110A//AA/>//>///>///<<-9-99--99---999--999@999-9999@>---9------9-9\n", + "@M03543:443:000000000-DTHBL:1:1101:15497:1351 1:N:0:TTTTTTTTTTTT+TTCTTTTTCCTC\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3B333BB33B311100BB000BBCD122B110A//AA/>//>///>////<---9------9-9\n", + ], + ) + setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L001_R2_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:1:1101:16223:1348 2:N:0:TTTTTTTTTTTT+TTCTTTTTCCTT\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3A333BB33B311100BA000BBBE122B110A//AA/>//>///>///<<-9-99--99---999--999@999-9999@>---9------9-9\n", + "@M03543:443:000000000-DTHBL:1:1101:15497:1351 2:N:0:TTTTTTTTTTTT+TTCTTTTTCCTC\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3B333BB33B311100BB000BBCD122B110A//AA/>//>///>////<---9------9-9\n", + ], + ) + setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L001_I1_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "111>111>0000\n", + "@M03543:443:000000000-DTHBL:1:2106:14807:1943 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "1>1111>100>0\n", + ], + ) + setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L001_I2_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:1:2106:17605:1940 2:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "111>111>0000\n", + "@M03543:443:000000000-DTHBL:1:2106:14807:1943 2:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "1>1111>100>0\n", + ], + ) + + # Lane 2 + setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L002_R1_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:2:1101:16223:1348 1:N:0:TTTTTTTTTTTT+TTCTTTTTCCTT\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3A333BB33B311100BA000BBBE122B110A//AA/>//>///>///<<-9-99--99---999--999@999-9999@>---9------9-9\n", + "@M03543:443:000000000-DTHBL:2:1101:15497:1351 
1:N:0:TTTTTTTTTTTT+TTCTTTTTCCTC\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3B333BB33B311100BB000BBCD122B110A//AA/>//>///>////<---9------9-9\n", + ], + ) + setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L002_R2_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:2:1101:16223:1348 2:N:0:TTTTTTTTTTTT+TTCTTTTTCCTT\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3A333BB33B311100BA000BBBE122B110A//AA/>//>///>///<<-9-99--99---999--999@999-9999@>---9------9-9\n", + "@M03543:443:000000000-DTHBL:2:1101:15497:1351 2:N:0:TTTTTTTTTTTT+TTCTTTTTCCTC\n", + "TCTTCCCTCTTTCTTCTTTCTTCCTCCCTTCCCTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", + "+\n", + ">>>>A1C1BB1B3B333BB33B311100BB000BBCD122B110A//AA/>//>///>////<---9------9-9\n", + ], + ) + setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L002_I1_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:2:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "111>111>0000\n", + "@M03543:443:000000000-DTHBL:2:2106:14807:1943 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "1>1111>100>0\n", + ], + ) + return setup_illumina_dir( + tmp_path / "250407_M03543_0443_000000000-DTHBL", + "Undetermined_S0_L002_I2_001.fastq.gz", + [ + "@M03543:443:000000000-DTHBL:2:2106:17605:1940 2:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "111>111>0000\n", + "@M03543:443:000000000-DTHBL:2:2106:14807:1943 2:N:0:TTTTTTTTTTTT+TCTTTCCCTACA\n", + "TTTTTTTTTTTT\n", + "+\n", + "1>1111>100>0\n", + ], + ) diff --git a/test/test_backup.py b/test/test_backup.py index 0ceea6b..4645a71 100644 --- a/test/test_backup.py +++ b/test/test_backup.py @@ -1,46 +1,75 @@ -import unittest -import gzip -import tempfile -from io import StringIO - -from seqBackupLib.illumina import IlluminaFastq -from seqBackupLib.backup import * - -class BackupTests(unittest.TestCase): - def setUp(self): - self.curr_dir = os.path.dirname(os.path.abspath(__file__)) - self.fastq_filepath = os.path.join(self.curr_dir, "170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R1_001.fastq.gz") - self.temp_out_dir = tempfile.mkdtemp(dir=self.curr_dir) - self.sample_sheet_fp = os.path.join(self.curr_dir, "170323_M04734_0028_000000000-B2MVT/test_sample_sheet.txt") - - def tearDown(self): - shutil.rmtree(self.temp_out_dir) - - def test_build_fp_to_archive(self): - list1 = build_fp_to_archive("Undetermined_S0_L001_R1_001.fastq.gz", True, "1") - self.assertCountEqual(list1, ["Undetermined_S0_L001_R1_001.fastq.gz", "Undetermined_S0_L001_R2_001.fastq.gz", "Undetermined_S0_L001_I1_001.fastq.gz", "Undetermined_S0_L001_I2_001.fastq.gz"]) - - list1 = build_fp_to_archive("Undetermined_S0_L001_R1_001.fastq.gz", False, "1") - self.assertCountEqual(list1, ["Undetermined_S0_L001_R1_001.fastq.gz", "Undetermined_S0_L001_R2_001.fastq.gz"]) - - def 
test_backup_fastq(self): - has_index = True - min_file_size = 5 - backup_fastq(self.fastq_filepath, self.temp_out_dir, self.sample_sheet_fp, has_index, min_file_size) - - # check the md5sums of the first fastq is the same - fq = IlluminaFastq(gzip.open(self.fastq_filepath, mode = 'rt')) - out_fp = os.path.join(self.temp_out_dir, fq.build_archive_dir(), os.path.basename(self.fastq_filepath)) - md5_orj = return_md5(self.fastq_filepath) - md5_trans = return_md5(out_fp) - self.assertEqual(md5_orj, md5_trans) - - # check the md5sum of the sample sheet - ss_fp = os.path.join(self.temp_out_dir, fq.build_archive_dir(), os.path.basename(self.sample_sheet_fp)) - self.assertEqual(return_md5(ss_fp), "92ef1ca7433cadb5267d822615cd15e2") - - # check write permissions of the files - self.assertEqual(os.stat(out_fp).st_mode, 33060) - - def test_return_md5(self): - self.assertEqual(return_md5(self.fastq_filepath), "13695e47114c02536ae3ca6823a42261") +import pytest +from pathlib import Path +from seqBackupLib.backup import backup_fastq, build_fp_to_archive, return_md5 + + +def test_build_fp_to_archive(): + archive = build_fp_to_archive( + Path("Undetermined_S0_L001_R1_001.fastq.gz"), True, "1" + ) + assert archive == [ + Path("Undetermined_S0_L001_R1_001.fastq.gz"), + Path("Undetermined_S0_L001_R2_001.fastq.gz"), + Path("Undetermined_S0_L001_I1_001.fastq.gz"), + Path("Undetermined_S0_L001_I2_001.fastq.gz"), + ] + + archive = build_fp_to_archive( + Path("Undetermined_S0_L001_R1_001.fastq.gz"), False, "1" + ) + assert archive == [ + Path("Undetermined_S0_L001_R1_001.fastq.gz"), + Path("Undetermined_S0_L001_R2_001.fastq.gz"), + ] + + archive = build_fp_to_archive( + Path("Undetermined_S0_L002_R1_001.fastq.gz"), True, "2" + ) + assert archive == [ + Path("Undetermined_S0_L002_R1_001.fastq.gz"), + Path("Undetermined_S0_L002_R2_001.fastq.gz"), + Path("Undetermined_S0_L002_I1_001.fastq.gz"), + Path("Undetermined_S0_L002_I2_001.fastq.gz"), + ] + + with pytest.raises(IOError): + build_fp_to_archive(Path("Undetermined_S0_L001_R2_001.fastq.gz"), True, "1") + + +def test_return_md5(tmp_path): + test_file = tmp_path / "test.txt" + with open(test_file, "w") as f: + f.write("Hello, World!") + + md5_hash = return_md5(test_file) + assert md5_hash == "65a8e27d8879283831b664bd8b7f0ad4" # MD5 hash of "Hello, World!" 
+ + +def test_backup_fastq(tmp_path, full_miseq_dir): + raw = tmp_path / "raw_reads" + raw.mkdir(parents=True, exist_ok=True) + sample_sheet_fp = full_miseq_dir / "sample_sheet.csv" + + backup_fastq( + full_miseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz", + raw, + sample_sheet_fp, + True, + 100, + ) + backup_fastq( + full_miseq_dir / "Undetermined_S0_L002_R1_001.fastq.gz", + raw, + sample_sheet_fp, + True, + 100, + ) + + with pytest.raises(FileNotFoundError): + backup_fastq( + full_miseq_dir / "Undetermined_S0_L003_R1_001.fastq.gz", + raw, + sample_sheet_fp, + True, + 100, + ) diff --git a/test/test_illumina.py b/test/test_illumina.py index 74a9014..6aa7939 100644 --- a/test/test_illumina.py +++ b/test/test_illumina.py @@ -1,198 +1,36 @@ -import unittest -import os import gzip -from io import StringIO - +import pytest +from pathlib import Path +from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE from seqBackupLib.illumina import IlluminaFastq -class IlluminaTests(unittest.TestCase): - - def test_illuminafastq(self): - ##miniseq - fastq_file = StringIO( - u"@NB551353:107:HWJFCAFX2:1:11101:23701:1033 1:N:0:CAAGACGTCC+NNNNTATACT") - fastq_filepath = ( - "incoming/210612_NB551353_0107_AHWJFCAFX2/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - folder_info = {"date":"210612", "instrument":"NB551353", "run_number":"107", "flowcell_id":"HWJFCAFX2", "lane":"1", "read_or_index":"R", "read":"1"} - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - ##Miseq - fastq_file = StringIO( - u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN") - fastq_filepath = ( - "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - folder_info = {"date":"160511", "instrument":"M03543", "run_number":"47", "flowcell_id":"000000000-APE6Y", "lane":"1", "read_or_index":"R", "read":"1"} - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - - self.assertEqual(fq.machine_type, "Illumina-MiSeq") - self.assertEqual(fq.date, "2016-05-11") - self.assertEqual(fq.lane, "1") - self.assertEqual(fq.filepath, fastq_filepath) - self.assertEqual(fq.run_name, "160511_M03543_0047_000000000-APE6Y") - - self.assertDictEqual(fq.folder_info, folder_info) - ##Hiseq - fastq_file = StringIO( - u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG") - fastq_filepath = ( - "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - folder_info = {"date":"170330", "instrument":"D00727", "run_number":"27", "flowcell_id":"CA7HHANXX", "lane":"1", "read_or_index":"R", "read":"1"} - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - - self.assertEqual(fq.machine_type, "Illumina-HiSeq") - self.assertEqual(fq.date, "2017-03-30") - self.assertEqual(fq.lane, "1") - self.assertEqual(fq.filepath, fastq_filepath) - self.assertEqual(fq.run_name, "170330_D00727_0027_ACA7HHANXX") - - self.assertDictEqual(fq.folder_info, folder_info) - - def test_fp_vs_content(self): - # check correct case for Miseq data - fastq_file = StringIO( - u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA") - fastq_filepath = ( - "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertTrue(fq.check_fp_vs_content()) - - # check correct case for Hiseq data - fastq_file = StringIO( - u"@D00727:27:CA7HHANXX:1:1105:1243:1992 
1:N:0:NGATCAGT+NNAAGGAG") - fastq_filepath = ( - "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertTrue(fq.check_fp_vs_content()) - - # case when the lane number doesn't match - fastq_file = StringIO( - u"@M04734:28:000000000-B2MVT:3:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA") - fastq_filepath = ( - "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertFalse(fq.check_fp_vs_content()) - - # case when the flow cell ID doesn't match - fastq_file = StringIO( - u"@M04734:28:000000000-BBBBB:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA") - fastq_filepath = ( - "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertFalse(fq.check_fp_vs_content()) - - # case when the machine doesn't match - fastq_file = StringIO( - u"@D04734:28:000000000-BBBBB:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA") - fastq_filepath = ( - "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertFalse(fq.check_fp_vs_content()) - - # case when the read doesn't match - ### important: It won't distinguish between R1 and I1. - fastq_file = StringIO( - u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA") - fastq_filepath = ( - "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R2_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertFalse(fq.check_fp_vs_content()) - - def test_check_index_read_exists(self): - # test passing - fastq_file = StringIO( - u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA") - fastq_filepath = ( - "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertTrue(fq.check_index_read_exists()) - - # test failing - fastq_file = StringIO( - u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:0") - fastq_filepath = ( - "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertFalse(fq.check_index_read_exists()) - - def test_build_archive_dir(self): - # for MiniSeq - fastq_file=StringIO( - u"@NB551353:107:HWJFCAFX2:1:11101:23701:1033 1:N:0:CAAGACGTCC+NNNNTATACT") - fastq_filepath = ( - "icoming/210621_A00901_0361_BHFWM2DRXY/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertEqual(fq.build_archive_dir(), "210621_A00901_0361_BHFWM2DRXY_L001") - - fastq_file = StringIO( - u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN") - fastq_filepath = ( - "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertEqual(fq.build_archive_dir(), "160511_M03543_0047_000000000-APE6Y_L001") - # for HiSeq - fastq_file = 
StringIO( - u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG") - fastq_filepath = ( - "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - self.assertEqual(fq.build_archive_dir(), "170330_D00727_0027_ACA7HHANXX_L001") +machine_fixtures = { + "A": "novaseq_dir", + "D": "hiseq_dir", + "LH": "novaseqx_dir", + "M": "miseq_dir", + "NB": "miniseq_dir", + "VH": "nextseq_dir", +} - def test_check_file_size(self): - curr_dir = os.path.dirname(os.path.abspath(__file__)) - fastq_filepath = os.path.join(curr_dir, "170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R1_001.fastq.gz") - fq = IlluminaFastq(gzip.open(fastq_filepath, mode = 'rt')) - self.assertTrue(fq.check_file_size(50)) - self.assertFalse(fq.check_file_size(50000)) - def test_is_same_run(self): - fastq_file = StringIO( - u"@NB551353:107:HWJFCAFX2:2:11101:23701:1033 1:N:0:CAAGACGTCC+NNNNTATACT") - fastq_filepath = ( - "incoming/210612_NB551353_0107_AHWJFCAFX2/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq = IlluminaFastq(fastq_file) - fastq_file.seek(0) - fq1 = IlluminaFastq(fastq_file) +@pytest.mark.parametrize("machine_type", IlluminaFastq.MACHINE_TYPES.keys()) +def test_illumina_fastq(machine_type, request): + fixture_name = machine_fixtures.get(machine_type) + if not fixture_name: + raise ValueError( + f"All supported machine types must be tested. Missing: {machine_type}" + ) - fastq_file = StringIO( - u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG") - fastq_filepath = ( - "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/" - "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz") - fastq_file.name = fastq_filepath - fq2 = IlluminaFastq(fastq_file) + fp = request.getfixturevalue(fixture_name) - self.assertTrue(fq.is_same_run(fq1)) - self.assertFalse(fq.is_same_run(fq2)) + with gzip.open(fp / "Undetermined_S0_L001_R1_001.fastq.gz", "rt") as f: + r1 = IlluminaFastq(f) -if __name__ == "__main__": - unittest.main() + print("FASTQ info: ", r1.fastq_info, "\nFolder info: ", r1.folder_info) + assert r1.machine_type == IlluminaFastq.MACHINE_TYPES[machine_type] + assert r1.check_fp_vs_content()[0], r1.check_fp_vs_content() + assert not r1.check_file_size(DEFAULT_MIN_FILE_SIZE) + assert r1.check_file_size(100) + assert r1.check_index_read_exists()
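The README's "Adding a new machine type" section says to follow the existing fixture pattern; below is a minimal sketch of what that could look like for a hypothetical machine code `XY`. The code, the `Illumina-NewSeq` label, the run-folder name, and the header fields are all invented for illustration, and the sketch reuses the `setup_illumina_dir` helper already defined in `test/conftest.py`.

```
# seqBackupLib/illumina.py: register the hypothetical code in MACHINE_TYPES,
# e.g. "XY": "Illumina-NewSeq".
# test/test_illumina.py: map it to a fixture in machine_fixtures,
# e.g. "XY": "newseq_dir".
# test/conftest.py: add the fixture itself, following the existing pattern.
import pytest
from pathlib import Path


@pytest.fixture
def newseq_dir(tmp_path) -> Path:
    # Folder name and header fields are kept consistent (instrument XY00001,
    # run number 7, flowcell ABCDEFGH1, lane 1, read 1) so that
    # IlluminaFastq.check_fp_vs_content() agrees between path and content.
    return setup_illumina_dir(  # helper defined earlier in test/conftest.py
        tmp_path / "250601_XY00001_0007_ABCDEFGH1",
        "Undetermined_S0_L001_R1_001.fastq.gz",
        [
            "@XY00001:7:ABCDEFGH1:1:1101:1000:1000 1:N:0:ACGTACGTAC+TGCATGCATG\n",
            "ACGT\n",
            "+\n",
            "IIII\n",
        ],
    )
```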