From d56f9dd52c335f8992281197fbe4e60ea66452a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?= Date: Fri, 19 Jun 2026 19:05:42 +0100 Subject: [PATCH 1/2] Add test suite and CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a pytest battery (32 tests) covering: - the per-segment metric helpers (identity, matched, query length, MD-tag detection) against hand-crafted segments with known values; - filterSAMbyIdentity / filterSAMbyPercentMatched retention at several cutoffs, on both SAM and BAM input, and that MD-less reads are dropped; - output-format selection (SAM stays text, BAM is binary, and the *output* extension wins over the input one) — guarding the single-process fixes; - default output-path naming for filenames containing 'sam'/'bam'; - filterSAM dispatch: invalid filter / out-of-range cutoff raising, and the single-vs-parallel routing, including that n_processes<=1 never invokes the splitting machinery (regression guard for #3), plus an end-to-end parallel run that matches the serial result. Adds a GitHub Actions workflow (conda + bioconda samtools/pysam, pip parallelbam) running pytest on push and PRs. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/tests.yml | 31 +++++++++++++ .gitignore | 1 + pytest.ini | 3 ++ tests/conftest.py | 89 +++++++++++++++++++++++++++++++++++++ tests/test_filter.py | 87 ++++++++++++++++++++++++++++++++++++ tests/test_filterSAM.py | 60 +++++++++++++++++++++++++ tests/test_metrics.py | 59 ++++++++++++++++++++++++ 7 files changed, 330 insertions(+) create mode 100644 .github/workflows/tests.yml create mode 100644 pytest.ini create mode 100644 tests/conftest.py create mode 100644 tests/test_filter.py create mode 100644 tests/test_filterSAM.py create mode 100644 tests/test_metrics.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..49091cb --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,31 @@ +name: tests + +on: + push: + branches: [main] + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -el {0} + steps: + - uses: actions/checkout@v4 + + - name: Set up conda environment + uses: conda-incubator/setup-miniconda@v3 + with: + channels: bioconda,conda-forge + channel-priority: strict + python-version: "3.10" + + - name: Install dependencies + run: | + conda install -y pysam samtools numpy pytest + python -m pip install parallelbam + python -m pip install -e . --no-deps + + - name: Run tests + run: python -m pytest diff --git a/.gitignore b/.gitignore index 84c3967..1ced7b7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ lib64 __pycache__ .ipynb_checkpoints +.pytest_cache diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..0bc5130 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +addopts = -ra diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..001ca22 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,89 @@ +""" +Shared fixtures for the filtersam test suite. + +A small, hand-crafted SAM file with segments whose percent-identity and +percent-matched values are known exactly is used throughout. The values were +verified against pysam's own ``get_aligned_pairs``/CIGAR parsing: + + name %identity %matched MD tag + read_perfect 100.0 100.0 MD:Z:10 + read_90id 90.0 90.0 MD:Z:5A4 + read_softclip 100.0 80.0 MD:Z:8 (CIGAR 2S8M) + read_70id 70.0 70.0 MD:Z:1A2C2A2 + read_nomd ---- ---- (no MD tag, always dropped) +""" + +import pysam +import pytest + +SAM_TEXT = """@HD\tVN:1.6\tSO:unsorted +@SQ\tSN:ref\tLN:100 +read_perfect\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tMD:Z:10 +read_90id\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTAGGTAC\t*\tMD:Z:5A4 +read_softclip\t0\tref\t1\t60\t2S8M\t*\t0\t0\tTTACGTACGT\t*\tMD:Z:8 +read_70id\t0\tref\t1\t60\t10M\t*\t0\t0\tAGGTGGTGAC\t*\tMD:Z:1A2C2A2 +read_nomd\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tNM:i:0 +""" + +# Expected segments retained for a given filter and cutoff. +IDENTITY_KEPT = { + 95.0: {"read_perfect", "read_softclip"}, + 90.0: {"read_perfect", "read_90id", "read_softclip"}, + 70.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"}, +} +MATCHED_KEPT = { + 100.0: {"read_perfect"}, + 85.0: {"read_perfect", "read_90id"}, + 50.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"}, +} + + +def _write_sam(path): + path.write_text(SAM_TEXT) + return path + + +def read_segment_names(path): + """Return the set of query names in a SAM/BAM file (format auto-detected).""" + save = pysam.set_verbosity(0) + with pysam.AlignmentFile(str(path), "r") as handle: + names = {seg.query_name for seg in handle} + pysam.set_verbosity(save) + return names + + +def detect_format(path): + """Return 'bam' if the file is BGZF/BAM-compressed, 'sam' if plain text.""" + with open(path, "rb") as handle: + magic = handle.read(2) + return "bam" if magic == b"\x1f\x8b" else "sam" + + +@pytest.fixture +def sam_path(tmp_path): + """A plain-text SAM fixture file.""" + return _write_sam(tmp_path / "sample.sam") + + +@pytest.fixture +def bam_path(tmp_path): + """A BAM fixture file built from the same records as ``sam_path``.""" + sam = _write_sam(tmp_path / "_src.sam") + bam = tmp_path / "sample.bam" + save = pysam.set_verbosity(0) + with pysam.AlignmentFile(str(sam), "r") as src: + with pysam.AlignmentFile(str(bam), "wb", template=src) as dst: + for seg in src: + dst.write(seg) + pysam.set_verbosity(save) + return bam + + +@pytest.fixture +def segments(sam_path): + """The parsed AlignedSegment objects, keyed by query name.""" + save = pysam.set_verbosity(0) + with pysam.AlignmentFile(str(sam_path), "r") as handle: + segs = {seg.query_name: seg for seg in handle} + pysam.set_verbosity(save) + return segs diff --git a/tests/test_filter.py b/tests/test_filter.py new file mode 100644 index 0000000..6494133 --- /dev/null +++ b/tests/test_filter.py @@ -0,0 +1,87 @@ +""" +Tests for the single-file filtering functions: +filterSAMbyIdentity and filterSAMbyPercentMatched. +""" + +from pathlib import Path + +import pytest + +from filtersam.filtersam import filterSAMbyIdentity, filterSAMbyPercentMatched + +from conftest import IDENTITY_KEPT, MATCHED_KEPT, detect_format, read_segment_names + + +@pytest.mark.parametrize("cutoff", sorted(IDENTITY_KEPT)) +def test_filter_by_identity_keeps_expected(sam_path, tmp_path, cutoff): + out = tmp_path / "out.sam" + filterSAMbyIdentity(sam_path, out, identity_cutoff=cutoff) + assert read_segment_names(out) == IDENTITY_KEPT[cutoff] + + +@pytest.mark.parametrize("cutoff", sorted(MATCHED_KEPT)) +def test_filter_by_matched_keeps_expected(sam_path, tmp_path, cutoff): + out = tmp_path / "out.sam" + filterSAMbyPercentMatched(sam_path, out, matched_cutoff=cutoff) + assert read_segment_names(out) == MATCHED_KEPT[cutoff] + + +def test_segments_without_md_tag_are_always_dropped(sam_path, tmp_path): + # Cutoff 0 keeps everything that has an MD tag, but never the MD-less read. + out = tmp_path / "out.sam" + filterSAMbyIdentity(sam_path, out, identity_cutoff=0.0) + assert "read_nomd" not in read_segment_names(out) + + +def test_works_on_bam_input(bam_path, tmp_path): + out = tmp_path / "out.bam" + filterSAMbyIdentity(bam_path, out, identity_cutoff=95.0) + assert read_segment_names(out) == IDENTITY_KEPT[95.0] + + +# --- Output format selection (regression guard for the single-process BAM fix) --- + +def test_output_sam_is_text(sam_path, tmp_path): + out = tmp_path / "out.sam" + filterSAMbyIdentity(sam_path, out, identity_cutoff=95.0) + assert detect_format(out) == "sam" + + +def test_output_bam_is_binary(sam_path, tmp_path): + out = tmp_path / "out.bam" + filterSAMbyIdentity(sam_path, out, identity_cutoff=95.0) + assert detect_format(out) == "bam" + + +def test_output_format_follows_output_extension_not_input(bam_path, tmp_path): + # BAM input but a .sam output request must yield text SAM. + out = tmp_path / "out.sam" + filterSAMbyIdentity(bam_path, out, identity_cutoff=95.0) + assert detect_format(out) == "sam" + assert read_segment_names(out) == IDENTITY_KEPT[95.0] + + +# --- Default output path naming (regression guard for the suffix fix) --- + +def test_default_output_path_naming(tmp_path): + # A filename containing 'sam' before the extension used to break the old + # regex-based extension detection; the suffix-based logic handles it. + src = tmp_path / "mysample.bam" + # Build a tiny BAM from the SAM fixture text. + from conftest import SAM_TEXT + import pysam + sam = tmp_path / "_seed.sam" + sam.write_text(SAM_TEXT) + save = pysam.set_verbosity(0) + with pysam.AlignmentFile(str(sam), "r") as s, \ + pysam.AlignmentFile(str(src), "wb", template=s) as d: + for seg in s: + d.write(seg) + pysam.set_verbosity(save) + + filterSAMbyIdentity(src, identity_cutoff=95.0) + + expected = tmp_path / "mysample.identity_filtered_at_95.0.bam" + assert expected.is_file() + assert detect_format(expected) == "bam" + assert read_segment_names(expected) == IDENTITY_KEPT[95.0] diff --git a/tests/test_filterSAM.py b/tests/test_filterSAM.py new file mode 100644 index 0000000..316bef9 --- /dev/null +++ b/tests/test_filterSAM.py @@ -0,0 +1,60 @@ +""" +Tests for the filterSAM dispatcher: argument validation and the +single-vs-parallel routing logic. +""" + +import pytest + +from filtersam import filtersam as fs +from filtersam.filtersam import filterSAM + +from conftest import IDENTITY_KEPT, read_segment_names + + +def test_invalid_filter_by_raises(sam_path, tmp_path): + with pytest.raises(ValueError): + filterSAM(sam_path, tmp_path / "out.sam", filter_by="nonsense", cutoff=95.0) + + +@pytest.mark.parametrize("cutoff", [-1.0, 100.1, 1000.0]) +def test_out_of_range_cutoff_raises(sam_path, tmp_path, cutoff): + with pytest.raises(ValueError): + filterSAM(sam_path, tmp_path / "out.sam", filter_by="identity", cutoff=cutoff) + + +def test_none_processes_uses_single_path(bam_path, tmp_path, monkeypatch): + calls = [] + monkeypatch.setattr(fs, "parallelizeBAMoperation", + lambda *a, **k: calls.append((a, k))) + out = tmp_path / "out.bam" + filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=None) + assert calls == [] + assert read_segment_names(out) == IDENTITY_KEPT[95.0] + + +def test_single_process_does_not_split(bam_path, tmp_path, monkeypatch): + # Regression guard for #3 / PR #6: `-p 1` must take the direct path and + # never invoke the (expensive) parallel splitting machinery. + calls = [] + monkeypatch.setattr(fs, "parallelizeBAMoperation", + lambda *a, **k: calls.append((a, k))) + out = tmp_path / "out.bam" + filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=1) + assert calls == [], "n_processes=1 should not call parallelizeBAMoperation" + assert read_segment_names(out) == IDENTITY_KEPT[95.0] + + +def test_multiple_processes_use_parallel_path(bam_path, tmp_path, monkeypatch): + calls = [] + monkeypatch.setattr(fs, "parallelizeBAMoperation", + lambda *a, **k: calls.append((a, k))) + out = tmp_path / "out.bam" + filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=2) + assert len(calls) == 1, "n_processes>1 should call parallelizeBAMoperation" + + +def test_parallel_run_matches_serial(bam_path, tmp_path): + # End-to-end parallel run (uses samtools-backed splitting from parallelbam). + out = tmp_path / "out_parallel.bam" + filterSAM(bam_path, out, filter_by="identity", cutoff=70.0, n_processes=2) + assert read_segment_names(out) == IDENTITY_KEPT[70.0] diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 0000000..40d1c11 --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,59 @@ +""" +Unit tests for the per-segment metric helpers in filtersam.filtersam. +""" + +import pytest + +from filtersam import filtersam as fs + + +def test_has_md_tag(segments): + assert fs.has_MD_tag(segments["read_perfect"]) + assert not fs.has_MD_tag(segments["read_nomd"]) + + +def test_sum_matches_and_mismatches(segments): + # Sum of CIGAR M lengths. + assert fs.sumMatchesAndMismatches(segments["read_perfect"]) == 10 + assert fs.sumMatchesAndMismatches(segments["read_90id"]) == 10 + # Soft-clipped bases (2S) do not count towards M. + assert fs.sumMatchesAndMismatches(segments["read_softclip"]) == 8 + + +def test_get_number_of_matches(segments): + assert fs.getNumberOfMatches(segments["read_perfect"]) == 10 + assert fs.getNumberOfMatches(segments["read_90id"]) == 9 + assert fs.getNumberOfMatches(segments["read_softclip"]) == 8 + assert fs.getNumberOfMatches(segments["read_70id"]) == 7 + + +def test_get_query_length(segments): + # M + I + S + = + X consume query; soft clip is included. + assert fs.getQueryLength(segments["read_perfect"]) == 10 + assert fs.getQueryLength(segments["read_softclip"]) == 10 + + +@pytest.mark.parametrize( + "name,expected", + [ + ("read_perfect", 100.0), + ("read_90id", 90.0), + ("read_softclip", 100.0), + ("read_70id", 70.0), + ], +) +def test_percent_identity(segments, name, expected): + assert fs.percent_identity(segments[name]) == pytest.approx(expected) + + +@pytest.mark.parametrize( + "name,expected", + [ + ("read_perfect", 100.0), + ("read_90id", 90.0), + ("read_softclip", 80.0), + ("read_70id", 70.0), + ], +) +def test_percent_matched(segments, name, expected): + assert fs.percent_matched(segments[name]) == pytest.approx(expected) From 7343a55c29a707337140a95af505e8554554d719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?= Date: Fri, 19 Jun 2026 19:15:29 +0100 Subject: [PATCH 2/2] CI: install deps via pip + apt instead of conda The conda-based workflow failed to solve: with channel-priority strict and the action's implicit `defaults` channel, the solver pulled ancient `defaults` pysam builds (pysam 0.7.7 needs python <3.0) and hit a libdeflate conflict. Switch to actions/setup-python + apt samtools + pip (pysam, numpy, pytest, parallelbam all have wheels / are pure-python). Simpler, faster, and avoids the conda channel/ToS mess. Verified locally against the PyPI pysam 0.24.0 wheel: 32 passed. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/tests.yml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 49091cb..c5f508a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,23 +8,22 @@ on: jobs: test: runs-on: ubuntu-latest - defaults: - run: - shell: bash -el {0} steps: - uses: actions/checkout@v4 - - name: Set up conda environment - uses: conda-incubator/setup-miniconda@v3 + - uses: actions/setup-python@v5 with: - channels: bioconda,conda-forge - channel-priority: strict python-version: "3.10" + - name: Install samtools + run: | + sudo apt-get update + sudo apt-get install -y samtools + - name: Install dependencies run: | - conda install -y pysam samtools numpy pytest - python -m pip install parallelbam + python -m pip install --upgrade pip + python -m pip install pysam numpy pytest parallelbam python -m pip install -e . --no-deps - name: Run tests