From d56f9dd52c335f8992281197fbe4e60ea66452a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?=
 <semidan.robaina@gmail.com>
Date: Fri, 19 Jun 2026 19:05:42 +0100
Subject: [PATCH 1/2] Add test suite and CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a pytest battery (32 tests) covering:
- the per-segment metric helpers (identity, matched, query length, MD-tag
  detection) against hand-crafted segments with known values;
- filterSAMbyIdentity / filterSAMbyPercentMatched retention at several
  cutoffs, on both SAM and BAM input, and that MD-less reads are dropped;
- output-format selection (SAM stays text, BAM is binary, and the *output*
  extension wins over the input one) — guarding the single-process fixes;
- default output-path naming for filenames containing 'sam'/'bam';
- filterSAM dispatch: invalid filter / out-of-range cutoff raising, and the
  single-vs-parallel routing, including that n_processes<=1 never invokes the
  splitting machinery (regression guard for #3), plus an end-to-end parallel
  run that matches the serial result.

Adds a GitHub Actions workflow (conda + bioconda samtools/pysam, pip
parallelbam) running pytest on push and PRs.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/tests.yml | 31 +++++++++++++
 .gitignore                  |  1 +
 pytest.ini                  |  3 ++
 tests/conftest.py           | 89 +++++++++++++++++++++++++++++++++++++
 tests/test_filter.py        | 87 ++++++++++++++++++++++++++++++++++++
 tests/test_filterSAM.py     | 60 +++++++++++++++++++++++++
 tests/test_metrics.py       | 59 ++++++++++++++++++++++++
 7 files changed, 330 insertions(+)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 pytest.ini
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_filter.py
 create mode 100644 tests/test_filterSAM.py
 create mode 100644 tests/test_metrics.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..49091cb
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,31 @@
+name: tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -el {0}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up conda environment
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          channels: bioconda,conda-forge
+          channel-priority: strict
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          conda install -y pysam samtools numpy pytest
+          python -m pip install parallelbam
+          python -m pip install -e . --no-deps
+
+      - name: Run tests
+        run: python -m pytest
diff --git a/.gitignore b/.gitignore
index 84c3967..1ced7b7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ lib64
 
 __pycache__
 .ipynb_checkpoints
+.pytest_cache
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..0bc5130
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+addopts = -ra
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..001ca22
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,89 @@
+"""
+Shared fixtures for the filtersam test suite.
+
+A small, hand-crafted SAM file with segments whose percent-identity and
+percent-matched values are known exactly is used throughout. The values were
+verified against pysam's own ``get_aligned_pairs``/CIGAR parsing:
+
+    name            %identity   %matched   MD tag
+    read_perfect       100.0      100.0     MD:Z:10
+    read_90id           90.0       90.0     MD:Z:5A4
+    read_softclip      100.0       80.0     MD:Z:8     (CIGAR 2S8M)
+    read_70id           70.0       70.0     MD:Z:1A2C2A2
+    read_nomd           ----       ----     (no MD tag, always dropped)
+"""
+
+import pysam
+import pytest
+
+SAM_TEXT = """@HD\tVN:1.6\tSO:unsorted
+@SQ\tSN:ref\tLN:100
+read_perfect\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tMD:Z:10
+read_90id\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTAGGTAC\t*\tMD:Z:5A4
+read_softclip\t0\tref\t1\t60\t2S8M\t*\t0\t0\tTTACGTACGT\t*\tMD:Z:8
+read_70id\t0\tref\t1\t60\t10M\t*\t0\t0\tAGGTGGTGAC\t*\tMD:Z:1A2C2A2
+read_nomd\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tNM:i:0
+"""
+
+# Expected segments retained for a given filter and cutoff.
+IDENTITY_KEPT = {
+    95.0: {"read_perfect", "read_softclip"},
+    90.0: {"read_perfect", "read_90id", "read_softclip"},
+    70.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"},
+}
+MATCHED_KEPT = {
+    100.0: {"read_perfect"},
+    85.0: {"read_perfect", "read_90id"},
+    50.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"},
+}
+
+
+def _write_sam(path):
+    path.write_text(SAM_TEXT)
+    return path
+
+
+def read_segment_names(path):
+    """Return the set of query names in a SAM/BAM file (format auto-detected)."""
+    save = pysam.set_verbosity(0)
+    with pysam.AlignmentFile(str(path), "r") as handle:
+        names = {seg.query_name for seg in handle}
+    pysam.set_verbosity(save)
+    return names
+
+
+def detect_format(path):
+    """Return 'bam' if the file is BGZF/BAM-compressed, 'sam' if plain text."""
+    with open(path, "rb") as handle:
+        magic = handle.read(2)
+    return "bam" if magic == b"\x1f\x8b" else "sam"
+
+
+@pytest.fixture
+def sam_path(tmp_path):
+    """A plain-text SAM fixture file."""
+    return _write_sam(tmp_path / "sample.sam")
+
+
+@pytest.fixture
+def bam_path(tmp_path):
+    """A BAM fixture file built from the same records as ``sam_path``."""
+    sam = _write_sam(tmp_path / "_src.sam")
+    bam = tmp_path / "sample.bam"
+    save = pysam.set_verbosity(0)
+    with pysam.AlignmentFile(str(sam), "r") as src:
+        with pysam.AlignmentFile(str(bam), "wb", template=src) as dst:
+            for seg in src:
+                dst.write(seg)
+    pysam.set_verbosity(save)
+    return bam
+
+
+@pytest.fixture
+def segments(sam_path):
+    """The parsed AlignedSegment objects, keyed by query name."""
+    save = pysam.set_verbosity(0)
+    with pysam.AlignmentFile(str(sam_path), "r") as handle:
+        segs = {seg.query_name: seg for seg in handle}
+    pysam.set_verbosity(save)
+    return segs
diff --git a/tests/test_filter.py b/tests/test_filter.py
new file mode 100644
index 0000000..6494133
--- /dev/null
+++ b/tests/test_filter.py
@@ -0,0 +1,87 @@
+"""
+Tests for the single-file filtering functions:
+filterSAMbyIdentity and filterSAMbyPercentMatched.
+"""
+
+from pathlib import Path
+
+import pytest
+
+from filtersam.filtersam import filterSAMbyIdentity, filterSAMbyPercentMatched
+
+from conftest import IDENTITY_KEPT, MATCHED_KEPT, detect_format, read_segment_names
+
+
+@pytest.mark.parametrize("cutoff", sorted(IDENTITY_KEPT))
+def test_filter_by_identity_keeps_expected(sam_path, tmp_path, cutoff):
+    out = tmp_path / "out.sam"
+    filterSAMbyIdentity(sam_path, out, identity_cutoff=cutoff)
+    assert read_segment_names(out) == IDENTITY_KEPT[cutoff]
+
+
+@pytest.mark.parametrize("cutoff", sorted(MATCHED_KEPT))
+def test_filter_by_matched_keeps_expected(sam_path, tmp_path, cutoff):
+    out = tmp_path / "out.sam"
+    filterSAMbyPercentMatched(sam_path, out, matched_cutoff=cutoff)
+    assert read_segment_names(out) == MATCHED_KEPT[cutoff]
+
+
+def test_segments_without_md_tag_are_always_dropped(sam_path, tmp_path):
+    # Cutoff 0 keeps everything that has an MD tag, but never the MD-less read.
+    out = tmp_path / "out.sam"
+    filterSAMbyIdentity(sam_path, out, identity_cutoff=0.0)
+    assert "read_nomd" not in read_segment_names(out)
+
+
+def test_works_on_bam_input(bam_path, tmp_path):
+    out = tmp_path / "out.bam"
+    filterSAMbyIdentity(bam_path, out, identity_cutoff=95.0)
+    assert read_segment_names(out) == IDENTITY_KEPT[95.0]
+
+
+# --- Output format selection (regression guard for the single-process BAM fix) ---
+
+def test_output_sam_is_text(sam_path, tmp_path):
+    out = tmp_path / "out.sam"
+    filterSAMbyIdentity(sam_path, out, identity_cutoff=95.0)
+    assert detect_format(out) == "sam"
+
+
+def test_output_bam_is_binary(sam_path, tmp_path):
+    out = tmp_path / "out.bam"
+    filterSAMbyIdentity(sam_path, out, identity_cutoff=95.0)
+    assert detect_format(out) == "bam"
+
+
+def test_output_format_follows_output_extension_not_input(bam_path, tmp_path):
+    # BAM input but a .sam output request must yield text SAM.
+    out = tmp_path / "out.sam"
+    filterSAMbyIdentity(bam_path, out, identity_cutoff=95.0)
+    assert detect_format(out) == "sam"
+    assert read_segment_names(out) == IDENTITY_KEPT[95.0]
+
+
+# --- Default output path naming (regression guard for the suffix fix) ---
+
+def test_default_output_path_naming(tmp_path):
+    # A filename containing 'sam' before the extension used to break the old
+    # regex-based extension detection; the suffix-based logic handles it.
+    src = tmp_path / "mysample.bam"
+    # Build a tiny BAM from the SAM fixture text.
+    from conftest import SAM_TEXT
+    import pysam
+    sam = tmp_path / "_seed.sam"
+    sam.write_text(SAM_TEXT)
+    save = pysam.set_verbosity(0)
+    with pysam.AlignmentFile(str(sam), "r") as s, \
+            pysam.AlignmentFile(str(src), "wb", template=s) as d:
+        for seg in s:
+            d.write(seg)
+    pysam.set_verbosity(save)
+
+    filterSAMbyIdentity(src, identity_cutoff=95.0)
+
+    expected = tmp_path / "mysample.identity_filtered_at_95.0.bam"
+    assert expected.is_file()
+    assert detect_format(expected) == "bam"
+    assert read_segment_names(expected) == IDENTITY_KEPT[95.0]
diff --git a/tests/test_filterSAM.py b/tests/test_filterSAM.py
new file mode 100644
index 0000000..316bef9
--- /dev/null
+++ b/tests/test_filterSAM.py
@@ -0,0 +1,60 @@
+"""
+Tests for the filterSAM dispatcher: argument validation and the
+single-vs-parallel routing logic.
+"""
+
+import pytest
+
+from filtersam import filtersam as fs
+from filtersam.filtersam import filterSAM
+
+from conftest import IDENTITY_KEPT, read_segment_names
+
+
+def test_invalid_filter_by_raises(sam_path, tmp_path):
+    with pytest.raises(ValueError):
+        filterSAM(sam_path, tmp_path / "out.sam", filter_by="nonsense", cutoff=95.0)
+
+
+@pytest.mark.parametrize("cutoff", [-1.0, 100.1, 1000.0])
+def test_out_of_range_cutoff_raises(sam_path, tmp_path, cutoff):
+    with pytest.raises(ValueError):
+        filterSAM(sam_path, tmp_path / "out.sam", filter_by="identity", cutoff=cutoff)
+
+
+def test_none_processes_uses_single_path(bam_path, tmp_path, monkeypatch):
+    calls = []
+    monkeypatch.setattr(fs, "parallelizeBAMoperation",
+                        lambda *a, **k: calls.append((a, k)))
+    out = tmp_path / "out.bam"
+    filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=None)
+    assert calls == []
+    assert read_segment_names(out) == IDENTITY_KEPT[95.0]
+
+
+def test_single_process_does_not_split(bam_path, tmp_path, monkeypatch):
+    # Regression guard for #3 / PR #6: `-p 1` must take the direct path and
+    # never invoke the (expensive) parallel splitting machinery.
+    calls = []
+    monkeypatch.setattr(fs, "parallelizeBAMoperation",
+                        lambda *a, **k: calls.append((a, k)))
+    out = tmp_path / "out.bam"
+    filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=1)
+    assert calls == [], "n_processes=1 should not call parallelizeBAMoperation"
+    assert read_segment_names(out) == IDENTITY_KEPT[95.0]
+
+
+def test_multiple_processes_use_parallel_path(bam_path, tmp_path, monkeypatch):
+    calls = []
+    monkeypatch.setattr(fs, "parallelizeBAMoperation",
+                        lambda *a, **k: calls.append((a, k)))
+    out = tmp_path / "out.bam"
+    filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=2)
+    assert len(calls) == 1, "n_processes>1 should call parallelizeBAMoperation"
+
+
+def test_parallel_run_matches_serial(bam_path, tmp_path):
+    # End-to-end parallel run (uses samtools-backed splitting from parallelbam).
+    out = tmp_path / "out_parallel.bam"
+    filterSAM(bam_path, out, filter_by="identity", cutoff=70.0, n_processes=2)
+    assert read_segment_names(out) == IDENTITY_KEPT[70.0]
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
new file mode 100644
index 0000000..40d1c11
--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,59 @@
+"""
+Unit tests for the per-segment metric helpers in filtersam.filtersam.
+"""
+
+import pytest
+
+from filtersam import filtersam as fs
+
+
+def test_has_md_tag(segments):
+    assert fs.has_MD_tag(segments["read_perfect"])
+    assert not fs.has_MD_tag(segments["read_nomd"])
+
+
+def test_sum_matches_and_mismatches(segments):
+    # Sum of CIGAR M lengths.
+    assert fs.sumMatchesAndMismatches(segments["read_perfect"]) == 10
+    assert fs.sumMatchesAndMismatches(segments["read_90id"]) == 10
+    # Soft-clipped bases (2S) do not count towards M.
+    assert fs.sumMatchesAndMismatches(segments["read_softclip"]) == 8
+
+
+def test_get_number_of_matches(segments):
+    assert fs.getNumberOfMatches(segments["read_perfect"]) == 10
+    assert fs.getNumberOfMatches(segments["read_90id"]) == 9
+    assert fs.getNumberOfMatches(segments["read_softclip"]) == 8
+    assert fs.getNumberOfMatches(segments["read_70id"]) == 7
+
+
+def test_get_query_length(segments):
+    # M + I + S + = + X consume query; soft clip is included.
+    assert fs.getQueryLength(segments["read_perfect"]) == 10
+    assert fs.getQueryLength(segments["read_softclip"]) == 10
+
+
+@pytest.mark.parametrize(
+    "name,expected",
+    [
+        ("read_perfect", 100.0),
+        ("read_90id", 90.0),
+        ("read_softclip", 100.0),
+        ("read_70id", 70.0),
+    ],
+)
+def test_percent_identity(segments, name, expected):
+    assert fs.percent_identity(segments[name]) == pytest.approx(expected)
+
+
+@pytest.mark.parametrize(
+    "name,expected",
+    [
+        ("read_perfect", 100.0),
+        ("read_90id", 90.0),
+        ("read_softclip", 80.0),
+        ("read_70id", 70.0),
+    ],
+)
+def test_percent_matched(segments, name, expected):
+    assert fs.percent_matched(segments[name]) == pytest.approx(expected)

From 7343a55c29a707337140a95af505e8554554d719 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?=
 <semidan.robaina@gmail.com>
Date: Fri, 19 Jun 2026 19:15:29 +0100
Subject: [PATCH 2/2] CI: install deps via pip + apt instead of conda

The conda-based workflow failed to solve: with channel-priority strict
and the action's implicit `defaults` channel, the solver pulled ancient
`defaults` pysam builds (pysam 0.7.7 needs python <3.0) and hit a
libdeflate conflict.

Switch to actions/setup-python + apt samtools + pip (pysam, numpy,
pytest, parallelbam all have wheels / are pure-python). Simpler, faster,
and avoids the conda channel/ToS mess. Verified locally against the
PyPI pysam 0.24.0 wheel: 32 passed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/tests.yml | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 49091cb..c5f508a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,23 +8,22 @@ on:
 jobs:
   test:
     runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: bash -el {0}
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up conda environment
-        uses: conda-incubator/setup-miniconda@v3
+      - uses: actions/setup-python@v5
         with:
-          channels: bioconda,conda-forge
-          channel-priority: strict
           python-version: "3.10"
 
+      - name: Install samtools
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y samtools
+
       - name: Install dependencies
         run: |
-          conda install -y pysam samtools numpy pytest
-          python -m pip install parallelbam
+          python -m pip install --upgrade pip
+          python -m pip install pysam numpy pytest parallelbam
           python -m pip install -e . --no-deps
 
       - name: Run tests