From 9cfde0e5e0a3915d9c94a1ab070ddbf9ff862f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?= Date: Fri, 19 Jun 2026 19:26:05 +0100 Subject: [PATCH 1/3] Skip parallelbam name-sort in parallel mode (closes #3) The identity and matched filters evaluate each segment independently, so the parallel chunks do not need reads grouped by query name. Pass sort_by_name=False to parallelizeBAMoperation so it skips the full serial samtools name-sort that dominated runtime on large files and made parallel mode slower than single-processor mode. Requires the sort_by_name option added in parallelbam 0.0.20, so: - bump the dependency to parallelbam>=0.0.20 (setup.py, requirements.txt); - CI installs parallelbam from the repo until 0.0.20 is on PyPI. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/tests.yml | 5 ++++- filtersam/filtersam.py | 6 +++++- requirements.txt | 2 +- setup.py | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c5f508a..343e4f9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,7 +23,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install pysam numpy pytest parallelbam + python -m pip install pysam numpy pytest + # parallelbam>=0.0.20 (sort_by_name support) is not on PyPI yet; + # install from the repo until it is released, then switch back to PyPI. + python -m pip install "parallelbam @ git+https://github.com/Robaina/parallelBAM.git@main" python -m pip install -e . --no-deps - name: Run tests diff --git a/filtersam/filtersam.py b/filtersam/filtersam.py index ed817fb..db64c21 100644 --- a/filtersam/filtersam.py +++ b/filtersam/filtersam.py @@ -173,6 +173,10 @@ def filterSAM(input_path: Path, output_path: Path = None, print('Converting sam file to bam for processing') input_path = Path(input_path.as_posix().replace('.sam', '.bam')) sam2bam(input_path) + # Both filters evaluate each segment independently, so the chunks do + # not need reads grouped by query name. Skipping parallelbam's + # name-sort avoids a full serial pass over the file (see #3). parallelizeBAMoperation(path_to_bam=input_path.as_posix(), callback=filter_method, callback_additional_args=[cutoff], - n_processes=n_processes, output_path=output_path.as_posix()) + n_processes=n_processes, output_path=output_path.as_posix(), + sort_by_name=False) diff --git a/requirements.txt b/requirements.txt index d3f7e3d..692d836 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy==1.21.2 pysam==0.16.0.1 -parallelbam==0.0.12 +parallelbam>=0.0.20 diff --git a/setup.py b/setup.py index 619339b..dc5eb11 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ download_url=DOWNLOAD_URL, license=LICENSE, packages=find_packages(), - install_requires=['numpy', 'pysam', 'parallelbam'], + install_requires=['numpy', 'pysam', 'parallelbam>=0.0.20'], entry_points ={ 'console_scripts': [ 'filtersam = filtersam.cli:main' From f87db720d7b23803914ad2d570b0dc960da3acf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?= Date: Fri, 19 Jun 2026 22:56:46 +0100 Subject: [PATCH 2/3] Rename distribution to filter-sam + add PyPI publish workflow Re-publishing under a new PyPI account requires a new distribution name (the old `filtersam` name belongs to the previous account). The import name is unchanged (`import filtersam`); only `pip install` changes to `filter-sam`. The parallelbam dependency is likewise renamed to `parallel-bam>=0.0.20`. - setup.py: NAME -> 'filter-sam'; dependency -> 'parallel-bam>=0.0.20'. - requirements.txt: parallel-bam>=0.0.20. - pyproject.toml: declare the setuptools build backend (PEP 517). - publish.yml: build + publish to PyPI via Trusted Publishing (OIDC) on GitHub Release. - tests.yml: install parallel-bam from the parallelBAM repo without asserting the distribution name, so CI is robust to the rename until parallel-bam 0.0.20 is on PyPI. Verified locally: builds as filter_sam-0.0.11 (Requires-Dist parallel-bam>=0.0.20), twine check passes, 32 tests pass. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/publish.yml | 44 +++++++++++++++++++++++++++++++++++ .github/workflows/tests.yml | 7 +++--- pyproject.toml | 3 +++ requirements.txt | 2 +- setup.py | 4 ++-- 5 files changed, 54 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/publish.yml create mode 100644 pyproject.toml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..76ccd84 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,44 @@ +name: publish + +# Publishes the package to PyPI when a GitHub Release is published. +# Uses PyPI Trusted Publishing (OIDC) — no API token is stored in the repo. +# The matching publisher must be configured on PyPI for project "filter-sam" +# with this workflow filename (publish.yml) and environment name (pypi). + +on: + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Build sdist and wheel + run: | + python -m pip install --upgrade build + python -m build + + - uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + needs: build + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write # required for Trusted Publishing + steps: + - uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 343e4f9..1332464 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,9 +24,10 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install pysam numpy pytest - # parallelbam>=0.0.20 (sort_by_name support) is not on PyPI yet; - # install from the repo until it is released, then switch back to PyPI. - python -m pip install "parallelbam @ git+https://github.com/Robaina/parallelBAM.git@main" + # parallel-bam>=0.0.20 (sort_by_name support) is not on PyPI yet; + # install from the repo until it is released, then switch to: + # python -m pip install "parallel-bam>=0.0.20" + python -m pip install "git+https://github.com/Robaina/parallelBAM.git@main" python -m pip install -e . --no-deps - name: Run tests diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4a85092 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=61", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 692d836..0ecfce7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy==1.21.2 pysam==0.16.0.1 -parallelbam>=0.0.20 +parallel-bam>=0.0.20 diff --git a/setup.py b/setup.py index dc5eb11..4c1548b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ DESCRIPTION = 'Tools to filter sam o bam files by percent identity or percent of matched sequence' LONG_DESCRIPTION = long_description LONG_DESCRIPTION_CONTENT_TYPE = 'text/markdown' -NAME = 'filtersam' +NAME = 'filter-sam' AUTHOR = "Semidán Robaina Estévez, 2021-2022" AUTHOR_EMAIL = "srobaina@ull.edu.es" MAINTAINER = "Semidán Robaina Estévez" @@ -32,7 +32,7 @@ download_url=DOWNLOAD_URL, license=LICENSE, packages=find_packages(), - install_requires=['numpy', 'pysam', 'parallelbam>=0.0.20'], + install_requires=['numpy', 'pysam', 'parallel-bam>=0.0.20'], entry_points ={ 'console_scripts': [ 'filtersam = filtersam.cli:main' From fd6c53f5e7f53fc41281a195d2b21df103fad5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?= Date: Fri, 19 Jun 2026 23:08:43 +0100 Subject: [PATCH 3/3] Use distribution names filtersam-tools / parallelbam-tools (filter-sam/parallel-bam too similar to existing PyPI projects) Co-Authored-By: Claude Opus 4.8 --- .github/workflows/publish.yml | 2 +- .github/workflows/tests.yml | 4 ++-- requirements.txt | 2 +- setup.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 76ccd84..cc9da72 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -2,7 +2,7 @@ name: publish # Publishes the package to PyPI when a GitHub Release is published. # Uses PyPI Trusted Publishing (OIDC) — no API token is stored in the repo. -# The matching publisher must be configured on PyPI for project "filter-sam" +# The matching publisher must be configured on PyPI for project "filtersam-tools" # with this workflow filename (publish.yml) and environment name (pypi). on: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1332464..75a701c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,9 +24,9 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install pysam numpy pytest - # parallel-bam>=0.0.20 (sort_by_name support) is not on PyPI yet; + # parallelbam-tools>=0.0.20 (sort_by_name support) is not on PyPI yet; # install from the repo until it is released, then switch to: - # python -m pip install "parallel-bam>=0.0.20" + # python -m pip install "parallelbam-tools>=0.0.20" python -m pip install "git+https://github.com/Robaina/parallelBAM.git@main" python -m pip install -e . --no-deps diff --git a/requirements.txt b/requirements.txt index 0ecfce7..82aa511 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy==1.21.2 pysam==0.16.0.1 -parallel-bam>=0.0.20 +parallelbam-tools>=0.0.20 diff --git a/setup.py b/setup.py index 4c1548b..fba8cee 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ DESCRIPTION = 'Tools to filter sam o bam files by percent identity or percent of matched sequence' LONG_DESCRIPTION = long_description LONG_DESCRIPTION_CONTENT_TYPE = 'text/markdown' -NAME = 'filter-sam' +NAME = 'filtersam-tools' AUTHOR = "Semidán Robaina Estévez, 2021-2022" AUTHOR_EMAIL = "srobaina@ull.edu.es" MAINTAINER = "Semidán Robaina Estévez" @@ -32,7 +32,7 @@ download_url=DOWNLOAD_URL, license=LICENSE, packages=find_packages(), - install_requires=['numpy', 'pysam', 'parallel-bam>=0.0.20'], + install_requires=['numpy', 'pysam', 'parallelbam-tools>=0.0.20'], entry_points ={ 'console_scripts': [ 'filtersam = filtersam.cli:main'