From e091f93a646b48c019d9be31d1d41846054bca1e Mon Sep 17 00:00:00 2001 From: Elarwei Date: Wed, 24 Jun 2026 23:10:29 +0800 Subject: [PATCH 1/2] feat(ucsc): add gget ucsc module to fetch UCSC IDs (#18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New module `gget ucsc` searches the UCSC Genome Browser REST API for a gene symbol, accession, or term and returns the matching UCSC identifiers (e.g. known gene / transcript IDs) with their genomic positions, grouped by track — analogous to gget search for Ensembl. Supports filtering by genome, track, and limit. Exposed via the Python API and the command line. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/src/SUMMARY.md | 1 + docs/src/en/ucsc.md | 58 +++++++++++ docs/src/en/updates.md | 1 + gget/__init__.py | 1 + gget/constants.py | 3 + gget/gget_ucsc.py | 180 ++++++++++++++++++++++++++++++++++ gget/main.py | 101 +++++++++++++++++++ tests/fixtures/test_ucsc.json | 10 ++ tests/test_ucsc.py | 119 ++++++++++++++++++++++ 9 files changed, 474 insertions(+) create mode 100644 docs/src/en/ucsc.md create mode 100644 gget/gget_ucsc.py create mode 100644 tests/fixtures/test_ucsc.json create mode 100644 tests/test_ucsc.py diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index ca34b5f9..79a4bf9d 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -31,6 +31,7 @@ * [gget search](en/search.md) * [gget setup](en/setup.md) * [gget seq](en/seq.md) +* [gget ucsc](en/ucsc.md) * [gget virus](en/virus.md) --- diff --git a/docs/src/en/ucsc.md b/docs/src/en/ucsc.md new file mode 100644 index 00000000..36973a5c --- /dev/null +++ b/docs/src/en/ucsc.md @@ -0,0 +1,58 @@ +[ View page source on GitHub ](https://github.com/scverse/gget/blob/main/docs/src/en/ucsc.md) + +> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. +# gget ucsc 🔎 +Fetch [UCSC Genome Browser](https://genome.ucsc.edu/) IDs for a gene or term, similar to `gget search` for Ensembl. +`gget ucsc` searches the UCSC Genome Browser for a gene symbol, accession, or free-text term and returns the matching identifiers (e.g. UCSC known gene / transcript IDs) together with their genomic positions, grouped by the track they come from. +Return format: JSON (command-line) or data frame/CSV (Python). + +**Positional argument** +`search_term` +Gene symbol, accession, or free-text term to search for, e.g. `BRCA2`. + +**Optional arguments** +`-g` `--genome` +UCSC genome assembly to search, e.g. `hg38`, `hg19`, `mm39`. Default: `hg38`. + +`-t` `--track` +Only return matches from tracks whose name contains this (case-insensitive) substring, e.g. `knownGene`. Default: None. + +`-l` `--limit` +Maximum number of matches to return. Default: None (all matches). + +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. +Python: `save=True` will save the output in the current working directory. + +**Flags** +`-csv` `--csv` +Command-line only. Returns results in CSV format. +Python: Use `json=True` to return output in JSON format. + +`-q` `--quiet` +Command-line only. Prevents progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. + +### Example +```bash +gget ucsc BRCA2 --genome hg38 --track knownGene +``` +```python +# Python +gget.ucsc("BRCA2", genome="hg38", track="knownGene") +``` +→ Returns the UCSC IDs matching the search term, with their genomic positions. + +| track | ucsc_id | chrom | start | end | name | description | +| --- | --- | --- | --- | --- | --- | --- | +| knownGene | ENST00000380152.8 | chr13 | 32315508 | 32400268 | BRCA2 (ENST00000380152.8) | breast cancer type 2 susceptibility protein | +| . . . | . . . | . . . | . . . | . . . | . . . | . . . | + +A UCSC ID (e.g. a known gene `ucsc_id`) can be inspected on the UCSC gene page, e.g. `https://genome.ucsc.edu/cgi-bin/hgGene?hgg_gene={ucsc_id}&db=hg38`. + +# References +If you use `gget ucsc` in a publication, please cite the following articles: + +- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) + +- Kent WJ, Sugnet CW, Furey TS, et al. (2002). The human genome browser at UCSC. Genome Research. [https://doi.org/10.1101/gr.229102](https://doi.org/10.1101/gr.229102) diff --git a/docs/src/en/updates.md b/docs/src/en/updates.md index f2c52c13..82d1f54f 100644 --- a/docs/src/en/updates.md +++ b/docs/src/en/updates.md @@ -5,6 +5,7 @@ #### *gget* officially became part of [*scverse*](https://scverse.org/) on June 9, 2026. 🥳🥳🥳 **Version ≥ 0.30.8** (XXX XX, 2026): +- [`gget ucsc`](ucsc.md): **New module** to fetch [UCSC Genome Browser](https://genome.ucsc.edu/) IDs for a gene or term, analogous to `gget search` for Ensembl. Searches the UCSC Genome Browser for a symbol/accession/term and returns the matching identifiers (e.g. UCSC known gene / transcript IDs) with their genomic positions, grouped by track; supports filtering by `genome`, `track`, and `limit`. Available in the Python API and on the command line. Resolves [issue 18](https://github.com/scverse/gget/issues/18). - [`gget pdb`](pdb.md): Added support for the PDBx/mmCIF structure format (fixes [issue 178](https://github.com/scverse/gget/issues/178) and [issue 177](https://github.com/scverse/gget/issues/177)). - New `resource="mmcif"` option downloads the structure in PDBx/mmCIF format (`.cif`). - The default `resource="pdb"` now automatically falls back to PDBx/mmCIF when the legacy PDB file is unavailable (e.g. for large structures), since the legacy PDB format is being phased out by RCSB. A warning is logged and saved files use the correct extension (`.cif`). diff --git a/gget/__init__.py b/gget/__init__.py index f56cbcdd..aebe645f 100644 --- a/gget/__init__.py +++ b/gget/__init__.py @@ -26,6 +26,7 @@ from .gget_search import search from .gget_seq import seq from .gget_setup import setup +from .gget_ucsc import ucsc from .gget_virus import virus # Mute numexpr threads info diff --git a/gget/constants.py b/gget/constants.py index 38f90119..81e87ac6 100644 --- a/gget/constants.py +++ b/gget/constants.py @@ -7,6 +7,9 @@ # strategy avoid hanging indefinitely on slow upstreams. DEFAULT_REQUESTS_TIMEOUT = (10, 60) +# UCSC Genome Browser REST API for gget ucsc +UCSC_API_URL = "https://api.genome.ucsc.edu" + # Ensembl REST API server for gget seq and info ENSEMBL_REST_API = "http://rest.ensembl.org/" ENSEMBL_FTP_URL = "http://ftp.ensembl.org/pub/" diff --git a/gget/gget_ucsc.py b/gget/gget_ucsc.py new file mode 100644 index 00000000..f82fbe70 --- /dev/null +++ b/gget/gget_ucsc.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +import html +import json as json_package +from typing import Any, Literal, overload +from urllib.parse import unquote + +import pandas as pd +import requests + +from .constants import DEFAULT_REQUESTS_TIMEOUT, UCSC_API_URL +from .utils import set_up_logger + +logger = set_up_logger() + +_COLUMNS = [ + "track", + "ucsc_id", + "chrom", + "start", + "end", + "name", + "description", +] + + +def _parse_position(position: str | None) -> tuple[str | None, int | None, int | None]: + """Parse a UCSC position string 'chr13:32315508-32400268' into (chrom, start, end).""" + if not position or ":" not in position: + return position, None, None + chrom, _, span = position.partition(":") + if "-" not in span: + return chrom, None, None + start_str, _, end_str = span.partition("-") + start_str = start_str.replace(",", "").strip() + end_str = end_str.replace(",", "").strip() + start = int(start_str) if start_str.isdigit() else None + end = int(end_str) if end_str.isdigit() else None + return chrom, start, end + + +def _match_rows(group: dict[str, Any]) -> list[dict[str, Any]]: + """Flatten one UCSC positionMatches track group into rows.""" + track = group.get("trackName") or group.get("name") + group_desc = group.get("description") + rows = [] + for m in group.get("matches", []): + chrom, start, end = _parse_position(m.get("position")) + ucsc_id = m.get("hgFindMatches") + if ucsc_id is not None: + ucsc_id = unquote(str(ucsc_id)) + pos_name = m.get("posName") + match_desc = m.get("description") or group_desc + rows.append( + { + "track": track, + "ucsc_id": ucsc_id, + "chrom": chrom, + "start": start, + "end": end, + "name": html.unescape(pos_name) if isinstance(pos_name, str) else pos_name, + "description": html.unescape(match_desc) if isinstance(match_desc, str) else match_desc, + } + ) + return rows + + +@overload +def ucsc( + search_term: str, + genome: str = "hg38", + track: str | None = None, + limit: int | None = None, + save: bool = False, + verbose: bool = True, + *, + json: Literal[True], +) -> list[dict[str, Any]] | None: ... + + +@overload +def ucsc( + search_term: str, + genome: str = "hg38", + track: str | None = None, + limit: int | None = None, + save: bool = False, + verbose: bool = True, + json: Literal[False] = False, +) -> pd.DataFrame | None: ... + + +def ucsc( + search_term: str, + genome: str = "hg38", + track: str | None = None, + limit: int | None = None, + save: bool = False, + verbose: bool = True, + json: bool = False, +) -> pd.DataFrame | list[dict[str, Any]] | None: + """Fetch UCSC Genome Browser IDs for a gene/term, similar to gget search. + + Searches the UCSC Genome Browser for a gene symbol, accession, or other term + and returns the matching identifiers (e.g. UCSC known gene / transcript IDs) + together with their genomic positions, grouped by the track they come from. + + Args: + - search_term Gene symbol, accession, or free-text term to search for, e.g. "BRCA2". + - genome UCSC genome assembly to search, e.g. "hg38", "hg19", "mm39". Default: "hg38". + - track If provided, only return matches from tracks whose name contains + this (case-insensitive) substring, e.g. "knownGene". Default: None. + - limit Maximum number of matches to return. Default: None (all matches). + - save If True, save the results table as csv/json in the working directory. Default: False. + - verbose True/False whether to print progress information. Default: True. + - json If True, returns results in json format instead of data frame. Default: False. + + Returns a data frame (or list of dicts if json=True) with one row per match, + including the track, UCSC ID, chromosome, start, end, name, and description. + Returns None if no matches are found. + """ + if search_term is None or str(search_term).strip() == "": + raise ValueError("Please provide a gene symbol or search term in 'search_term'.") + + term = str(search_term).strip() + url = f"{UCSC_API_URL}/search" + params = {"search": term, "genome": genome} + + if verbose: + logger.info(f"Searching UCSC ({genome}) for '{term}'...") + + try: + response = requests.get( + url, + params=params, + headers={"Accept": "application/json"}, + timeout=DEFAULT_REQUESTS_TIMEOUT, + ) + except requests.exceptions.RequestException as exc: + raise RuntimeError(f"The UCSC server request failed: {exc}") from exc + + if not response.ok: + raise RuntimeError( + f"The UCSC server returned error status code {response.status_code}. Please try again later." + ) + + data = response.json() + if isinstance(data, dict) and data.get("error"): + raise ValueError(f"UCSC returned an error: {data['error']}") + + rows = [] + for group in data.get("positionMatches", []): + rows.extend(_match_rows(group)) + + # Optional track filter + if track is not None: + track_lower = str(track).lower() + rows = [r for r in rows if r["track"] and track_lower in str(r["track"]).lower()] + + # Optional limit + if limit is not None: + rows = rows[: int(limit)] + + results_df = pd.DataFrame(rows, columns=_COLUMNS) + + if len(results_df) == 0: + logger.warning(f"No UCSC matches found for '{term}' in genome '{genome}'.") + return None + + if json: + results_dict = json_package.loads(results_df.to_json(orient="records")) + if save: + with open("gget_ucsc_results.json", "w", encoding="utf-8") as f: + json_package.dump(results_dict, f, ensure_ascii=False, indent=4) + return results_dict + + if save: + results_df.to_csv("gget_ucsc_results.csv", index=False) + + return results_df diff --git a/gget/main.py b/gget/main.py index 36bdb3da..13dc38a1 100644 --- a/gget/main.py +++ b/gget/main.py @@ -41,6 +41,7 @@ from .gget_search import search # noqa: E402 from .gget_seq import seq # noqa: E402 from .gget_setup import setup # noqa: E402 +from .gget_ucsc import ucsc # noqa: E402 from .gget_virus import virus # noqa: E402 @@ -1002,6 +1003,72 @@ def main() -> None: help="DEPRECATED - json is now the default output format (convert to csv using flag [--csv]).", ) + ## gget ucsc subparser + ucsc_desc = "Fetch UCSC Genome Browser IDs for a gene/term (similar to gget search)." + parser_ucsc = parent_subparsers.add_parser( + "ucsc", + parents=[parent], + description=ucsc_desc, + help=ucsc_desc, + add_help=True, + formatter_class=CustomHelpFormatter, + ) + parser_ucsc.add_argument( + "search_term", + type=str, + help="Gene symbol, accession, or free-text term to search for, e.g. 'BRCA2'.", + ) + parser_ucsc.add_argument( + "-g", + "--genome", + type=str, + default="hg38", + required=False, + help="UCSC genome assembly to search, e.g. 'hg38', 'hg19', 'mm39'. Default: 'hg38'.", + ) + parser_ucsc.add_argument( + "-t", + "--track", + type=str, + default=None, + required=False, + help="Only return matches from tracks whose name contains this substring, e.g. 'knownGene'. Default: None.", + ) + parser_ucsc.add_argument( + "-l", + "--limit", + type=int, + default=None, + required=False, + help="Maximum number of matches to return. Default: None (all matches).", + ) + parser_ucsc.add_argument( + "-csv", + "--csv", + default=True, + action="store_false", + required=False, + help="Returns results in csv format instead of json.", + ) + parser_ucsc.add_argument( + "-o", + "--out", + type=str, + required=False, + help=( + "Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json).\n" + "Default: Standard out." + ), + ) + parser_ucsc.add_argument( + "-q", + "--quiet", + default=True, + action="store_false", + required=False, + help="Does not print progress information.", + ) + ## gget enrichr subparser enrichr_desc = "Perform an enrichment analysis on a list of genes using Enrichr." parser_enrichr = parent_subparsers.add_parser( @@ -2921,6 +2988,7 @@ def main() -> None: "muscle": parser_muscle, "blast": parser_blast, "blat": parser_blat, + "ucsc": parser_ucsc, "enrichr": parser_enrichr, "archs4": parser_archs4, "setup": parser_setup, @@ -3467,6 +3535,39 @@ def main() -> None: if not args.out and args.csv: print(json.dumps(gget_results, ensure_ascii=False, indent=4)) + ## ucsc return + if args.command == "ucsc": + ucsc_results = ucsc( + search_term=args.search_term, + genome=args.genome, + track=args.track, + limit=args.limit, + json=args.csv, + verbose=args.quiet, + ) + + # Check if the function returned something + if ucsc_results is not None: + # Save results if args.out specified + if args.out and not args.csv: + directory = "/".join(args.out.split("/")[:-1]) + if directory != "": + os.makedirs(directory, exist_ok=True) + ucsc_results.to_csv(args.out, index=False) + + if args.out and args.csv: + directory = "/".join(args.out.split("/")[:-1]) + if directory != "": + os.makedirs(directory, exist_ok=True) + with open(args.out, "w", encoding="utf-8") as f: + json.dump(ucsc_results, f, ensure_ascii=False, indent=4) + + # Print results if no directory specified + if not args.out and not args.csv: + ucsc_results.to_csv(sys.stdout, index=False) + if not args.out and args.csv: + print(json.dumps(ucsc_results, ensure_ascii=False, indent=4)) + ## enrichr return if args.command == "enrichr": # Handle deprecated flags for backwards compatibility diff --git a/tests/fixtures/test_ucsc.json b/tests/fixtures/test_ucsc.json new file mode 100644 index 00000000..fdd0f6a9 --- /dev/null +++ b/tests/fixtures/test_ucsc.json @@ -0,0 +1,10 @@ +{ + "test_ucsc_no_term": { + "type": "error", + "args": { + "search_term": "" + }, + "expected_result": "ValueError", + "expected_msg": "Please provide a gene symbol or search term in 'search_term'." + } +} diff --git a/tests/test_ucsc.py b/tests/test_ucsc.py new file mode 100644 index 00000000..fe13aee8 --- /dev/null +++ b/tests/test_ucsc.py @@ -0,0 +1,119 @@ +import json +import unittest +from unittest.mock import patch + +import gget.gget_ucsc as gget_ucsc +from gget.gget_ucsc import _match_rows, _parse_position, ucsc + +from .from_json import from_json + +with open("./tests/fixtures/test_ucsc.json") as json_file: + ucsc_dict = json.load(json_file) + + +class TestUcsc(unittest.TestCase, metaclass=from_json(ucsc_dict, ucsc)): + pass # tests loaded from json + + +class _FakeResponse: + """Minimal stand-in for a requests.Response used to test parsing offline.""" + + def __init__(self, payload, ok=True, status_code=200): + self._payload = payload + self.ok = ok + self.status_code = status_code + + def json(self): + return self._payload + + +_SEARCH_PAYLOAD = { + "genome": "hg38", + "positionMatches": [ + { + "trackName": "knownGene", + "description": "GENCODE", + "matches": [ + { + "position": "chr13:32315508-32400268", + "hgFindMatches": "ENST00000380152.8", + "posName": "BRCA2 (ENST00000380152.8)", + "description": "breast cancer type 2 susceptibility protein", + } + ], + }, + { + "trackName": "hgnc", + "description": "HUGO Gene Nomenclature", + "matches": [ + { + "position": "chr13:32315086-32400268", + "hgFindMatches": "HGNC%3A1101", + "posName": "BRCA2", + "description": None, + } + ], + }, + ], +} + + +class TestUcscHelpers(unittest.TestCase): + """Network-free tests of the UCSC helpers (issue #18).""" + + def test_parse_position(self): + self.assertEqual(_parse_position("chr13:32315508-32400268"), ("chr13", 32315508, 32400268)) + self.assertEqual(_parse_position("chrX"), ("chrX", None, None)) + self.assertEqual(_parse_position(None), (None, None, None)) + + def test_match_rows_decoding(self): + rows = _match_rows(_SEARCH_PAYLOAD["positionMatches"][1]) + self.assertEqual(rows[0]["ucsc_id"], "HGNC:1101") # URL-decoded + self.assertEqual(rows[0]["track"], "hgnc") + # description falls back to the group description when the match has none + self.assertEqual(rows[0]["description"], "HUGO Gene Nomenclature") + + @patch.object(gget_ucsc.requests, "get") + def test_search(self, mock_get): + mock_get.return_value = _FakeResponse(_SEARCH_PAYLOAD) + df = ucsc("BRCA2", verbose=False) + self.assertEqual(list(df.columns), gget_ucsc._COLUMNS) + self.assertEqual(df.shape[0], 2) + self.assertEqual(df.iloc[0]["ucsc_id"], "ENST00000380152.8") + self.assertEqual(df.iloc[0]["chrom"], "chr13") + self.assertEqual(df.iloc[0]["start"], 32315508) + + @patch.object(gget_ucsc.requests, "get") + def test_track_filter(self, mock_get): + mock_get.return_value = _FakeResponse(_SEARCH_PAYLOAD) + df = ucsc("BRCA2", track="knownGene", verbose=False) + self.assertEqual(df.shape[0], 1) + self.assertEqual(df.iloc[0]["track"], "knownGene") + + @patch.object(gget_ucsc.requests, "get") + def test_limit_and_json(self, mock_get): + mock_get.return_value = _FakeResponse(_SEARCH_PAYLOAD) + result = ucsc("BRCA2", limit=1, json=True, verbose=False) + self.assertIsInstance(result, list) + self.assertEqual(len(result), 1) + + @patch.object(gget_ucsc.requests, "get") + def test_no_results_returns_none(self, mock_get): + mock_get.return_value = _FakeResponse({"positionMatches": []}) + self.assertIsNone(ucsc("nonexistentxyz", verbose=False)) + + @patch.object(gget_ucsc.requests, "get") + def test_error_payload_raises(self, mock_get): + mock_get.return_value = _FakeResponse({"error": "No such genome 'banana'"}) + with self.assertRaises(ValueError): + ucsc("BRCA2", genome="banana", verbose=False) + + @patch.object(gget_ucsc.requests, "get") + def test_http_error_raises(self, mock_get): + mock_get.return_value = _FakeResponse({}, ok=False, status_code=500) + with self.assertRaises(RuntimeError): + ucsc("BRCA2", verbose=False) + + +if __name__ == "__main__": + unittest.main() From 4ed3b2322a3c50e3c4017eaad2737a15cf151a28 Mon Sep 17 00:00:00 2001 From: Elarwei Date: Thu, 25 Jun 2026 00:18:10 +0800 Subject: [PATCH 2/2] test(ucsc): cover remaining lines for codecov (#18) Add network-free mocked tests for empty search_term, _parse_position no-range branch, verbose logging, request-exception handling, and the save CSV/JSON branches. gget_ucsc.py now 100%. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_ucsc.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_ucsc.py b/tests/test_ucsc.py index fe13aee8..7c87727b 100644 --- a/tests/test_ucsc.py +++ b/tests/test_ucsc.py @@ -1,8 +1,11 @@ import json +import os +import tempfile import unittest from unittest.mock import patch import gget.gget_ucsc as gget_ucsc +import requests from gget.gget_ucsc import _match_rows, _parse_position, ucsc from .from_json import from_json @@ -114,6 +117,44 @@ def test_http_error_raises(self, mock_get): with self.assertRaises(RuntimeError): ucsc("BRCA2", verbose=False) + def test_empty_search_term_raises(self): + # Covers the empty/None search_term ValueError branch. + with self.assertRaises(ValueError): + ucsc(" ", verbose=False) + + def test_parse_position_no_range(self): + # Covers the "chrom with colon but no range" branch of _parse_position. + self.assertEqual(_parse_position("chr1:5000"), ("chr1", None, None)) + + @patch.object(gget_ucsc.requests, "get") + def test_search_verbose(self, mock_get): + # Covers the verbose logging line. + mock_get.return_value = _FakeResponse(_SEARCH_PAYLOAD) + df = ucsc("BRCA2", verbose=True) + self.assertEqual(df.shape[0], 2) + + @patch.object(gget_ucsc.requests, "get") + def test_request_exception_raises(self, mock_get): + # Covers the requests.RequestException -> RuntimeError branch. + mock_get.side_effect = requests.exceptions.ConnectionError("no network") + with self.assertRaises(RuntimeError): + ucsc("BRCA2", verbose=False) + + @patch.object(gget_ucsc.requests, "get") + def test_save_csv_and_json(self, mock_get): + # Covers the save-to-CSV and json+save branches. + mock_get.return_value = _FakeResponse(_SEARCH_PAYLOAD) + with tempfile.TemporaryDirectory() as tmp: + cwd = os.getcwd() + os.chdir(tmp) + try: + ucsc("BRCA2", save=True, verbose=False) + self.assertTrue(os.path.exists("gget_ucsc_results.csv")) + ucsc("BRCA2", save=True, json=True, verbose=False) + self.assertTrue(os.path.exists("gget_ucsc_results.json")) + finally: + os.chdir(cwd) + if __name__ == "__main__": unittest.main()