From fde0e10f216494cc5a4581608a53702599f15b7b Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Thu, 28 May 2026 07:07:07 -0700 Subject: [PATCH 1/4] Auto-download missing BUSCO lineage in annotate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The annotate command builds the BUSCO lineage path from FUNANNOTATE2_DB and the species/odb_version, but unlike train and predict it never checked whether that path actually exists on disk before handing it to buscolite. If it doesn't, buscolite dies with a confusing "<path> is not a directory" error. This shows up in docker. The previous train/predict run inside the container downloads the lineage to /opt/funannotate2_db/<species>_odbXX, but that path is ephemeral — when the user launches a new container for the annotate step the lineage is gone, even though the host's predict_results/ still has the gene models from the earlier run. Mirror the isdir-check + tarball download/extract block that already lives in predict.py and train.py, and fall back to a clear critical error if the download/extract somehow doesn't materialise the directory (rather than letting buscolite crash with its less informative message). - funannotate2/annotate.py: add `download` and `runSubprocess` to the utilities import, then insert the download block right after the busco_model_path is constructed. --- funannotate2/annotate.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/funannotate2/annotate.py b/funannotate2/annotate.py index a462b9d..64cac69 100755 --- a/funannotate2/annotate.py +++ b/funannotate2/annotate.py @@ -41,10 +41,12 @@ choose_best_busco_species, create_directories, create_tmpdir, + download, find_files, load_json, lookup_taxonomy, naming_slug, + runSubprocess, get_odb_version, validate_busco_lineage, ) @@ -370,6 +372,37 @@ def annotate(args): env["FUNANNOTATE2_DB"], f"{busco_species}_{odb_version}" ) + # download the lineage if it isn't present. 
The lineage may have been + # fetched by a previous train/predict run, but inside an ephemeral + # docker container that path is lost between invocations, so re-check + # here and pull it again rather than dying with a confusing + # "is not a directory" error from buscolite. + if not os.path.isdir(busco_model_path): + download_urls = load_json( + os.path.join(os.path.dirname(__file__), "downloads.json") + ) + busco_url = download_urls["busco"][busco_species][0] + busco_tgz = os.path.join( + env["FUNANNOTATE2_DB"], os.path.basename(busco_url) + ) + logger.info( + f"Downloading {busco_species}_{odb_version} model from {busco_url}" + ) + download(busco_url, busco_tgz, wget=False) + if os.path.isfile(busco_tgz): + runSubprocess( + ["tar", "-zxf", os.path.basename(busco_tgz)], + logger, + cwd=env["FUNANNOTATE2_DB"], + ) + if os.path.isdir(busco_model_path): + os.remove(busco_tgz) + if not os.path.isdir(busco_model_path): + logger.critical( + f"Unable to download/extract BUSCO lineage to {busco_model_path}; skipping BUSCO annotation" + ) + raise SystemExit(1) + # run busco proteome screen logger.info( f"BUSCOlite [conserved ortholog] search using {busco_species} models" From 4d55eed4c9d47508cc88faf25c450aafca6c1e05 Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Thu, 28 May 2026 07:07:07 -0700 Subject: [PATCH 2/4] Demote per-contig success log in predict to debug level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On an assembly with many contigs the "Successfully ran tools for <contig>: snap, glimmerhmm, augustus" line emitted once per contig inside abinitio_wrapper drowns the user-facing info log — a 1000+ contig draft assembly would push every other useful message off-screen. The downstream "<tool> predictions filtered: N kept, M filtered" summary lines that follow the parallel run already convey aggregate per-tool success at info level, so the per-contig detail is redundant at info. 
Demote to debug so it's still available with --debug for troubleshooting individual contigs but stays out of the default log. Failure and OOM lines are intentionally left at warning/error — when something breaks the per-contig identity is essential context. - funannotate2/predict.py: change `logger.info(...)` to `logger.debug(...)` for the per-contig success line in abinitio_wrapper. --- funannotate2/predict.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/funannotate2/predict.py b/funannotate2/predict.py index 96c0845..eb80807 100755 --- a/funannotate2/predict.py +++ b/funannotate2/predict.py @@ -1611,7 +1611,11 @@ def run_tool_with_error_tracking(tool_name, run_func, *args, **kwargs): ) if tools_run: - logger.info(f"Successfully ran tools for {contig_name}: {', '.join(tools_run)}") + # Per-contig success is a debug-level detail; on assemblies with many + # contigs an info-level line per contig drowns the user-facing log. + # The downstream "<tool> predictions filtered: ..." summary lines + # already convey aggregate success at info level. + logger.debug(f"Successfully ran tools for {contig_name}: {', '.join(tools_run)}") # Store error information in memory stats for tracking if monitor_memory: From dab40908212f5a42561f8a14b1a11ba7c8942b8e Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Thu, 28 May 2026 07:16:11 -0700 Subject: [PATCH 3/4] refactor: centralize BUSCO lineage download into utilities helper Replace the three identical copies of "compute lineage path, download tarball if missing, extract, clean up" in train.py, predict.py and annotate.py with a single ensure_busco_lineage(species, logger) helper in utilities.py. - utilities.py: add ensure_busco_lineage(); imports env from .config. - train.py: drop get_odb_version/download/load_json imports; call the helper once near the top of train() so busco_model_path is always defined for the params.json output, regardless of --training-set. 
- predict.py: drop get_odb_version/download/load_json/runSubprocess imports; replace the inline block with a single helper call. - annotate.py: drop get_odb_version/download/runSubprocess imports; replace the recently-added download block with a single helper call. Two behavior changes: (1) train.py now resolves the lineage up front (previously it was lazy and skipped when --training-set was supplied); the download is idempotent and re-uses the cached directory when present, so the observable effect is a one-time download on a fresh DB if a user supplies their own training set. (2) train.py and predict.py now exit with SystemExit(1) when the lineage directory is still missing after the download/extract attempt; their inline blocks had no such check, only annotate.py's did. --- funannotate2/annotate.py | 47 ++++++-------------------------------------- funannotate2/predict.py | 29 ++--------------------- funannotate2/train.py | 35 ++++------------------------ funannotate2/utilities.py | 48 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 98 deletions(-) diff --git a/funannotate2/annotate.py b/funannotate2/annotate.py index 64cac69..e5ee0d9 100755 --- a/funannotate2/annotate.py +++ b/funannotate2/annotate.py @@ -41,13 +41,11 @@ choose_best_busco_species, create_directories, create_tmpdir, - download, + ensure_busco_lineage, find_files, load_json, lookup_taxonomy, naming_slug, - runSubprocess, - get_odb_version, validate_busco_lineage, ) @@ -347,9 +345,6 @@ def annotate(args): # busco proteome analysis busco_all = os.path.join(misc_dir, "busco.results.json") busco_annots = os.path.join(misc_dir, "annotations.busco.tsv") - odb_version = get_odb_version( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) if not checkfile(busco_annots): if not taxonomy: # get taxonomy information @@ -368,40 +363,12 @@ def annotate(args): else: # choose best busco species busco_species = choose_best_busco_species(taxonomy) - busco_model_path = os.path.join( - env["FUNANNOTATE2_DB"], f"{busco_species}_{odb_version}" - ) - - # download the lineage if it isn't present. 
The lineage may have been - # fetched by a previous train/predict run, but inside an ephemeral - # docker container that path is lost between invocations, so re-check - # here and pull it again rather than dying with a confusing - # "is not a directory" error from buscolite. - if not os.path.isdir(busco_model_path): - download_urls = load_json( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_url = download_urls["busco"][busco_species][0] - busco_tgz = os.path.join( - env["FUNANNOTATE2_DB"], os.path.basename(busco_url) - ) - logger.info( - f"Downloading {busco_species}_{odb_version} model from {busco_url}" - ) - download(busco_url, busco_tgz, wget=False) - if os.path.isfile(busco_tgz): - runSubprocess( - ["tar", "-zxf", os.path.basename(busco_tgz)], - logger, - cwd=env["FUNANNOTATE2_DB"], - ) - if os.path.isdir(busco_model_path): - os.remove(busco_tgz) - if not os.path.isdir(busco_model_path): - logger.critical( - f"Unable to download/extract BUSCO lineage to {busco_model_path}; skipping BUSCO annotation" - ) - raise SystemExit(1) + # Ensure the BUSCO lineage is available under FUNANNOTATE2_DB, + # downloading it if needed. In dockerized usage the lineage from a + # previous train/predict run is gone when annotate starts in a fresh + # container, so this guards against the cryptic "<path> is not a + # directory" failure from buscolite further down. 
+ busco_model_path = ensure_busco_lineage(busco_species, logger) # run busco proteome screen logger.info( diff --git a/funannotate2/predict.py b/funannotate2/predict.py index eb80807..7bbe0f8 100755 --- a/funannotate2/predict.py +++ b/funannotate2/predict.py @@ -38,15 +38,12 @@ choose_best_busco_species, create_directories, create_tmpdir, - download, + ensure_busco_lineage, find_files, - load_json, lookup_taxonomy, naming_slug, runProcessJob, - runSubprocess, which_path, - get_odb_version, ) @@ -906,29 +903,7 @@ def sort_gff_line(line): busco_tax = choose_best_busco_species( {"superkingdom": taxonomy.get("superkingdom"), "kingdom": taxonomy.get("kingdom")} ) - # pull the latest odb version from downloads link - odb_version = get_odb_version( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_model_path = os.path.join( - env["FUNANNOTATE2_DB"], f"{busco_tax}_{odb_version}" - ) - if not os.path.isdir(busco_model_path): - download_urls = load_json( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_url = download_urls["busco"][busco_tax][0] - busco_tgz = os.path.join(env["FUNANNOTATE2_DB"], os.path.basename(busco_url)) - logger.info(f"Downloading {busco_tax}_{odb_version} model from {busco_url}") - download(busco_url, busco_tgz, wget=False) - if os.path.isfile(busco_tgz): - runSubprocess( - ["tar", "-zxf", os.path.basename(busco_tgz)], - logger, - cwd=env["FUNANNOTATE2_DB"], - ) - if os.path.isdir(busco_model_path): - os.remove(busco_tgz) + busco_model_path = ensure_busco_lineage(busco_tax, logger) # now we can loop through the abinitio predictions and run busco for completion # write this to file for re-use if consensus file already present? 
diff --git a/funannotate2/train.py b/funannotate2/train.py index 21ceaa7..fe2e125 100755 --- a/funannotate2/train.py +++ b/funannotate2/train.py @@ -24,13 +24,11 @@ choose_best_augustus_species, choose_best_busco_species, create_directories, - download, - load_json, + ensure_busco_lineage, lookup_taxonomy, naming_slug, runSubprocess, which_path, - get_odb_version, rename_gff_contigs, validate_busco_lineage, validate_augustus_species, @@ -142,13 +140,10 @@ def train(args): else: # choose best busco species busco_species = choose_best_busco_species(taxonomy) - # pull the latest odb version from downloads link - odb_version = get_odb_version( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_model_path = os.path.join( - env["FUNANNOTATE2_DB"], f"{busco_species}_{odb_version}" - ) + # Ensure the BUSCO lineage exists under FUNANNOTATE2_DB (downloading + # it if necessary) and capture its on-disk path for buscolite + the + # params.json output below. + busco_model_path = ensure_busco_lineage(busco_species, logger) # run buscolite on genome to get training set filt_train_models = os.path.join(misc_dir, "training-models.final.gff3") @@ -159,26 +154,6 @@ def train(args): logger.info( f"Choosing best busco species based on taxonomy: {busco_species}" ) - if not os.path.isdir(busco_model_path): - download_urls = load_json( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_url = download_urls["busco"][busco_species][0] - busco_tgz = os.path.join( - env["FUNANNOTATE2_DB"], os.path.basename(busco_url) - ) - logger.info( - f"Downloading {busco_species}_{odb_version} model from {busco_url}" - ) - download(busco_url, busco_tgz, wget=False) - if os.path.isfile(busco_tgz): - runSubprocess( - ["tar", "-zxf", os.path.basename(busco_tgz)], - logger, - cwd=env["FUNANNOTATE2_DB"], - ) - if os.path.isdir(busco_model_path): - os.remove(busco_tgz) log("Running buscolite to generate training set using filtered genome") buscolite( TrainingGenomeFasta, 
diff --git a/funannotate2/utilities.py b/funannotate2/utilities.py index 4690ecd..abbb781 100755 --- a/funannotate2/utilities.py +++ b/funannotate2/utilities.py @@ -20,7 +20,7 @@ import requests -from .config import augustus_species, busco_taxonomy +from .config import augustus_species, busco_taxonomy, env # disable insecure warning requests.packages.urllib3.disable_warnings() @@ -159,6 +159,52 @@ def get_odb_version(downloads_json_file): return sorted(odb_versions, reverse=True)[0] +def ensure_busco_lineage(species, logger): + """ + Ensure the BUSCO lineage `<species>_<odb_version>` is present under + FUNANNOTATE2_DB, downloading and extracting it from the URL in + downloads.json if missing. The check is a no-op when the directory + already exists, so callers can invoke this idempotently. + + Parameters: + - species (str): BUSCO lineage species name (e.g. "fungi", "aspergillus"). + - logger: Logger exposing .info / .critical (e.g. from startLogging). + + Returns: + - str: Path to the lineage directory under FUNANNOTATE2_DB. + + Raises: + - SystemExit(1): If the directory could not be made present after the + download/extract attempt. 
+ """ + downloads_json = os.path.join(os.path.dirname(__file__), "downloads.json") + odb_version = get_odb_version(downloads_json) + busco_model_path = os.path.join( + env["FUNANNOTATE2_DB"], f"{species}_{odb_version}" + ) + if os.path.isdir(busco_model_path): + return busco_model_path + download_urls = load_json(downloads_json) + busco_url = download_urls["busco"][species][0] + busco_tgz = os.path.join(env["FUNANNOTATE2_DB"], os.path.basename(busco_url)) + logger.info(f"Downloading {species}_{odb_version} model from {busco_url}") + download(busco_url, busco_tgz, wget=False) + if os.path.isfile(busco_tgz): + runSubprocess( + ["tar", "-zxf", os.path.basename(busco_tgz)], + logger, + cwd=env["FUNANNOTATE2_DB"], + ) + if os.path.isdir(busco_model_path): + os.remove(busco_tgz) + if not os.path.isdir(busco_model_path): + logger.critical( + f"Unable to download/extract BUSCO lineage to {busco_model_path}" + ) + raise SystemExit(1) + return busco_model_path + + def download(url, name, wget=False, timeout=60, retries=3): """ Download a file from a given URL with improved error handling and retries. From 723a7b322fa6c79cb3fab6f0313dc39aad436932 Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Thu, 28 May 2026 07:18:45 -0700 Subject: [PATCH 4/4] test: update test for per-contig log demotion to debug Commit 4d55eed4 moved the 'Successfully ran tools for <contig>: ...' message from logger.info to logger.debug to avoid log pollution on multi-contig assemblies. Update the test that asserted the message landed on info so it now checks debug_messages instead, plus a negative assertion that nothing matching 'Successfully ran tools' is emitted at info level. 
--- tests/unit/test_predict.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_predict.py b/tests/unit/test_predict.py index 1eb8862..faec5c1 100644 --- a/tests/unit/test_predict.py +++ b/tests/unit/test_predict.py @@ -310,7 +310,8 @@ def fake_run_snap(*args, **kwargs): debug_messages = [call.args[0] for call in logger.debug.call_args_list] assert "snap tool output" in info_messages - assert "Successfully ran tools for scaffold_1.fasta: snap" in info_messages + assert "Successfully ran tools for scaffold_1.fasta: snap" in debug_messages + assert not any("Successfully ran tools" in message for message in info_messages) assert not any("Processing contig" in message for message in info_messages) assert not any("memory prediction for scaffold_1.fasta" in message for message in info_messages) assert not any("Processing contig" in message for message in debug_messages)