diff --git a/funannotate2/annotate.py b/funannotate2/annotate.py index a462b9d..e5ee0d9 100755 --- a/funannotate2/annotate.py +++ b/funannotate2/annotate.py @@ -41,11 +41,11 @@ choose_best_busco_species, create_directories, create_tmpdir, + ensure_busco_lineage, find_files, load_json, lookup_taxonomy, naming_slug, - get_odb_version, validate_busco_lineage, ) @@ -345,9 +345,6 @@ def annotate(args): # busco proteome analysis busco_all = os.path.join(misc_dir, "busco.results.json") busco_annots = os.path.join(misc_dir, "annotations.busco.tsv") - odb_version = get_odb_version( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) if not checkfile(busco_annots): if not taxonomy: # get taxonomy information @@ -366,9 +363,12 @@ def annotate(args): else: # choose best busco species busco_species = choose_best_busco_species(taxonomy) - busco_model_path = os.path.join( - env["FUNANNOTATE2_DB"], f"{busco_species}_{odb_version}" - ) + # Ensure the BUSCO lineage is available under FUNANNOTATE2_DB, + # downloading it if needed. In dockerized usage the lineage from a + # previous train/predict run is gone when annotate starts in a fresh + # container, so this guards against the cryptic "<path> is not a + # directory" failure from buscolite further down. 
+ busco_model_path = ensure_busco_lineage(busco_species, logger) # run busco proteome screen logger.info( diff --git a/funannotate2/predict.py b/funannotate2/predict.py index 96c0845..7bbe0f8 100755 --- a/funannotate2/predict.py +++ b/funannotate2/predict.py @@ -38,15 +38,12 @@ choose_best_busco_species, create_directories, create_tmpdir, - download, + ensure_busco_lineage, find_files, - load_json, lookup_taxonomy, naming_slug, runProcessJob, - runSubprocess, which_path, - get_odb_version, ) @@ -906,29 +903,7 @@ def sort_gff_line(line): busco_tax = choose_best_busco_species( {"superkingdom": taxonomy.get("superkingdom"), "kingdom": taxonomy.get("kingdom")} ) - # pull the latest odb version from downloads link - odb_version = get_odb_version( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_model_path = os.path.join( - env["FUNANNOTATE2_DB"], f"{busco_tax}_{odb_version}" - ) - if not os.path.isdir(busco_model_path): - download_urls = load_json( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_url = download_urls["busco"][busco_tax][0] - busco_tgz = os.path.join(env["FUNANNOTATE2_DB"], os.path.basename(busco_url)) - logger.info(f"Downloading {busco_tax}_{odb_version} model from {busco_url}") - download(busco_url, busco_tgz, wget=False) - if os.path.isfile(busco_tgz): - runSubprocess( - ["tar", "-zxf", os.path.basename(busco_tgz)], - logger, - cwd=env["FUNANNOTATE2_DB"], - ) - if os.path.isdir(busco_model_path): - os.remove(busco_tgz) + busco_model_path = ensure_busco_lineage(busco_tax, logger) # now we can loop through the abinitio predictions and run busco for completion # write this to file for re-use if consensus file already present? 
@@ -1611,7 +1586,11 @@ def run_tool_with_error_tracking(tool_name, run_func, *args, **kwargs): ) if tools_run: - logger.info(f"Successfully ran tools for {contig_name}: {', '.join(tools_run)}") + # Per-contig success is a debug-level detail; on assemblies with many + # contigs an info-level line per contig drowns the user-facing log. + # The downstream "<tool> predictions filtered: ..." summary lines + # already convey aggregate success at info level. + logger.debug(f"Successfully ran tools for {contig_name}: {', '.join(tools_run)}") # Store error information in memory stats for tracking if monitor_memory: diff --git a/funannotate2/train.py b/funannotate2/train.py index 21ceaa7..fe2e125 100755 --- a/funannotate2/train.py +++ b/funannotate2/train.py @@ -24,13 +24,11 @@ choose_best_augustus_species, choose_best_busco_species, create_directories, - download, - load_json, + ensure_busco_lineage, lookup_taxonomy, naming_slug, runSubprocess, which_path, - get_odb_version, rename_gff_contigs, validate_busco_lineage, validate_augustus_species, @@ -142,13 +140,10 @@ def train(args): else: # choose best busco species busco_species = choose_best_busco_species(taxonomy) - # pull the latest odb version from downloads link - odb_version = get_odb_version( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_model_path = os.path.join( - env["FUNANNOTATE2_DB"], f"{busco_species}_{odb_version}" - ) + # Ensure the BUSCO lineage exists under FUNANNOTATE2_DB (downloading + # it if necessary) and capture its on-disk path for buscolite + the + # params.json output below. 
+ busco_model_path = ensure_busco_lineage(busco_species, logger) # run buscolite on genome to get training set filt_train_models = os.path.join(misc_dir, "training-models.final.gff3") @@ -159,26 +154,6 @@ def train(args): logger.info( f"Choosing best busco species based on taxonomy: {busco_species}" ) - if not os.path.isdir(busco_model_path): - download_urls = load_json( - os.path.join(os.path.dirname(__file__), "downloads.json") - ) - busco_url = download_urls["busco"][busco_species][0] - busco_tgz = os.path.join( - env["FUNANNOTATE2_DB"], os.path.basename(busco_url) - ) - logger.info( - f"Downloading {busco_species}_{odb_version} model from {busco_url}" - ) - download(busco_url, busco_tgz, wget=False) - if os.path.isfile(busco_tgz): - runSubprocess( - ["tar", "-zxf", os.path.basename(busco_tgz)], - logger, - cwd=env["FUNANNOTATE2_DB"], - ) - if os.path.isdir(busco_model_path): - os.remove(busco_tgz) log("Running buscolite to generate training set using filtered genome") buscolite( TrainingGenomeFasta, diff --git a/funannotate2/utilities.py b/funannotate2/utilities.py index 4690ecd..abbb781 100755 --- a/funannotate2/utilities.py +++ b/funannotate2/utilities.py @@ -20,7 +20,7 @@ import requests -from .config import augustus_species, busco_taxonomy +from .config import augustus_species, busco_taxonomy, env # disable insecure warning requests.packages.urllib3.disable_warnings() @@ -159,6 +159,52 @@ def get_odb_version(downloads_json_file): return sorted(odb_versions, reverse=True)[0] +def ensure_busco_lineage(species, logger): + """ + Ensure the BUSCO lineage `<species>_<odb_version>` is present under + FUNANNOTATE2_DB, downloading and extracting it from the URL in + downloads.json if missing. The check is a no-op when the directory + already exists, so callers can invoke this idempotently. + + Parameters: + - species (str): BUSCO lineage species name (e.g. "fungi", "aspergillus"). + - logger: Logger exposing .info / .critical (e.g. from startLogging). 
+ + Returns: + - str: Absolute path to the lineage directory. + + Raises: + - SystemExit(1): If the directory could not be made present after the + download/extract attempt. + """ + downloads_json = os.path.join(os.path.dirname(__file__), "downloads.json") + odb_version = get_odb_version(downloads_json) + busco_model_path = os.path.join( + env["FUNANNOTATE2_DB"], f"{species}_{odb_version}" + ) + if os.path.isdir(busco_model_path): + return busco_model_path + download_urls = load_json(downloads_json) + busco_url = download_urls["busco"][species][0] + busco_tgz = os.path.join(env["FUNANNOTATE2_DB"], os.path.basename(busco_url)) + logger.info(f"Downloading {species}_{odb_version} model from {busco_url}") + download(busco_url, busco_tgz, wget=False) + if os.path.isfile(busco_tgz): + runSubprocess( + ["tar", "-zxf", os.path.basename(busco_tgz)], + logger, + cwd=env["FUNANNOTATE2_DB"], + ) + if os.path.isdir(busco_model_path): + os.remove(busco_tgz) + if not os.path.isdir(busco_model_path): + logger.critical( + f"Unable to download/extract BUSCO lineage to {busco_model_path}" + ) + raise SystemExit(1) + return busco_model_path + + def download(url, name, wget=False, timeout=60, retries=3): """ Download a file from a given URL with improved error handling and retries. 
diff --git a/tests/unit/test_predict.py b/tests/unit/test_predict.py index 1eb8862..faec5c1 100644 --- a/tests/unit/test_predict.py +++ b/tests/unit/test_predict.py @@ -310,7 +310,8 @@ def fake_run_snap(*args, **kwargs): debug_messages = [call.args[0] for call in logger.debug.call_args_list] assert "snap tool output" in info_messages - assert "Successfully ran tools for scaffold_1.fasta: snap" in info_messages + assert "Successfully ran tools for scaffold_1.fasta: snap" in debug_messages + assert not any("Successfully ran tools" in message for message in info_messages) assert not any("Processing contig" in message for message in info_messages) assert not any("memory prediction for scaffold_1.fasta" in message for message in info_messages) assert not any("Processing contig" in message for message in debug_messages)