From 6f7611ee3ed1aa20db9d42a43db6bf7503158605 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Wed, 6 Mar 2024 13:59:16 +0100
Subject: [PATCH 01/19] only argparse

---
 dfpl/__main__.py | 165 ++++------
 dfpl/options.py  | 814 ++++++++++++++++++++++++++++-------------------
 dfpl/utils.py    | 180 +++++++----
 3 files changed, 669 insertions(+), 490 deletions(-)

diff --git a/dfpl/__main__.py b/dfpl/__main__.py
index 7896d451..8d035579 100755
--- a/dfpl/__main__.py
+++ b/dfpl/__main__.py
@@ -1,12 +1,10 @@
 import dataclasses
 import logging
-import os.path
-import pathlib
+import os
 from argparse import Namespace
 from os import path
 
-import chemprop as cp
-import pandas as pd
+import chemprop
 from keras.models import load_model
 
 from dfpl import autoencoder as ac
@@ -17,43 +15,8 @@
 from dfpl import vae as vae
 from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute
 
-project_directory = pathlib.Path(".").parent.parent.absolute()
-test_train_opts = options.Options(
-    inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
-    outputDir=f"{project_directory}/output_data/console_test",
-    ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5",
-    ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
-    type="smiles",
-    fpType="topological",
-    epochs=100,
-    batchSize=1024,
-    fpSize=2048,
-    encFPSize=256,
-    enableMultiLabel=False,
-    testSize=0.2,
-    kFolds=2,
-    verbose=2,
-    trainAC=False,
-    trainFNN=True,
-    compressFeatures=True,
-    activationFunction="selu",
-    lossFunction="bce",
-    optimizer="Adam",
-    fnnType="FNN",
-)
-
-test_pred_opts = options.Options(
-    inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
-    outputDir=f"{project_directory}/output_data/console_test",
-    outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
-    ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
-    fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
-    type="smiles",
-    fpType="topological",
-)
-
-
-def traindmpnn(opts: options.GnnOptions):
+
+def traindmpnn(opts: options.GnnOptions) -> None:
     """
     Train a D-MPNN model using the given options.
     Args:
@@ -61,54 +24,46 @@ def traindmpnn(opts: options.GnnOptions):
     Returns:
     - None
     """
-    os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
-    ignore_elements = ["py/object"]
     # Load options from a JSON file and replace the relevant attributes in `opts`
-    arguments = createArgsFromJson(
-        opts.configFile, ignore_elements, return_json_object=False
-    )
-    opts = cp.args.TrainArgs().parse_args(arguments)
+    arguments = createArgsFromJson(jsonFile=opts.configFile)
+    opts = chemprop.args.TrainArgs().parse_args(arguments)
     logging.info("Training DMPNN...")
-    # Train the model and get the mean and standard deviation of AUC score from cross-validation
-    mean_score, std_score = cp.train.cross_validate(
-        args=opts, train_func=cp.train.run_training
+    mean_score, std_score = chemprop.train.cross_validate(
+        args=opts, train_func=chemprop.train.run_training
     )
     logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}")
 
 
-def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None:
+def predictdmpnn(opts: options.GnnOptions) -> None:
     """
     Predict the values using a trained D-MPNN model with the given options.
     Args:
     - opts: options.GnnOptions instance containing the details of the prediction
-    - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction
     Returns:
     - None
     """
-    ignore_elements = [
-        "py/object",
-        "checkpoint_paths",
-        "save_dir",
-        "saving_name",
-    ]
     # Load options and additional arguments from a JSON file
-    arguments, data = createArgsFromJson(
-        json_arg_path, ignore_elements, return_json_object=True
+    arguments = createArgsFromJson(jsonFile=opts.configFile)
+    opts = chemprop.args.PredictArgs().parse_args(arguments)
+
+    chemprop.train.make_predictions(args=opts)
+
+
+def interpretdmpnn(opts: options.GnnOptions) -> None:
+    """
+    Interpret the predictions of a trained D-MPNN model with the given options.
+    Args:
+    - opts: options instance whose configFile points to the JSON file with the interpretation arguments
+    Returns:
+    - None
+    """
+    # Load the arguments from the JSON config file and parse them into chemprop InterpretArgs
+    arguments = createArgsFromJson(jsonFile=opts.configFile)
+    opts = chemprop.args.InterpretArgs().parse_args(arguments)
+
+    chemprop.interpret.interpret(
+        args=opts, save_to_csv=True
     )
-    arguments.append("--preds_path")
-    arguments.append("")
-    save_dir = data.get("save_dir")
-    name = data.get("saving_name")
-    # Replace relevant attributes in `opts` with loaded options
-    opts = cp.args.PredictArgs().parse_args(arguments)
-    opts.preds_path = save_dir + "/" + name
-    df = pd.read_csv(opts.test_path)
-    smiles = []
-    for index, rows in df.iterrows():
-        my_list = [rows.smiles]
-        smiles.append(my_list)
-    # Make predictions and return the result
-    cp.train.make_predictions(args=opts, smiles=smiles)
 
 
 def train(opts: options.Options):
@@ -116,9 +71,6 @@ def train(opts: options.Options):
     Run the main training procedure
     :param opts: Options defining the details of the training
     """
-
-    os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
-
     # import data from file and create DataFrame
     if "tsv" in opts.inputFile:
         df = fp.importDataFile(
@@ -128,7 +80,7 @@ def train(opts: options.Options):
         df = fp.importDataFile(
             opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
         )
-    # initialize encoders to None
+    # initialize (auto)encoders to None
     encoder = None
     autoencoder = None
     if opts.trainAC:
@@ -142,11 +94,12 @@ def train(opts: options.Options):
     # if feature compression is enabled
     if opts.compressFeatures:
         if not opts.trainAC:
-            if opts.aeType == "deterministic":
-                (autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
-            elif opts.aeType == "variational":
+            if opts.aeType == "variational":
                 (autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
-            elif opts.ecWeightsFile == "":
+            else:
+                (autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
+
+            if opts.ecWeightsFile == "":
                 encoder = load_model(opts.ecModelDir)
             else:
                 autoencoder.load_weights(
@@ -154,14 +107,18 @@ def train(opts: options.Options):
                 )
         # compress the fingerprints using the autoencoder
         df = ac.compress_fingerprints(df, encoder)
-        # ac.visualize_fingerprints(
-        #     df,
-        #     before_col="fp",
-        #     after_col="fpcompressed",
-        #     train_indices=train_indices,
-        #     test_indices=test_indices,
-        #     save_as=f"UMAP_{opts.aeSplitType}.png",
-        # )
+        if opts.visualizeLatent and opts.trainAC:
+            ac.visualize_fingerprints(
+                df,
+                train_indices=train_indices,
+                test_indices=test_indices,
+                save_as=f"{opts.ecModelDir}/UMAP_{opts.aeSplitType}.png",
+            )
+        elif opts.visualizeLatent:
+            logging.info(
+                "Visualizing latent space is only available if you train the autoencoder. Skipping visualization."
+            )
+
     # train single label models if requested
     if opts.trainFNN and not opts.enableMultiLabel:
         sl.train_single_label_models(df=df, opts=opts)
@@ -257,24 +214,22 @@ def main():
                 raise ValueError("Input directory is not a directory")
         elif prog_args.method == "traingnn":
             traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
-
+            createLogger("traingnn.log")
             traindmpnn(traingnn_opts)
 
         elif prog_args.method == "predictgnn":
-            predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
-            fixed_opts = dataclasses.replace(
-                predictgnn_opts,
-                test_path=makePathAbsolute(predictgnn_opts.test_path),
-                preds_path=makePathAbsolute(predictgnn_opts.preds_path),
-            )
-
-            logging.info(
-                f"The following arguments are received or filled with default values:\n{prog_args}"
-            )
-
-            predictdmpnn(fixed_opts, prog_args.configFile)
+            predictgnn_opts = options.PredictGnnOptions.fromCmdArgs(prog_args)
+            createLogger("predictgnn.log")
+            predictdmpnn(predictgnn_opts)
+        elif prog_args.method == "interpretgnn":
+            interpretgnn_opts = options.InterpretGNNoptions.fromCmdArgs(prog_args)
+            createLogger("interpretgnn.log")
+            interpretdmpnn(interpretgnn_opts)
 
         elif prog_args.method == "train":
+            if prog_args.configFile is None and prog_args.inputFile is None:
+                parser.error("Either --configFile or --inputFile must be provided.")
+
             train_opts = options.Options.fromCmdArgs(prog_args)
             fixed_opts = dataclasses.replace(
                 train_opts,
@@ -288,6 +243,8 @@ def main():
             )
             train(fixed_opts)
         elif prog_args.method == "predict":
+            if prog_args.configFile is None and prog_args.inputFile is None:
+                parser.error("Either --configFile or --inputFile must be provided.")
             predict_opts = options.Options.fromCmdArgs(prog_args)
             fixed_opts = dataclasses.replace(
                 predict_opts,
@@ -298,8 +255,6 @@ def main():
                 ),
                 ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
                 fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
-                trainAC=False,
-                trainFNN=False,
             )
             createDirectory(fixed_opts.outputDir)
             createLogger(path.join(fixed_opts.outputDir, "predict.log"))
diff --git a/dfpl/options.py b/dfpl/options.py
index 6d84dbc4..85e245bc 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -3,12 +3,13 @@
 import argparse
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional, Literal, List
 
 import jsonpickle
 import torch
-from chemprop.args import TrainArgs
+from chemprop.args import TrainArgs, PredictArgs, InterpretArgs
 
-from dfpl.utils import makePathAbsolute
+from dfpl.utils import parseCmdArgs
 
 
 @dataclass
@@ -17,51 +18,51 @@ class Options:
     Dataclass for all options necessary for training the neural nets
     """
 
-    configFile: str = "./example/train.json"
-    inputFile: str = "/deepFPlearn/CMPNN/data/tox21.csv"
-    outputDir: str = "."
-    outputFile: str = ""
-    ecWeightsFile: str = "AE.encoder.weights.hdf5"
-    ecModelDir: str = "AE_encoder"
-    fnnModelDir: str = "modeltraining"
+    configFile: str = None
+    inputFile: str = "tests/data/smiles.csv"
+    outputDir: str = "example/results_train/"  # changes according to mode
+    outputFile: str = "results.csv"
+    ecWeightsFile: str = ""
+    ecModelDir: str = "example/results_train/AE_encoder/"
+    fnnModelDir: str = "example/results_train/AR_saved_model/"
     type: str = "smiles"
     fpType: str = "topological"  # also "MACCS", "atompairs"
-    epochs: int = 512
+    epochs: int = 100
     fpSize: int = 2048
     encFPSize: int = 256
-    kFolds: int = 0
+    kFolds: int = 1
     testSize: float = 0.2
     enableMultiLabel: bool = False
-    verbose: int = 0
-    trainAC: bool = True  # if set to False, an AC weight file must be provided!
+    verbose: int = 2
+    trainAC: bool = False
     trainFNN: bool = True
-    compressFeatures: bool = True
-    sampleFractionOnes: float = 0.5  # Only used when value is in [0,1]
+    compressFeatures: bool = False
+    sampleFractionOnes: float = 0.5
     sampleDown: bool = False
     split_type: str = "random"
     aeSplitType: str = "random"
     aeType: str = "deterministic"
-    aeEpochs: int = 3000
+    aeEpochs: int = 100
     aeBatchSize: int = 512
     aeLearningRate: float = 0.001
-    aeLearningRateDecay: float = 0.01
-    aeActivationFunction: str = "relu"
+    aeLearningRateDecay: float = 0.96
+    aeActivationFunction: str = "selu"
     aeOptimizer: str = "Adam"
     fnnType: str = "FNN"
     batchSize: int = 128
     optimizer: str = "Adam"
     learningRate: float = 0.001
+    learningRateDecay: float = 0.96
     lossFunction: str = "bce"
     activationFunction: str = "relu"
     l2reg: float = 0.001
     dropout: float = 0.2
     threshold: float = 0.5
-    gpu: str = ""
-    snnDepth = 8
-    snnWidth = 50
-    aeWabTracking: str = ""  # Wand & Biases autoencoder tracking
-    wabTracking: str = ""  # Wand & Biases FNN tracking
-    wabTarget: str = "ER"  # Wand & Biases target used for showing training progress
+    visualizeLatent: bool = False  # only takes effect when the autoencoder is trained (trainAC)
+    gpu: int = None
+    aeWabTracking: bool = False  # Weights & Biases autoencoder tracking
+    wabTracking: bool = False  # Weights & Biases FNN tracking
+    wabTarget: str = "AR"  # Weights & Biases target used for showing training progress
 
     def saveToFile(self, file: str) -> None:
         """
@@ -72,42 +73,8 @@ def saveToFile(self, file: str) -> None:
             f.write(jsonpickle.encode(self))
 
     @classmethod
-    def fromJson(cls, file: str) -> Options:
-        """
-        Create an instance from a JSON file
-        """
-        jsonFile = Path(file)
-        if jsonFile.exists() and jsonFile.is_file():
-            with jsonFile.open() as f:
-                content = f.read()
-                return jsonpickle.decode(content)
-        raise ValueError("JSON file does not exist or is not readable")
-
-    @classmethod
-    def fromCmdArgs(cls, args: argparse.Namespace) -> Options:
-        """
-        Creates Options instance from cmdline arguments.
-
-        If a training file (JSON) is provided, the values from that file are used.
-        However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the
-        JSON file and on the commandline, then the value of the commandline argument will be used.
-        """
-        result = Options()
-        if "configFile" in vars(args).keys():
-            jsonFile = Path(makePathAbsolute(args.configFile))
-            if jsonFile.exists() and jsonFile.is_file():
-                with jsonFile.open() as f:
-                    content = f.read()
-                    result = jsonpickle.decode(content)
-            else:
-                raise ValueError("Could not find JSON input file")
-
-        for key, value in vars(args).items():
-            # The args dict will contain a "method" key from the subparser.
-            # We don't use this.
-            if key != "method":
-                result.__setattr__(key, value)
-        return result
+    def fromCmdArgs(cls, args: argparse.Namespace) -> "Options":
+        return parseCmdArgs(cls, args)
 
 
 @dataclass
@@ -132,39 +99,114 @@ class GnnOptions(TrainArgs):
     preds_path: str = "./tox21dmpnn.csv"
     test_path: str = ""
     save_preds: bool = True
+    calibration_method: str = "none"
+    uncertainty_method: str = "none"
+    calibration_path: str = ""
+    evaluation_methods: str = "none"
+    evaluation_scores_path: str = ""
+    wabTracking: bool = False
+    split_sizes: List[float] = None
+    # save_smiles_splits: bool = False
+    @classmethod
+    def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None):
+        # Initialize with JSON config if provided
+        if json_config:
+            opts = cls(**json_config)
+        else:
+            opts = cls()
+
+        # Update with command-line arguments
+        for key, value in vars(args).items():
+            if value is not None:
+                setattr(opts, key, value)
+
+        return opts
+
+
+class PredictGnnOptions(PredictArgs):
+    """
+    Options for predicting with trained graph (D-MPNN) models; extends chemprop's PredictArgs
+    """
+
+    configFile: str = "./example/predictgnn.json"
+    calibration_atom_descriptors_path: str = None
+    calibration_features_path: str = None
+    calibration_interval_percentile: float = 95
+    calibration_method: Literal[
+        "zscaling",
+        "tscaling",
+        "zelikman_interval",
+        "mve_weighting",
+        "platt",
+        "isotonic",
+    ] = None
+    calibration_path: str = None
+    calibration_phase_features_path: str = None
+    drop_extra_columns: bool = False
+    dropout_sampling_size: int = 10
+    evaluation_methods: List[str] = None
+    evaluation_scores_path: str = None
+    # no_features_scaling: bool = True
+    individual_ensemble_predictions: bool = False
+    preds_path: str = None
+    regression_calibrator_metric: Literal["stdev", "interval"] = None
+    test_path: str = None
+    uncertainty_dropout_p: float = 0.1
+    uncertainty_method: Literal[
+        "mve",
+        "ensemble",
+        "evidential_epistemic",
+        "evidential_aleatoric",
+        "evidential_total",
+        "classification",
+        "dropout",
+    ] = None
 
     @classmethod
-    def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions:
-        """
-        Creates Options instance from cmdline arguments.
+    def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None):
+        # Start from the JSON config when one is given, otherwise from the class defaults
+        if json_config:
+            opts = cls(**json_config)
+        else:
+            opts = cls()
 
-        If a training file (JSON) is provided, the values from that file are used.
-        However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the
-        JSON file and on the commandline, then the value of the commandline argument will be used.
-        """
-        result = GnnOptions()
-        if "configFile" in vars(args).keys():
-            jsonFile = Path(makePathAbsolute(args.configFile))
-            if jsonFile.exists() and jsonFile.is_file():
-                with jsonFile.open() as f:
-                    content = f.read()
-                    result = jsonpickle.decode(content)
-            else:
-                raise ValueError("Could not find JSON input file")
-
-        return result
+        # Override with every command-line argument whose value is not None
+        for key, value in vars(args).items():
+            if value is not None:
+                setattr(opts, key, value)
+
+        return opts
+
+
+class InterpretGNNoptions(InterpretArgs):
+    """
+    Options for interpreting trained graph (D-MPNN) models; extends chemprop's InterpretArgs
+    """
+
+    configFile: str = "./example/interpret.json"
+    data_path: str = "./example/data/smiles.csv"
+    batch_size: int = 500
+    c_puct: float = 10.0
+    max_atoms: int = 20
+    min_atoms: int = 8
+    prop_delta: float = 0.5
+    property_id: List[int] = None
+    rollout: int = 20
 
     @classmethod
-    def fromJson(cls, file: str) -> GnnOptions:
-        """
-        Create an instance from a JSON file
-        """
-        jsonFile = Path(file)
-        if jsonFile.exists() and jsonFile.is_file():
-            with jsonFile.open() as f:
-                content = f.read()
-                return jsonpickle.decode(content)
-        raise ValueError("JSON file does not exist or is not readable")
+    def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None):
+        # Start from the JSON config when one is given, otherwise from the class defaults
+        if json_config:
+            opts = cls(**json_config)
+        else:
+            opts = cls()
+
+        # Override with every command-line argument whose value is not None
+        for key, value in vars(args).items():
+            if value is not None:
+                setattr(opts, key, value)
+
+        return opts
 
 
 def createCommandlineParser() -> argparse.ArgumentParser:
@@ -186,6 +228,12 @@ def createCommandlineParser() -> argparse.ArgumentParser:
     parser_predict_gnn.set_defaults(method="predictgnn")
     parsePredictGnn(parser_predict_gnn)
 
+    parser_interpret_gnn = subparsers.add_parser(
+        "interpretgnn", help="Interpret your GNN models"
+    )
+    parser_interpret_gnn.set_defaults(method="interpretgnn")
+    parseInterpretGnn(parser_interpret_gnn)
+
     parser_train = subparsers.add_parser(
         "train", help="Train new models with your data"
     )
@@ -225,7 +273,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="FILE",
         type=str,
         help="Input JSON file that contains all information for training/predicting.",
-        default=argparse.SUPPRESS,
+        default="example/train.json",
     )
     general_args.add_argument(
         "-i",
@@ -234,7 +282,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         help="The file containing the data for training in "
         "comma separated CSV format.The first column should be smiles.",
-        default=argparse.SUPPRESS,
+        default="tests/data/smiles.csv",
     )
     general_args.add_argument(
         "-o",
@@ -243,8 +291,10 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         help="Prefix of output file name. Trained model and "
         "respective stats will be returned in this directory.",
-        default=argparse.SUPPRESS,
+        default="example/results_train/",
     )
+
+    # TODO CHECK WHAT IS TYPE DOING?
     general_args.add_argument(
         "-t",
         "--type",
@@ -252,7 +302,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         choices=["fp", "smiles"],
         help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
-        default=argparse.SUPPRESS,
+        default="fp",
     )
     general_args.add_argument(
         "-thr",
@@ -260,47 +310,41 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=float,
         metavar="FLOAT",
         help="Threshold for binary classification.",
-        default=argparse.SUPPRESS,
+        default=0.5,
     )
     general_args.add_argument(
         "-gpu",
         "--gpu",
         metavar="INT",
         type=int,
-        help="Select which gpu to use. If not available, leave empty.",
-        default=argparse.SUPPRESS,
+        help="Select which gpu to use by index. If not available, leave empty",
+        default=None,
     )
     general_args.add_argument(
-        "-k",
         "--fpType",
         metavar="STR",
         type=str,
-        choices=["topological", "MACCS"],  # , 'atompairs', 'torsions'],
-        help="The type of fingerprint to be generated/used in input file.",
-        default=argparse.SUPPRESS,
+        choices=["topological", "MACCS"],
+        help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.",
+        default="topological",
     )
     general_args.add_argument(
-        "-s",
         "--fpSize",
         type=int,
-        help="Size of fingerprint that should be generated.",
-        default=argparse.SUPPRESS,
+        help="Length of the fingerprint that should be generated.",
+        default=2048,
     )
     general_args.add_argument(
-        "-c",
         "--compressFeatures",
-        metavar="BOOL",
-        type=bool,
-        help="Should the fingerprints be compressed or not. Activates the autoencoder. ",
-        default=argparse.SUPPRESS,
+        action="store_true",
+        help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.",
+        default=False,
     )
     general_args.add_argument(
-        "-m",
         "--enableMultiLabel",
-        metavar="BOOL",
-        type=bool,
+        action="store_true",
         help="Train multi-label classification model in addition to the individual models.",
-        default=argparse.SUPPRESS,
+        default=False,
     )
     # Autoencoder Configuration
     autoencoder_args.add_argument(
@@ -309,14 +353,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         metavar="FILE",
         help="The .hdf5 file of a trained encoder",
-        default=argparse.SUPPRESS,
+        default="",
     )
     autoencoder_args.add_argument(
         "--ecModelDir",
         type=str,
         metavar="DIR",
         help="The directory where the full model of the encoder will be saved",
-        default=argparse.SUPPRESS,
+        default="example/results_train/AE_encoder/",
     )
     autoencoder_args.add_argument(
         "--aeType",
@@ -324,21 +368,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         choices=["variational", "deterministic"],
         help="Autoencoder type, variational or deterministic.",
-        default=argparse.SUPPRESS,
+        default="deterministic",
     )
     autoencoder_args.add_argument(
         "--aeEpochs",
         metavar="INT",
         type=int,
         help="Number of epochs for autoencoder training.",
-        default=argparse.SUPPRESS,
+        default=100,
     )
     autoencoder_args.add_argument(
         "--aeBatchSize",
         metavar="INT",
         type=int,
         help="Batch size in autoencoder training.",
-        default=argparse.SUPPRESS,
+        default=512,
     )
     autoencoder_args.add_argument(
         "--aeActivationFunction",
@@ -346,21 +390,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         choices=["relu", "selu"],
         help="The activation function for the hidden layers in the autoencoder.",
-        default=argparse.SUPPRESS,
+        default="relu",
     )
     autoencoder_args.add_argument(
         "--aeLearningRate",
         metavar="FLOAT",
         type=float,
         help="Learning rate for autoencoder training.",
-        default=argparse.SUPPRESS,
+        default=0.001,
     )
     autoencoder_args.add_argument(
         "--aeLearningRateDecay",
         metavar="FLOAT",
         type=float,
         help="Learning rate decay for autoencoder training.",
-        default=argparse.SUPPRESS,
+        default=0.96,
     )
     autoencoder_args.add_argument(
         "--aeSplitType",
@@ -368,7 +412,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         choices=["scaffold_balanced", "random", "molecular_weight"],
         help="Set how the data is going to be split for the autoencoder",
-        default=argparse.SUPPRESS,
+        default="random",
     )
     autoencoder_args.add_argument(
         "-d",
@@ -376,7 +420,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="INT",
         type=int,
         help="Size of encoded fingerprint (z-layer of autoencoder).",
-        default=argparse.SUPPRESS,
+        default=256,
+    )
+    autoencoder_args.add_argument(
+        "--visualizeLatent",
+        action="store_true",
+        help="UMAP the latent space for exploration",
+        default=False,
     )
     # Training Configuration
     training_args.add_argument(
@@ -385,15 +435,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         choices=["scaffold_balanced", "random", "molecular_weight"],
         help="Set how the data is going to be split for the feedforward neural network",
-        default=argparse.SUPPRESS,
+        default="random",
     )
     training_args.add_argument(
-        "-l",
         "--testSize",
         metavar="FLOAT",
         type=float,
         help="Fraction of the dataset that should be used for testing. Value in [0,1].",
-        default=argparse.SUPPRESS,
+        default=0.2,
     )
     training_args.add_argument(
         "-K",
@@ -401,7 +450,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="INT",
         type=int,
         help="K that is used for K-fold cross-validation in the training procedure.",
-        default=argparse.SUPPRESS,
+        default=1,
     )
     training_args.add_argument(
         "-v",
@@ -411,21 +460,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         choices=[0, 1, 2],
         help="Verbosity level. O: No additional output, "
         + "1: Some additional output, 2: full additional output",
-        default=argparse.SUPPRESS,
+        default=2,
     )
     training_args.add_argument(
         "--trainAC",
-        metavar="BOOL",
-        type=bool,
+        action="store_true",
         help="Choose to train or not, the autoencoder based on the input file",
-        default=argparse.SUPPRESS,
+        default=False,
     )
     training_args.add_argument(
         "--trainFNN",
-        metavar="BOOL",
-        type=bool,
-        help="Train the feedforward network either with provided weights.",
-        default=argparse.SUPPRESS,
+        action="store_false",
+        help="When called it deactivates the training.",
+        default=True,
     )
     training_args.add_argument(
         "--sampleFractionOnes",
@@ -433,14 +480,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=float,
         help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)."
         "only works if --sampleDown is enabled",
-        default=argparse.SUPPRESS,
+        default=0.5,
     )
     training_args.add_argument(
         "--sampleDown",
         metavar="BOOL",
         type=bool,
         help="Enable automatic down sampling of the 0 valued samples.",
-        default=argparse.SUPPRESS,
+        default=False,
     )
     training_args.add_argument(
         "-e",
@@ -448,52 +495,60 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="INT",
         type=int,
         help="Number of epochs that should be used for the FNN training",
-        default=argparse.SUPPRESS,
+        default=100,
     )
-
+    # TODO CHECK IF ALL LOSSES MAKE SENSE HERE
     training_args.add_argument(
         "--lossFunction",
         metavar="STRING",
         type=str,
         choices=["mse", "bce", "focal"],
         help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.",
-        default=argparse.SUPPRESS,
+        default="bce",
     )
+    # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS?
     training_args.add_argument(
         "--optimizer",
         metavar="STRING",
         type=str,
         choices=["Adam", "SGD"],
         help='Optimizer to use for backpropagation in the FNN. Possible values: "Adam", "SGD"',
-        default=argparse.SUPPRESS,
+        default="Adam",
     )
     training_args.add_argument(
         "--batchSize",
         metavar="INT",
         type=int,
         help="Batch size in FNN training.",
-        default=argparse.SUPPRESS,
+        default=128,
     )
     training_args.add_argument(
         "--l2reg",
         metavar="FLOAT",
         type=float,
         help="Value for l2 kernel regularizer.",
-        default=argparse.SUPPRESS,
+        default=0.001,
     )
     training_args.add_argument(
         "--dropout",
         metavar="FLOAT",
         type=float,
         help="The fraction of data that is dropped out in each dropout layer.",
-        default=argparse.SUPPRESS,
+        default=0.2,
     )
     training_args.add_argument(
         "--learningRate",
         metavar="FLOAT",
         type=float,
         help="Learning rate size in FNN training.",
-        default=argparse.SUPPRESS,
+        default=0.000022,
+    )
+    training_args.add_argument(
+        "--learningRateDecay",
+        metavar="FLOAT",
+        type=float,
+        help="Learning rate decay in FNN training.",
+        default=0.96,
     )
     training_args.add_argument(
         "--activationFunction",
@@ -501,7 +556,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         choices=["relu", "selu"],
         help="The activation function for hidden layers in the FNN.",
-        default=argparse.SUPPRESS,
+        default="relu",
     )
     # Tracking Configuration
     tracking_args.add_argument(
@@ -509,14 +564,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="BOOL",
         type=bool,
         help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.",
-        default=argparse.SUPPRESS,
+        default=False,
     )
     tracking_args.add_argument(
         "--wabTracking",
         metavar="BOOL",
         type=bool,
         help="Track FNN performance via Weights & Biases, see https://wandb.ai.",
-        default=argparse.SUPPRESS,
+        default=False,
     )
     tracking_args.add_argument(
         "--wabTarget",
@@ -524,7 +579,112 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         type=str,
         choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"],
         help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.",
-        default=argparse.SUPPRESS,
+        default="AR",
+    )
+
+
+def parseInputPredict(parser: argparse.ArgumentParser) -> None:
+    """
+    Parse the input arguments.
+
+    :return: A namespace object built up from attributes parsed out of the cmd line.
+    """
+
+    general_args = parser.add_argument_group("General Configuration")
+    files_args = parser.add_argument_group("Files")
+    files_args.add_argument(
+        "-f",
+        "--configFile",
+        metavar="FILE",
+        type=str,
+        help="Input JSON file that contains all information for training/predicting.",
+    )
+    files_args.add_argument(
+        "-i",
+        "--inputFile",
+        metavar="FILE",
+        type=str,
+        help="The file containing the data for the prediction in (unquoted) "
+        "comma separated CSV format. The column named 'smiles' or 'fp' "
+        "contains the field to be predicted. Please adjust the type "
+        "that should be predicted (fp or smiles) with -t option appropriately. "
+        "An optional column 'id' is used to assign the outcomes to the "
+        "original identifiers. If this column is missing, the results are "
+        "numbered in the order of their appearance in the input file. "
+        "A header is expected and respective column names are used.",
+        default="tests/data/smiles.csv",
+    )
+    files_args.add_argument(
+        "-o",
+        "--outputDir",
+        metavar="DIR",
+        type=str,
+        help="Prefix of output directory. It will contain a log file and the file specified "
+        "with --outputFile.",
+        default="example/results_predict/",
+    )
+    files_args.add_argument(
+        "--outputFile",
+        metavar="FILE",
+        type=str,
+        help="Output .CSV file name which will contain one prediction per input line. "
+        "Default: prefix of input file name.",
+        default="results.csv",
+    )
+    # TODO: check whether this option is still needed and whether SMILES input can be processed at all.
+    general_args.add_argument(
+        "-t",
+        "--type",
+        metavar="STR",
+        type=str,
+        choices=["fp", "smiles"],
+        help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
+        default="fp",
+    )
+    general_args.add_argument(
+        "-k",
+        "--fpType",
+        metavar="STR",
+        type=str,
+        choices=["topological", "MACCS"],
+        help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.",
+        default="topological",
+    )
+    files_args.add_argument(
+        "--ecModelDir",
+        type=str,
+        metavar="DIR",
+        help="The directory where the full model of the encoder will be saved (if trainAE=True) or "
+        "loaded from (if trainAE=False). Provide a full path here.",
+        default="",
+    )
+    files_args.add_argument(
+        "--ecWeightsFile",
+        type=str,
+        metavar="STR",
+        help="The file  where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.",
+        default="",
+    )
+    files_args.add_argument(
+        "--fnnModelDir",
+        type=str,
+        metavar="DIR",
+        help="The directory where the full model of the fnn is loaded from. "
+        "Provide a full path here.",
+        default="example/results_train/AR_saved_model",
+    )
+    general_args.add_argument(
+        "-c", "--compressFeatures", action="store_true", default=False
+    )
+    (
+        general_args.add_argument(
+            "--aeType",
+            metavar="STRING",
+            type=str,
+            choices=["variational", "deterministic"],
+            help="Autoencoder type, variational or deterministic.",
+            default="deterministic",
+        )
     )
 
 
@@ -534,21 +694,62 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     files_args = parser.add_argument_group("Files")
     model_args = parser.add_argument_group("Model arguments")
     training_args = parser.add_argument_group("Training Configuration")
+    uncertainty_args = parser.add_argument_group("Uncertainty Configuration")
+    uncertainty_args.add_argument(
+        "--uncertainty_method",
+        type=str,
+        metavar="STRING",
+        choices=[
+            "mve",
+            "ensemble",
+            "evidential_epistemic",
+            "evidential_aleatoric",
+            "evidential_total",
+            "classification",
+            "dropout",
+            "dirichlet",
+        ],
+        help="Method to use for uncertainty estimation",
+        default="none",
+    )
+    # Uncertainty arguments
+    uncertainty_args.add_argument(
+        "--calibration_method",
+        type=str,
+        metavar="STRING",
+        choices=[
+            "zscaling",
+            "tscaling",
+            "zelikman_interval",
+            "mve_weighting",
+            "platt",
+            "isotonic",
+        ],
+        help="Method to use for calibration",
+        default="none",
+    )
+    uncertainty_args.add_argument(
+        "--calibration_path",
+        type=str,
+        metavar="FILE",
+        help="Path to file with calibration data",
+    )
 
     # General arguments
     general_args.add_argument("--split_key_molecule", type=int)
     general_args.add_argument("--pytorch_seed", type=int)
     general_args.add_argument("--cache_cutoff", type=float)
     general_args.add_argument("--save_preds", type=bool)
+    general_args.add_argument("--wabTracking", action="store_true", default=False)
     general_args.add_argument(
         "--cuda", action="store_true", default=False, help="Turn on cuda"
     )
-    general_args.add_argument(
-        "--save_smiles_splits",
-        action="store_true",
-        default=False,
-        help="Save smiles for each train/val/test splits for prediction convenience later",
-    )
+    # general_args.add_argument(
+    #     "--save_smiles_splits",
+    #     action="store_true",
+    #     default=False,
+    #     help="Save smiles for each train/val/test splits for prediction convenience later",
+    # )
     general_args.add_argument(
         "--test",
         action="store_true",
@@ -575,9 +776,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         default=10,
         help="The number of batches between each logging of the training loss",
     )
-    general_args.add_argument(
-        "--no_cuda", action="store_true", default=True, help="Turn off cuda"
-    )
     general_args.add_argument(
         "--no_cache",
         action="store_true",
@@ -593,13 +791,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         type=str,
         help="Input JSON file that contains all information for training/predicting.",
     )
-    files_args.add_argument(
-        "--config_path",
-        type=str,
-        metavar="FILE",
-        help="Path to a .json file containing arguments. Any arguments present in the config"
-        "file will override arguments specified via the command line or by the defaults.",
-    )
     files_args.add_argument(
         "--save_dir",
         type=str,
@@ -1034,141 +1225,149 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     )
 
 
-def parseInputPredict(parser: argparse.ArgumentParser) -> None:
-    """
-    Parse the input arguments.
-
-    :return: A namespace object built up from attributes parsed out of the cmd line.
-    """
-
+def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
     general_args = parser.add_argument_group("General Configuration")
     files_args = parser.add_argument_group("Files")
+    uncertainty_args = parser.add_argument_group("Uncertainty Configuration")
+
+    general_args.add_argument(
+        "--checkpoint_path",
+        type=str,
+        metavar="FILE",
+        help="Path to model checkpoint (.pt file)"
+    )
+    # general_args.add_argument(
+    #     "--no_features_scaling",
+    #     action="store_true",
+    #     help="Turn on scaling of features",
+    # )
     files_args.add_argument(
         "-f",
         "--configFile",
-        metavar="FILE",
         type=str,
-        help="Input JSON file that contains all information for training/predicting.",
-        default=argparse.SUPPRESS,
-    )
-    files_args.add_argument(
-        "-i",
-        "--inputFile",
         metavar="FILE",
-        type=str,
-        help="The file containing the data for the prediction in (unquoted) "
-        "comma separated CSV format. The column named 'smiles' or 'fp'"
-        "contains the field to be predicted. Please adjust the type "
-        "that should be predicted (fp or smile) with -t option appropriately."
-        "An optional column 'id' is used to assign the outcomes to the"
-        "original identifiers. If this column is missing, the results are"
-        "numbered in the order of their appearance in the input file."
-        "A header is expected and respective column names are used.",
-        default=argparse.SUPPRESS,
+        help="Path to a .json file containing arguments. Any arguments present in the config "
+        "file will override arguments specified via the command line or by the defaults.",
     )
     files_args.add_argument(
-        "-o",
-        "--outputDir",
-        metavar="DIR",
+        "--test_path",
         type=str,
-        help="Prefix of output directory. It will contain a log file and the file specified"
-        "with --outputFile.",
-        default=argparse.SUPPRESS,
+        help="Path to CSV file containing testing data for which predictions will be made.",
     )
     files_args.add_argument(
-        "--outputFile",
-        metavar="FILE",
+        "--preds_path",
         type=str,
-        help="Output .CSV file name which will contain one prediction per input line. "
-        "Default: prefix of input file name.",
-        default=argparse.SUPPRESS,
+        help="Path to CSV or PICKLE file where predictions will be saved.",
     )
-    general_args.add_argument(
-        "-t",
-        "--type",
-        metavar="STR",
+    files_args.add_argument(
+        "--calibration_path",
         type=str,
-        choices=["fp", "smiles"],
-        help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
-        default=argparse.SUPPRESS,
+        help="Path to data file to be used for uncertainty calibration.",
     )
-    general_args.add_argument(
-        "-k",
-        "--fpType",
-        metavar="STR",
+    files_args.add_argument(
+        "--calibration_features_path",
         type=str,
-        choices=["topological", "MACCS"],  # , 'atompairs', 'torsions'],
-        help="The type of fingerprint to be generated/used in input file.",
-        default=argparse.SUPPRESS,
+        nargs="+",
+        help="Path to features data to be used with the uncertainty calibration dataset.",
     )
+    files_args.add_argument("--calibration_phase_features_path", type=str, help="")
     files_args.add_argument(
-        "--ecModelDir",
+        "--calibration_atom_descriptors_path",
         type=str,
-        metavar="DIR",
-        help="The directory where the full model of the encoder will be saved (if trainAE=True) or "
-        "loaded from (if trainAE=False). Provide a full path here.",
-        default=argparse.SUPPRESS,
+        help="Path to the extra atom descriptors.",
     )
     files_args.add_argument(
-        "--fnnModelDir",
+        "--calibration_bond_descriptors_path",
         type=str,
-        metavar="DIR",
-        help="The directory where the full model of the fnn is loaded from. "
-        "Provide a full path here.",
-        default=argparse.SUPPRESS,
+        help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.",
     )
 
+    general_args.add_argument(
+        "--drop_extra_columns",
+        action="store_true",
+        help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.",
+    )
 
-def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
-    general_args = parser.add_argument_group("General Configuration")
-    data_args = parser.add_argument_group("Data Configuration")
-    files_args = parser.add_argument_group("Files")
-    training_args = parser.add_argument_group("Training Configuration")
-    files_args.add_argument(
-        "-f",
-        "--configFile",
-        metavar="FILE",
+    uncertainty_args.add_argument(
+        "--uncertainty_method",
         type=str,
-        help="Input JSON file that contains all information for training/predicting.",
-        default=argparse.SUPPRESS,
+        choices=[
+            "mve",
+            "ensemble",
+            "evidential_epistemic",
+            "evidential_aleatoric",
+            "evidential_total",
+            "classification",
+            "dropout",
+            "spectra_roundrobin",
+            "dirichlet",
+        ],
+        help="The method of calculating uncertainty.",
     )
-    general_args.add_argument(
-        "--gpu",
-        type=int,
-        metavar="INT",
-        choices=list(range(torch.cuda.device_count())),
-        help="Which GPU to use",
+    uncertainty_args.add_argument(
+        "--calibration_method",
+        type=str,
+        nargs="+",
+        choices=[
+            "zscaling",
+            "tscaling",
+            "zelikman_interval",
+            "mve_weighting",
+            "platt",
+            "isotonic",
+        ],
+        help="Methods used for calibrating the uncertainty calculated with uncertainty method.",
     )
-    general_args.add_argument(
-        "--no_cuda", action="store_true", default=False, help="Turn off cuda"
+    uncertainty_args.add_argument("--individual_ensemble_predictions",
+        action="store_true",
+        default=False,
+        help="Whether to save individual ensemble predictions.")
+    uncertainty_args.add_argument(
+        "--evaluation_methods",
+        type=str,
+        nargs="+",
+        help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.",
     )
-    general_args.add_argument(
-        "--num_workers",
+    uncertainty_args.add_argument(
+        "--evaluation_scores_path",
+        type=str,
+        help="Location to save the results of uncertainty evaluations.",
+    )
+    uncertainty_args.add_argument(
+        "--uncertainty_dropout_p",
+        type=float,
+        default=0.1,
+        help="The probability to use for Monte Carlo dropout uncertainty estimation.",
+    )
+    uncertainty_args.add_argument(
+        "--dropout_sampling_size",
         type=int,
-        metavar="INT",
-        help="Number of workers for the parallel data loading 0 means sequential",
+        default=10,
+        help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.",
     )
-    general_args.add_argument(
-        "--no_cache",
-        type=bool,
-        metavar="BOOL",
-        default=False,
-        help="Turn off caching mol2graph computation",
+    uncertainty_args.add_argument(
+        "--calibration_interval_percentile",
+        type=float,
+        default=95,
+        help="Sets the percentile used in the calibration methods. Must be in the range (1,100).",
     )
-    general_args.add_argument(
-        "--no_cache_mol",
-        type=bool,
-        metavar="BOOL",
-        default=False,
-        help="Whether to not cache the RDKit molecule for each SMILES string to reduce memory\
-                             usage cached by default",
+    uncertainty_args.add_argument(
+        "--regression_calibrator_metric",
+        type=str,
+        choices=["stdev", "interval"],
+        help="Regression calibrators can output either a stdev or an interval.",
     )
-    general_args.add_argument(
-        "--empty_cache",
-        type=bool,
-        metavar="BOOL",
-        help="Whether to empty all caches before training or predicting. This is necessary if\
-                             multiple jobs are run within a single script and the atom or bond features change",
+
+
+def parseInterpretGnn(parser: argparse.ArgumentParser) -> None:
+    files_args = parser.add_argument_group("Files")
+    interpret_args = parser.add_argument_group("Interpretation Configuration")
+    files_args.add_argument(
+        "-f",
+        "--configFile",
+        metavar="FILE",
+        type=str,
+        help="Input JSON file that contains all information for interpretation.",
     )
     files_args.add_argument(
         "--preds_path",
@@ -1191,89 +1390,44 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         metavar="DIR",
         help="Path to model checkpoint (.pt file)",
     )
-    files_args.add_argument(
-        "--checkpoint_paths",
-        type=str,
-        metavar="FILE",
-        nargs="*",
-        help="Path to model checkpoint (.pt file)",
-    )
     files_args.add_argument(
         "--data_path",
         type=str,
         metavar="FILE",
         help="Path to CSV file containing testing data for which predictions will be made",
-        default="",
     )
-    files_args.add_argument(
-        "--test_path",
-        type=str,
-        metavar="FILE",
-        help="Path to CSV file containing testing data for which predictions will be made",
-        default="",
-    )
-    files_args.add_argument(
-        "--features_path",
-        type=str,
-        metavar="FILE",
-        nargs="*",
-        help="Path to features to use in FNN (instead of features_generator)",
-    )
-    files_args.add_argument(
-        "--atom_descriptors_path",
-        type=str,
-        metavar="FILE",
-        help="Path to the extra atom descriptors.",
-    )
-    data_args.add_argument(
-        "--use_compound_names",
-        action="store_true",
-        default=False,
-        help="Use when test data file contains compound names in addition to SMILES strings",
-    )
-    data_args.add_argument(
-        "--no_features_scaling",
-        action="store_true",
-        default=False,
-        help="Turn off scaling of features",
-    )
-    data_args.add_argument(
-        "--max_data_size",
+    interpret_args.add_argument(
+        "--max_atoms",
         type=int,
         metavar="INT",
-        help="Maximum number of data points to load",
+        help="Maximum number of atoms to use for interpretation",
     )
-    data_args.add_argument(
-        "--smiles_columns",
-        type=str,
-        metavar="STRING",
-        help="List of names of the columns containing SMILES strings.By default, uses the first\
-                             number_of_molecules columns.",
-    )
-    data_args.add_argument(
-        "--number_of_molecules",
+
+    interpret_args.add_argument(
+        "--min_atoms",
         type=int,
         metavar="INT",
-        help="Number of molecules in each input to the model.This must equal the length of\
-                             smiles_columns if not None",
+        help="Minimum number of atoms to use for interpretation",
     )
 
-    data_args.add_argument(
-        "--atom_descriptors",
-        type=bool,
-        metavar="Bool",
-        help="Use or not atom descriptors",
+    interpret_args.add_argument(
+        "--prop_delta",
+        type=float,
+        metavar="FLOAT",
+        help="The minimum change in the property of interest that is considered significant",
     )
-
-    data_args.add_argument(
-        "--bond_features_size",
+    interpret_args.add_argument(
+        "--property_id",
         type=int,
         metavar="INT",
-        help="Size of the extra bond descriptors that will be used as bond features to featurize a\
-                             given molecule",
+        help="The index of the property of interest",
     )
-    training_args.add_argument(
-        "--batch_size", type=int, metavar="INT", default=50, help="Batch size"
+    # write the argument for rollouts
+    interpret_args.add_argument(
+        "--rollout",
+        type=int,
+        metavar="INT",
+        help="The number of rollouts to use for interpretation",
     )
 
 
diff --git a/dfpl/utils.py b/dfpl/utils.py
index db3d6ec1..ccf931df 100644
--- a/dfpl/utils.py
+++ b/dfpl/utils.py
@@ -1,12 +1,16 @@
+import argparse
 import json
 import logging
 import os
 import pathlib
+import sys
 import warnings
 from collections import defaultdict
+from pathlib import Path
 from random import Random
-from typing import Dict, List, Set, Tuple, Union
+from typing import Dict, List, Set, Tuple, Type, TypeVar, Union
 
+import jsonpickle
 import numpy as np
 import pandas as pd
 from rdkit import Chem, RDLogger
@@ -14,7 +18,48 @@
 from rdkit.Chem.Scaffolds import MurckoScaffold
 from tqdm import tqdm
 
+# Define a type variable
+
+
 RDLogger.DisableLog("rdApp.*")
+T = TypeVar("T")
+
+
+def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T:
+    """
+    Parses command-line arguments to create an instance of the given class.
+
+    Args:
+    cls: The class to create an instance of.
+    args: argparse.Namespace containing the command-line arguments.
+
+    Returns:
+    An instance of cls populated with values from the command-line arguments.
+    """
+    # Extract argument flags from sys.argv
+    arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")}
+
+    # Create the result instance, which will be modified and returned
+    result = cls()
+
+    # Load JSON file if specified
+    if hasattr(args, "configFile") and args.configFile:
+        jsonFile = Path(args.configFile)
+        if jsonFile.exists() and jsonFile.is_file():
+            with jsonFile.open() as f:
+                content = jsonpickle.decode(f.read())
+                for key, value in vars(content).items():
+                    setattr(result, key, value)
+        else:
+            raise ValueError("Could not find JSON input file")
+
+    # Override with user-provided command-line arguments
+    for key in arg_flags:
+        if hasattr(args, key):
+            user_value = getattr(args, key, None)
+            setattr(result, key, user_value)
+
+    return result
 
 
 def makePathAbsolute(p: str) -> str:
@@ -30,23 +75,49 @@ def createDirectory(directory: str):
     if not os.path.exists(path):
         os.makedirs(path)
 
+def parse_cli_list(value: str):
+    # Simple parser for lists passed as comma-separated values
+    return value.split(',')
+
+def parse_cli_boolean(cli_args, cli_arg_key):
+    # Determines boolean value based on command line presence
+    if cli_arg_key in cli_args:
+        return True  # Presence of flag implies True
+    return False
 
-def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool):
+def createArgsFromJson(jsonFile: str):
     arguments = []
-    with open(in_json, "r") as f:
+    ignore_elements = ["py/object"]
+    cli_args = sys.argv[1:]  # Skipping the script name itself
+
+    with open(jsonFile, "r") as f:
         data = json.load(f)
+
+    processed_cli_keys = []  # To track which CLI keys have been processed
+
     for key, value in data.items():
         if key not in ignore_elements:
-            if key == "extra_metrics" and isinstance(value, list):
-                arguments.append("--extra_metrics")
-                arguments.extend(value)
+            cli_arg_key = f"--{key}"
+            if cli_arg_key in cli_args:
+                processed_cli_keys.append(cli_arg_key)
+                arg_index = cli_args.index(cli_arg_key) + 1
+                if isinstance(value, bool):
+                    value = parse_cli_boolean(cli_args, cli_arg_key)
+                elif arg_index < len(cli_args):
+                    cli_value = cli_args[arg_index]
+                    if isinstance(value, list):
+                        value = parse_cli_list(cli_value)
+                    else:
+                        value = cli_value  # Override JSON value with command-line value
+            if isinstance(value, bool) and value:
+                arguments.append(cli_arg_key)
+            elif isinstance(value, list):
+                arguments.append(cli_arg_key)
+                arguments.extend(map(str, value))  # Ensure all elements are strings
             else:
-                arguments.append("--" + str(key))
-                arguments.append(str(value))
-    if return_json_object:
-        return arguments, data
-    return arguments
+                arguments.extend([cli_arg_key, str(value)])
 
+    return arguments
 
 def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool):
     """
@@ -76,49 +147,6 @@ def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool):
     return mol
 
 
-def generate_scaffold(
-    mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True
-) -> str:
-    """
-    Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey.
-
-    :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string.
-    :param include_chirality: Whether to include chirality in the computed scaffold.
-    :return: The Bemis-Murcko scaffold for the molecule.
-    """
-    if isinstance(mol, str):
-        if mol.startswith("InChI="):
-            mol = inchi_to_mol(mol)
-        else:
-            mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False)
-    elif isinstance(mol, tuple):
-        mol = mol[0]
-    scaffold = MurckoScaffold.MurckoScaffoldSmiles(
-        mol=mol, includeChirality=include_chirality
-    )
-
-    return scaffold
-
-
-def scaffold_to_smiles(
-    mols: List[str], use_indices: bool = False
-) -> Dict[str, Union[Set[str], Set[int]]]:
-    """
-    Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices).
-    :param mols: A list of SMILES.
-    :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than
-                        mapping to the smiles string itself. This is necessary if there are duplicate smiles.
-    :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold.
-    """
-    scaffolds = defaultdict(set)
-    for i, mol in tqdm(enumerate(mols), total=len(mols)):
-        scaffold = generate_scaffold(mol)
-        if use_indices:
-            scaffolds[scaffold].add(i)
-        else:
-            scaffolds[scaffold].add(mol)
-
-    return scaffolds
 
 
 # def inchi_to_mol(inchi: str) -> Chem.Mol:
@@ -184,7 +212,49 @@ def weight_split(
     test_df = sorted_data.iloc[test_indices].reset_index(drop=True)
 
     return train_df, val_df, test_df
+def generate_scaffold(
+    mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True
+) -> str:
+    """
+    Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey.
 
+    :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string.
+    :param include_chirality: Whether to include chirality in the computed scaffold.
+    :return: The Bemis-Murcko scaffold for the molecule.
+    """
+    if isinstance(mol, str):
+        if mol.startswith("InChI="):
+            mol = inchi_to_mol(mol)
+        else:
+            mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False)
+    elif isinstance(mol, tuple):
+        mol = mol[0]
+    scaffold = MurckoScaffold.MurckoScaffoldSmiles(
+        mol=mol, includeChirality=include_chirality
+    )
+
+    return scaffold
+
+
+def scaffold_to_smiles(
+    mols: List[str], use_indices: bool = False
+) -> Dict[str, Union[Set[str], Set[int]]]:
+    """
+    Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices).
+    :param mols: A list of SMILES.
+    :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than
+                        mapping to the smiles string itself. This is necessary if there are duplicate smiles.
+    :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold.
+    """
+    scaffolds = defaultdict(set)
+    for i, mol in tqdm(enumerate(mols), total=len(mols)):
+        scaffold = generate_scaffold(mol)
+        if use_indices:
+            scaffolds[scaffold].add(i)
+        else:
+            scaffolds[scaffold].add(mol)
+
+    return scaffolds
 
 def ae_scaffold_split(
     data: pd.DataFrame,
@@ -309,7 +379,7 @@ def log_scaffold_stats(
         targets = [
             c
             for c in data.columns
-            if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"]
+            if c not in ["fp", "morganfp", "fpcompressed", "id", "smiles",]
         ]
         # targets = data_set.iloc[:, 2:].values
         targets = data_set.loc[:, targets].values

From 16a24f4c77a0a2510217e8c4dc96065740ede221 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Wed, 6 Mar 2024 14:39:51 +0100
Subject: [PATCH 02/19] flaked and fixed predictgnn arg

---
 dfpl/__main__.py        |  4 +---
 dfpl/options.py         | 13 ++++++++-----
 dfpl/utils.py           | 22 ++++++++++++++++------
 example/predictgnn.json |  1 +
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/dfpl/__main__.py b/dfpl/__main__.py
index 8d035579..fe66eec8 100755
--- a/dfpl/__main__.py
+++ b/dfpl/__main__.py
@@ -61,9 +61,7 @@ def interpretdmpnn(opts: options.GnnOptions) -> None:
     arguments = createArgsFromJson(jsonFile=opts.configFile)
     opts = chemprop.args.InterpretArgs().parse_args(arguments)
 
-    chemprop.interpret.interpret(
-        args=opts, save_to_csv=True
-    )
+    chemprop.interpret.interpret(args=opts, save_to_csv=True)
 
 
 def train(opts: options.Options):
diff --git a/dfpl/options.py b/dfpl/options.py
index 85e245bc..5def434d 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -3,11 +3,11 @@
 import argparse
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, Literal, List
+from typing import List, Literal, Optional
 
 import jsonpickle
 import torch
-from chemprop.args import TrainArgs, PredictArgs, InterpretArgs
+from chemprop.args import InterpretArgs, PredictArgs, TrainArgs
 
 from dfpl.utils import parseCmdArgs
 
@@ -107,6 +107,7 @@ class GnnOptions(TrainArgs):
     wabTracking: bool = False
     split_sizes: List[float] = None
     # save_smiles_splits: bool = False
+
     @classmethod
     def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None):
         # Initialize with JSON config if provided
@@ -1234,7 +1235,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         "--checkpoint_path",
         type=str,
         metavar="FILE",
-        help="Path to model checkpoint (.pt file)"
+        help="Path to model checkpoint (.pt file)",
     )
     # general_args.add_argument(
     #     "--no_features_scaling",
@@ -1318,10 +1319,12 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         ],
         help="Methods used for calibrating the uncertainty calculated with uncertainty method.",
     )
-    uncertainty_args.add_argument("--individual_ensemble_predictions",
+    uncertainty_args.add_argument(
+        "--individual_ensemble_predictions",
         action="store_true",
         default=False,
-        help="Whether to save individual ensemble predictions.")
+        help="Whether to save individual ensemble predictions.",
+    )
     uncertainty_args.add_argument(
         "--evaluation_methods",
         type=str,
diff --git a/dfpl/utils.py b/dfpl/utils.py
index ccf931df..338981c9 100644
--- a/dfpl/utils.py
+++ b/dfpl/utils.py
@@ -75,9 +75,11 @@ def createDirectory(directory: str):
     if not os.path.exists(path):
         os.makedirs(path)
 
+
 def parse_cli_list(value: str):
     # Simple parser for lists passed as comma-separated values
-    return value.split(',')
+    return value.split(",")
+
 
 def parse_cli_boolean(cli_args, cli_arg_key):
     # Determines boolean value based on command line presence
@@ -85,6 +87,7 @@ def parse_cli_boolean(cli_args, cli_arg_key):
         return True  # Presence of flag implies True
     return False
 
+
 def createArgsFromJson(jsonFile: str):
     arguments = []
     ignore_elements = ["py/object"]
@@ -119,6 +122,7 @@ def createArgsFromJson(jsonFile: str):
 
     return arguments
 
+
 def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool):
     """
     Builds an RDKit molecule from a SMILES string.
@@ -147,10 +151,6 @@ def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool):
     return mol
 
 
-
-
-# def inchi_to_mol(inchi: str) -> Chem.Mol:
-#     return Chem.inchi.MolFromInchi(inchi)
 def smiles_to_mol(smiles: str) -> Chem.Mol:
     mol = Chem.MolFromSmiles(smiles)
     if mol is None:
@@ -212,6 +212,8 @@ def weight_split(
     test_df = sorted_data.iloc[test_indices].reset_index(drop=True)
 
     return train_df, val_df, test_df
+
+
 def generate_scaffold(
     mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True
 ) -> str:
@@ -256,6 +258,7 @@ def scaffold_to_smiles(
 
     return scaffolds
 
+
 def ae_scaffold_split(
     data: pd.DataFrame,
     sizes: Tuple[float, float, float] = (0.8, 0, 0.2),
@@ -379,7 +382,14 @@ def log_scaffold_stats(
         targets = [
             c
             for c in data.columns
-            if c not in ["fp", "morganfp", "fpcompressed", "id", "smiles",]
+            if c
+            not in [
+                "fp",
+                "morganfp",
+                "fpcompressed",
+                "id",
+                "smiles",
+            ]
         ]
         # targets = data_set.iloc[:, 2:].values
         targets = data_set.loc[:, targets].values
diff --git a/example/predictgnn.json b/example/predictgnn.json
index 157b5e05..221622de 100644
--- a/example/predictgnn.json
+++ b/example/predictgnn.json
@@ -1,6 +1,7 @@
 {
   "py/object": "dfpl.options.GnnOptions",
   "test_path": "tests/data/smiles.csv",
+  "preds_path": "preds.csv",
   "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt",
   "save_dir": "preds_dmpnn",
   "saving_name": "DMPNN_preds.csv"

From 774b0a1afa30efa4b3afa14221d2cd4ad04df6b4 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Wed, 6 Mar 2024 15:29:45 +0100
Subject: [PATCH 03/19] add json

---
 example/predictgnn.json | 2 --
 1 file changed, 2 deletions(-)

diff --git a/example/predictgnn.json b/example/predictgnn.json
index 221622de..c76aa96c 100644
--- a/example/predictgnn.json
+++ b/example/predictgnn.json
@@ -3,6 +3,4 @@
   "test_path": "tests/data/smiles.csv",
   "preds_path": "preds.csv",
   "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt",
-  "save_dir": "preds_dmpnn",
-  "saving_name": "DMPNN_preds.csv"
 }
\ No newline at end of file

From 11fb829840f7c1bb5574cab4855e4de7fec78b38 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Wed, 6 Mar 2024 15:40:21 +0100
Subject: [PATCH 04/19] remove comma

---
 example/predictgnn.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/predictgnn.json b/example/predictgnn.json
index c76aa96c..1055230f 100644
--- a/example/predictgnn.json
+++ b/example/predictgnn.json
@@ -2,5 +2,5 @@
   "py/object": "dfpl.options.GnnOptions",
   "test_path": "tests/data/smiles.csv",
   "preds_path": "preds.csv",
-  "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt",
+  "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt"
 }
\ No newline at end of file

From add3993e610e5f7e14b308a0e098d29669ee23a0 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Wed, 6 Mar 2024 16:28:19 +0100
Subject: [PATCH 05/19] final fix

---
 .github/workflows/pr.yml | 12 +++++-------
 example/predictgnn.json  |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 27f43c34..47c709e6 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -94,12 +94,10 @@ jobs:
           echo "predict result directory missing" >&2 
           exit 1
         fi
-      
-        echo "result lines "$(wc -l preds_dmpnn/DMPNN_preds.csv)
-        if [ "$(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" -lt "6" ]; then
-          echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" >&2 
+        
+        dfpl convert -f tests/data
+        if [ "$(find tests/data -name '*.csv' | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
+          echo "not all csv files are converted to pickle ones" >&2 
           exit 1
         fi
-        
-        
-        dfpl convert -f tests/data
\ No newline at end of file
+        echo "All tests passed!"
\ No newline at end of file
diff --git a/example/predictgnn.json b/example/predictgnn.json
index 1055230f..dfdd6a8d 100644
--- a/example/predictgnn.json
+++ b/example/predictgnn.json
@@ -1,6 +1,6 @@
 {
   "py/object": "dfpl.options.GnnOptions",
   "test_path": "tests/data/smiles.csv",
-  "preds_path": "preds.csv",
+  "preds_path": "preds_dmpnn/preds.csv",
   "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt"
 }
\ No newline at end of file

From fa33f2f847761a76ba1602e340e07797144d8338 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Wed, 6 Mar 2024 18:06:01 +0100
Subject: [PATCH 06/19] final fix

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 47c709e6..87173151 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -96,7 +96,7 @@ jobs:
         fi
         
         dfpl convert -f tests/data
-        if [ "$(find tests/data -name '*.csv' | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
+        if [ "$(find tests/data \(-name '*.csv'\ -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
           echo "not all csv files are converted to pickle ones" >&2 
           exit 1
         fi

From 1f59fe911ed726d3b7f71073e6634c15b1e4e98a Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Wed, 6 Mar 2024 18:18:11 +0100
Subject: [PATCH 07/19] final fix

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 87173151..dd97e1aa 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -96,7 +96,7 @@ jobs:
         fi
         
         dfpl convert -f tests/data
-        if [ "$(find tests/data \(-name '*.csv'\ -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
+        if [ "$(find tests/data \(-name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
           echo "not all csv files are converted to pickle ones" >&2 
           exit 1
         fi

From 00fa01280f2c531726d09cc50dea652b3b6df201 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Thu, 7 Mar 2024 10:10:02 +0100
Subject: [PATCH 08/19] convert fix

---
 .github/workflows/pr.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index dd97e1aa..c854fb43 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -96,8 +96,8 @@ jobs:
         fi
         
         dfpl convert -f tests/data
-        if [ "$(find tests/data \(-name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
-          echo "not all csv files are converted to pickle ones" >&2 
-          exit 1
+        if [ "$(find tests/data \( -name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
+            echo "not all csv files are converted to pickle ones" >&2 
+            exit 1                                                                                                                                                                                    
         fi
         echo "All tests passed!"
\ No newline at end of file

From ace62d3d467f346b42e91891901bd6dbf600bd51 Mon Sep 17 00:00:00 2001
From: soulios <90351285+soulios@users.noreply.github.com>
Date: Fri, 8 Mar 2024 10:18:18 +0100
Subject: [PATCH 09/19] Update dfpl/options.py

Co-authored-by: M Bernt <m.bernt@ufz.de>
---
 dfpl/options.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index 5def434d..60112423 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -1358,7 +1358,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         "--regression_calibrator_metric",
         type=str,
         choices=["stdev", "interval"],
-        help="Regression calibrators can output either a stdev or an inverval.",
+        help="Regression calibrator output metric. Regression calibrators can output either a stdev or an inverval.",
     )
 
 

From 8a1b334b2f227e02cd625358c2911277f846ed76 Mon Sep 17 00:00:00 2001
From: soulios <90351285+soulios@users.noreply.github.com>
Date: Fri, 8 Mar 2024 10:30:24 +0100
Subject: [PATCH 10/19] Apply suggestions from code review

Co-authored-by: M Bernt <m.bernt@ufz.de>
---
 dfpl/options.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index 60112423..9f304c1a 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -1258,35 +1258,35 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
     files_args.add_argument(
         "--preds_path",
         type=str,
-        help="Path to CSV or PICKLE file where predictions will be saved.",
+        help="Predictions output file. CSV or PICKLE file where predictions will be saved.",
     )
     files_args.add_argument(
         "--calibration_path",
         type=str,
-        help="Path to data file to be used for uncertainty calibration.",
+        help="Data file to be used for uncertainty calibration.",
     )
     files_args.add_argument(
         "--calibration_features_path",
         type=str,
         nargs="+",
-        help="Path to features data to be used with the uncertainty calibration dataset.",
+        help="Feature data file to be used with the uncertainty calibration dataset.",
     )
     files_args.add_argument("--calibration_phase_features_path", type=str, help="")
     files_args.add_argument(
         "--calibration_atom_descriptors_path",
         type=str,
-        help="Path to the extra atom descriptors.",
+    help="Extra atom descriptors file.",
     )
     files_args.add_argument(
         "--calibration_bond_descriptors_path",
         type=str,
-        help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.",
+        help="Extra bond descriptors file. Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.",
     )
 
     general_args.add_argument(
         "--drop_extra_columns",
         action="store_true",
-        help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.",
+        help="Keep only SMILES and new prediction columns in the test data files.",
     )
 
     uncertainty_args.add_argument(
@@ -1323,13 +1323,13 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         "--individual_ensemble_predictions",
         action="store_true",
         default=False,
-        help="Whether to save individual ensemble predictions.",
+        help="Save individual ensemble predictions.",
     )
     uncertainty_args.add_argument(
         "--evaluation_methods",
         type=str,
         nargs="+",
-        help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.",
+        help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.",
     )
     uncertainty_args.add_argument(
         "--evaluation_scores_path",
@@ -1352,7 +1352,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         "--calibration_interval_percentile",
         type=float,
         default=95,
-        help="Sets the percentile used in the calibration methods. Must be in the range (1,100).",
+        help="Percentile used in calibration methods. Must be in the range (1,100).",
     )
     uncertainty_args.add_argument(
         "--regression_calibrator_metric",

From 3c92b98600fe67e87f4fe30d6ed016e70965ab49 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Fri, 8 Mar 2024 11:51:20 +0100
Subject: [PATCH 11/19] edited help  in args

---
 dfpl/options.py | 204 +++++++++++++++++++++++++++++-------------------
 1 file changed, 124 insertions(+), 80 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index 9f304c1a..782e55d8 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -338,13 +338,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     general_args.add_argument(
         "--compressFeatures",
         action="store_true",
-        help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.",
+        help="Compresses the fingerprints. Needs a path of a trained autoencoder or needs the trainAC also set to True.",
         default=False,
     )
     general_args.add_argument(
         "--enableMultiLabel",
         action="store_true",
-        help="Train multi-label classification model in addition to the individual models.",
+        help="Train multi-label classification model. individual models.",
         default=False,
     )
     # Autoencoder Configuration
@@ -360,7 +360,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         "--ecModelDir",
         type=str,
         metavar="DIR",
-        help="The directory where the full model of the encoder will be saved",
+        help="The directory where the full encoder will be saved",
         default="example/results_train/AE_encoder/",
     )
     autoencoder_args.add_argument(
@@ -390,7 +390,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="STRING",
         type=str,
         choices=["relu", "selu"],
-        help="The activation function for the hidden layers in the autoencoder.",
+        help="The activation function of the autoencoder.",
         default="relu",
     )
     autoencoder_args.add_argument(
@@ -412,7 +412,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="STRING",
         type=str,
         choices=["scaffold_balanced", "random", "molecular_weight"],
-        help="Set how the data is going to be split for the autoencoder",
+        help="Set how the data is split for the autoencoder",
         default="random",
     )
     autoencoder_args.add_argument(
@@ -435,14 +435,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="STRING",
         type=str,
         choices=["scaffold_balanced", "random", "molecular_weight"],
-        help="Set how the data is going to be split for the feedforward neural network",
+        help="Set how the data is split for the feedforward neural network",
         default="random",
     )
     training_args.add_argument(
         "--testSize",
         metavar="FLOAT",
         type=float,
-        help="Fraction of the dataset that should be used for testing. Value in [0,1].",
+        help="Fraction[0,1] of the dataset that should be used for testing",
         default=0.2,
     )
     training_args.add_argument(
@@ -450,7 +450,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         "--kFolds",
         metavar="INT",
         type=int,
-        help="K that is used for K-fold cross-validation in the training procedure.",
+        help="Number of folds for cross-validation.",
         default=1,
     )
     training_args.add_argument(
@@ -466,28 +466,27 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     training_args.add_argument(
         "--trainAC",
         action="store_true",
-        help="Choose to train or not, the autoencoder based on the input file",
+        help="Trains the autoencoder.",
         default=False,
     )
     training_args.add_argument(
         "--trainFNN",
         action="store_false",
-        help="When called it deactivates the training.",
+        help="Deactivates the FNN training.",
         default=True,
     )
     training_args.add_argument(
         "--sampleFractionOnes",
         metavar="FLOAT",
         type=float,
-        help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)."
-        "only works if --sampleDown is enabled",
+        help="This is the desired fraction 1s/0s.only works if --sampleDown is enabled",
         default=0.5,
     )
     training_args.add_argument(
         "--sampleDown",
         metavar="BOOL",
         type=bool,
-        help="Enable automatic down sampling of the 0 valued samples.",
+        help="Down sampling of the 0 valued samples.",
         default=False,
     )
     training_args.add_argument(
@@ -495,7 +494,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         "--epochs",
         metavar="INT",
         type=int,
-        help="Number of epochs that should be used for the FNN training",
+        help="Number of epochs for the FNN training",
         default=100,
     )
     # TODO CHECK IF ALL LOSSES MAKE SENSE HERE
@@ -504,7 +503,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="STRING",
         type=str,
         choices=["mse", "bce", "focal"],
-        help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.",
+        help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy.",
         default="bce",
     )
     # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS?
@@ -513,7 +512,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="STRING",
         type=str,
         choices=["Adam", "SGD"],
-        help='Optimizer to use for backpropagation in the FNN. Possible values: "Adam", "SGD"',
+        help='Optimizer of the FNN.',
         default="Adam",
     )
     training_args.add_argument(
@@ -556,7 +555,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="STRING",
         type=str,
         choices=["relu", "selu"],
-        help="The activation function for hidden layers in the FNN.",
+        help="The activation function of the FNN.",
         default="relu",
     )
     # Tracking Configuration
@@ -564,23 +563,22 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         "--aeWabTracking",
         metavar="BOOL",
         type=bool,
-        help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.",
+        help="Track autoencoder performance via Weights & Biases.",
         default=False,
     )
     tracking_args.add_argument(
         "--wabTracking",
         metavar="BOOL",
         type=bool,
-        help="Track FNN performance via Weights & Biases, see https://wandb.ai.",
+        help="Track FNN performance via Weights & Biases",
         default=False,
     )
     tracking_args.add_argument(
         "--wabTarget",
         metavar="STRING",
         type=str,
-        choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"],
-        help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.",
-        default="AR",
+        help="Which endpoint to use for tracking performance via Weights & Biases. Should match the column name.",
+        default=None,
     )
 
 
@@ -598,7 +596,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         "--configFile",
         metavar="FILE",
         type=str,
-        help="Input JSON file that contains all information for training/predicting.",
+        help="JSON file that contains all information for training/predicting.",
     )
     files_args.add_argument(
         "-i",
@@ -620,19 +618,17 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         "--outputDir",
         metavar="DIR",
         type=str,
-        help="Prefix of output directory. It will contain a log file and the file specified"
-        "with --outputFile.",
+        help="Prefix of output directory. It will contain a log file and the file specified with --outputFile.",
         default="example/results_predict/",
     )
     files_args.add_argument(
         "--outputFile",
         metavar="FILE",
         type=str,
-        help="Output .CSV file name which will contain one prediction per input line. "
+        help="Output csv file name which will contain one prediction per input line. "
         "Default: prefix of input file name.",
         default="results.csv",
     )
-    # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES?
     general_args.add_argument(
         "-t",
         "--type",
@@ -648,34 +644,37 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         metavar="STR",
         type=str,
         choices=["topological", "MACCS"],
-        help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.",
+        help="The type of fingerprint to be generated/used in input file.",
         default="topological",
     )
     files_args.add_argument(
         "--ecModelDir",
         type=str,
         metavar="DIR",
-        help="The directory where the full model of the encoder will be saved (if trainAE=True) or "
-        "loaded from (if trainAE=False). Provide a full path here.",
+        help="The encoder dir where it is saved (if trainAE=True) or "
+        "it is loaded from (if trainAE=False). Provide a full path here.",
         default="",
     )
     files_args.add_argument(
         "--ecWeightsFile",
         type=str,
         metavar="STR",
-        help="The file  where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.",
+        help="The encoder file where it is loaded from, to compress the fingerprints.",
         default="",
     )
     files_args.add_argument(
         "--fnnModelDir",
         type=str,
         metavar="DIR",
-        help="The directory where the full model of the fnn is loaded from. "
-        "Provide a full path here.",
+        help="The directory where the full model of the fnn is loaded from.",
         default="example/results_train/AR_saved_model",
     )
     general_args.add_argument(
-        "-c", "--compressFeatures", action="store_true", default=False
+        "-c",
+        "--compressFeatures",
+        action="store_true",
+        help="Compresses the fingerprints if encoder dir/file is provided",
+        default=False
     )
     (
         general_args.add_argument(
@@ -737,20 +736,20 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     )
 
     # General arguments
-    general_args.add_argument("--split_key_molecule", type=int)
-    general_args.add_argument("--pytorch_seed", type=int)
-    general_args.add_argument("--cache_cutoff", type=float)
-    general_args.add_argument("--save_preds", type=bool)
+    general_args.add_argument("--split_key_molecule",help="The index of the key molecule used for splitting", type=int)
+    general_args.add_argument("--pytorch_seed",help="Seed for pytorch", type=int)
+    general_args.add_argument("--cache_cutoff",help="Maximum number of molecules in dataset to allow caching.", type=float)
+    general_args.add_argument("--save_preds",help="Saves test split predictions during training", type=bool)
     general_args.add_argument("--wabTracking", action="store_true", default=False)
     general_args.add_argument(
         "--cuda", action="store_true", default=False, help="Turn on cuda"
     )
-    # general_args.add_argument(
-    #     "--save_smiles_splits",
-    #     action="store_true",
-    #     default=False,
-    #     help="Save smiles for each train/val/test splits for prediction convenience later",
-    # )
+    general_args.add_argument(
+        "--save_smiles_splits",
+        action="store_true",
+        default=False,
+        help="Save smiles for each train/val/test splits",
+    )
     general_args.add_argument(
         "--test",
         action="store_true",
@@ -775,13 +774,13 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         type=int,
         metavar="INT",
         default=10,
-        help="The number of batches between each logging of the training loss",
+        help="The number of batches between each log",
     )
     general_args.add_argument(
-        "--no_cache",
+        "--no_cache_mol",
         action="store_true",
         default=False,
-        help="Turn off caching mol2graph computation",
+        help="If raised, Turn off caching rdkit mols",
     )
 
     # FILES ARGUMENTS
@@ -790,7 +789,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         "--configFile",
         metavar="FILE",
         type=str,
-        help="Input JSON file that contains all information for training/predicting.",
+        help="JSON file that contains all configuration for training/predicting.",
     )
     files_args.add_argument(
         "--save_dir",
@@ -950,7 +949,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         type=int,
         metavar="INT",
         default=3,
-        help="Number of classes when running multiclass classification",
+        help="Number of classes in multiclass classification",
     )
     data_args.add_argument(
         "--split_type",
@@ -993,6 +992,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     data_args.add_argument(
         "--target_columns",
         type=str,
+        nargs="*",
         metavar="STRING",
         help="Name of the target columns",
     )
@@ -1000,11 +1000,12 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     data_args.add_argument(
         "--ignore_columns",
         type=str,
+        nargs="*",
         metavar="STRING",
         help="Names of the columns to ignore",
     )
     data_args.add_argument(
-        "--num_tasks", type=int, metavar="INT", help="NUmber of tasks"
+        "--num_tasks", type=int, metavar="INT", help="Number of tasks"
     )
     data_args.add_argument(
         "--no_features_scaling",
@@ -1102,35 +1103,71 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         default=2,
         help="Number of layers in FFN after MPN encoding",
     )
-    model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING")
+    model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING",help="Freeze the loaded model")
 
     # Model arguments
-    model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL")
+    # model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL")
     model_args.add_argument(
         "--show_individual_scores",
         action="store_true",
         default=True,
         help="Show all scores for individual targets, not just average, at the end",
     )
-    model_args.add_argument("--aggregation", choices=["mean", "sum", "norm"])
-    model_args.add_argument("--aggregation_norm", type=int)
-    model_args.add_argument("--explicit_h", type=bool, metavar="BOOL")
-    model_args.add_argument("--adding_h", type=bool, metavar="BOOL")
+    model_args.add_argument(
+        "--aggregation",
+        choices=["mean", "sum", "norm"],
+        help="Aggregation scheme for atomic vectors into molecular vectors")
+    model_args.add_argument(
+        "--aggregation_norm",
+        type=int,
+        help="For norm aggregation, number by which to divide summed up atomic features")
+    # model_args.add_argument("--explicit_h", type=bool, metavar="BOOL",help="A explicit hydrogen")
+    model_args.add_argument(
+        "--adding_h",
+        type=bool,
+        metavar="BOOL",
+        help="Adding hydrogen")
     # Training arguments
-    model_args.add_argument("--class_balance", type=bool, metavar="BOOL")
-    model_args.add_argument("--evidential_regularization", type=float, metavar="FLOAT")
     model_args.add_argument(
-        "--overwrite_default_atom_features", type=bool, metavar="BOOL"
+        "--class_balance",
+        type=bool,
+        metavar="BOOL",
+        help="Balances the classes across batches")
+    model_args.add_argument(
+        "--evidential_regularization",
+        type=float,
+        metavar="FLOAT",
+        help="Regularization parameter for evidential loss")
+    model_args.add_argument(
+        "--overwrite_default_atom_features",
+        type=bool,
+        metavar="BOOL",
+        help="Overwrites default atom features instead of concatenating"
+    )
+    model_args.add_argument(
+        "--no_atom_descriptor_scaling",
+        type=bool,
+        metavar="BOOL")
+    model_args.add_argument(
+        "--overwrite_default_bond_features",
+        type=bool,
+        metavar="BOOL",
+        help="Overwrites default bond features instead of concatenating"
     )
-    model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL")
     model_args.add_argument(
-        "--overwrite_default_bond_features", type=bool, metavar="BOOL"
+        "--frzn_ffn_layers",
+        type=int,
+        metavar="INT",
+        help="Number of layers in FFN to freeze"
     )
-    model_args.add_argument("--frzn_ffn_layers", type=int, metavar="INT")
-    model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL")
+    # model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL")
     # Training arguments
     training_args.add_argument(
-        "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run"
+        "--epochs",
+        type=int,
+        metavar="INT",
+        default=30,
+        help="Number of epochs to run"
     )
     training_args.add_argument(
         "--total_epochs",
@@ -1140,7 +1177,11 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         help="Number of total epochs to run",
     )
     training_args.add_argument(
-        "--batch_size", type=int, metavar="INT", default=50, help="Batch size"
+        "--batch_size",
+        type=int,
+        metavar="INT",
+        default=50,
+        help="Batch size"
     )
     training_args.add_argument(
         "--warmup_epochs",
@@ -1196,7 +1237,12 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
             "dirichlet",
         ],
     )
-    training_args.add_argument("--grad_clip", type=float)
+    training_args.add_argument(
+        "--grad_clip",
+       type=float,
+       metavar="FLOAT",
+       help="Gradient clipping value"
+    )
     training_args.add_argument(
         "--metric",
         type=str,
@@ -1237,23 +1283,17 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         metavar="FILE",
         help="Path to model checkpoint (.pt file)",
     )
-    # general_args.add_argument(
-    #     "--no_features_scaling",
-    #     action="store_true",
-    #     help="Turn on scaling of features",
-    # )
     files_args.add_argument(
         "-f",
         "--configFile",
         type=str,
         metavar="FILE",
-        help="Path to a .json file containing arguments. Any arguments present in the config"
-        "file will override arguments specified via the command line or by the defaults.",
+        help="Path to a .json file containing arguments. CLI arguments will override these.",
     )
     files_args.add_argument(
         "--test_path",
         type=str,
-        help="Path to CSV file containing testing data for which predictions will be made.",
+        help="Path to CSV file for which predictions will be made.",
     )
     files_args.add_argument(
         "--preds_path",
@@ -1275,12 +1315,13 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
     files_args.add_argument(
         "--calibration_atom_descriptors_path",
         type=str,
-    help="Extra atom descriptors file.",
+        help="Extra atom descriptors file.",
     )
     files_args.add_argument(
         "--calibration_bond_descriptors_path",
         type=str,
-        help="Extra bond descriptors file. Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.",
+        help="Extra bond descriptors file. Path to the extra bond descriptors that will be used as bond features to "
+             "featurize a given molecule.",
     )
 
     general_args.add_argument(
@@ -1317,7 +1358,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
             "platt",
             "isotonic",
         ],
-        help="Methods used for calibrating the uncertainty calculated with uncertainty method.",
+        help="Methods used for calibrating the uncertainty.",
     )
     uncertainty_args.add_argument(
         "--individual_ensemble_predictions",
@@ -1329,7 +1370,9 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         "--evaluation_methods",
         type=str,
         nargs="+",
-        help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.",
+        help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes "
+             "targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available "
+             "classification or multiclass metric.",
     )
     uncertainty_args.add_argument(
         "--evaluation_scores_path",
@@ -1346,7 +1389,8 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         "--dropout_sampling_size",
         type=int,
         default=10,
-        help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.",
+        help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout "
+             "used during training.",
     )
     uncertainty_args.add_argument(
         "--calibration_interval_percentile",
@@ -1397,7 +1441,7 @@ def parseInterpretGnn(parser: argparse.ArgumentParser) -> None:
         "--data_path",
         type=str,
         metavar="FILE",
-        help="Path to CSV file containing testing data for which predictions will be made",
+        help="Path to CSV file  for which predictions will be made",
     )
     interpret_args.add_argument(
         "--max_atoms",

From 40e6b0ba6c1044ad28d1a96fc039cbf1eefc474e Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Fri, 8 Mar 2024 12:06:32 +0100
Subject: [PATCH 12/19] flaked and blacked

---
 dfpl/options.py | 82 ++++++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index 782e55d8..1d041de6 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -106,6 +106,7 @@ class GnnOptions(TrainArgs):
     evaluation_scores_path: str = ""
     wabTracking: bool = False
     split_sizes: List[float] = None
+
     # save_smiles_splits: bool = False
 
     @classmethod
@@ -512,7 +513,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         metavar="STRING",
         type=str,
         choices=["Adam", "SGD"],
-        help='Optimizer of the FNN.',
+        help="Optimizer of the FNN.",
         default="Adam",
     )
     training_args.add_argument(
@@ -674,7 +675,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         "--compressFeatures",
         action="store_true",
         help="Compresses the fingerprints if encoder dir/file is provided",
-        default=False
+        default=False,
     )
     (
         general_args.add_argument(
@@ -736,10 +737,20 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     )
 
     # General arguments
-    general_args.add_argument("--split_key_molecule",help="The index of the key molecule used for splitting", type=int)
-    general_args.add_argument("--pytorch_seed",help="Seed for pytorch", type=int)
-    general_args.add_argument("--cache_cutoff",help="Maximum number of molecules in dataset to allow caching.", type=float)
-    general_args.add_argument("--save_preds",help="Saves test split predictions during training", type=bool)
+    general_args.add_argument(
+        "--split_key_molecule",
+        type=int,
+        help="The index of the key molecule used for splitting",
+    )
+    general_args.add_argument("--pytorch_seed", type=int, help="Seed for pytorch")
+    general_args.add_argument(
+        "--cache_cutoff",
+        type=float,
+        help="Maximum number of molecules in dataset to allow caching.",
+    )
+    general_args.add_argument(
+        "--save_preds", help="Saves test split predictions during training", type=bool
+    )
     general_args.add_argument("--wabTracking", action="store_true", default=False)
     general_args.add_argument(
         "--cuda", action="store_true", default=False, help="Turn on cuda"
@@ -1103,8 +1114,9 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         default=2,
         help="Number of layers in FFN after MPN encoding",
     )
-    model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING",help="Freeze the loaded model")
-
+    model_args.add_argument(
+        "--checkpoint_frzn", type=str, metavar="STRING", help="Freeze the loaded model"
+    )
     # Model arguments
     # model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL")
     model_args.add_argument(
@@ -1116,58 +1128,53 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     model_args.add_argument(
         "--aggregation",
         choices=["mean", "sum", "norm"],
-        help="Aggregation scheme for atomic vectors into molecular vectors")
+        help="Aggregation scheme for atomic vectors into molecular vectors",
+    )
     model_args.add_argument(
         "--aggregation_norm",
         type=int,
-        help="For norm aggregation, number by which to divide summed up atomic features")
+        help="For norm aggregation, number by which to divide summed up atomic features",
+    )
     # model_args.add_argument("--explicit_h", type=bool, metavar="BOOL",help="A explicit hydrogen")
     model_args.add_argument(
-        "--adding_h",
-        type=bool,
-        metavar="BOOL",
-        help="Adding hydrogen")
+        "--adding_h", type=bool, metavar="BOOL", help="Adding hydrogen"
+    )
     # Training arguments
     model_args.add_argument(
         "--class_balance",
         type=bool,
         metavar="BOOL",
-        help="Balances the classes across batches")
+        help="Balances the classes across batches",
+    )
     model_args.add_argument(
         "--evidential_regularization",
         type=float,
         metavar="FLOAT",
-        help="Regularization parameter for evidential loss")
+        help="Regularization parameter for evidential loss",
+    )
     model_args.add_argument(
         "--overwrite_default_atom_features",
         type=bool,
         metavar="BOOL",
-        help="Overwrites default atom features instead of concatenating"
+        help="Overwrites default atom features instead of concatenating",
     )
-    model_args.add_argument(
-        "--no_atom_descriptor_scaling",
-        type=bool,
-        metavar="BOOL")
+    model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL")
     model_args.add_argument(
         "--overwrite_default_bond_features",
         type=bool,
         metavar="BOOL",
-        help="Overwrites default bond features instead of concatenating"
+        help="Overwrites default bond features instead of concatenating",
     )
     model_args.add_argument(
         "--frzn_ffn_layers",
         type=int,
         metavar="INT",
-        help="Number of layers in FFN to freeze"
+        help="Number of layers in FFN to freeze",
     )
     # model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL")
     # Training arguments
     training_args.add_argument(
-        "--epochs",
-        type=int,
-        metavar="INT",
-        default=30,
-        help="Number of epochs to run"
+        "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run"
     )
     training_args.add_argument(
         "--total_epochs",
@@ -1177,11 +1184,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         help="Number of total epochs to run",
     )
     training_args.add_argument(
-        "--batch_size",
-        type=int,
-        metavar="INT",
-        default=50,
-        help="Batch size"
+        "--batch_size", type=int, metavar="INT", default=50, help="Batch size"
     )
     training_args.add_argument(
         "--warmup_epochs",
@@ -1238,10 +1241,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         ],
     )
     training_args.add_argument(
-        "--grad_clip",
-       type=float,
-       metavar="FLOAT",
-       help="Gradient clipping value"
+        "--grad_clip", type=float, metavar="FLOAT", help="Gradient clipping value"
     )
     training_args.add_argument(
         "--metric",
@@ -1321,7 +1321,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         "--calibration_bond_descriptors_path",
         type=str,
         help="Extra bond descriptors file. Path to the extra bond descriptors that will be used as bond features to "
-             "featurize a given molecule.",
+        "featurize a given molecule.",
     )
 
     general_args.add_argument(
@@ -1371,8 +1371,8 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         type=str,
         nargs="+",
         help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes "
-             "targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available "
-             "classification or multiclass metric.",
+        "targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available "
+        "classification or multiclass metric.",
     )
     uncertainty_args.add_argument(
         "--evaluation_scores_path",
@@ -1390,7 +1390,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         type=int,
         default=10,
         help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout "
-             "used during training.",
+        "used during training.",
     )
     uncertainty_args.add_argument(
         "--calibration_interval_percentile",

From c03a32e306694ce7fb845d4047e215b49bf4f184 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Mon, 11 Mar 2024 13:37:18 +0100
Subject: [PATCH 13/19] removed metavar from args with choices

---
 dfpl/options.py | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index 1d041de6..5db0c837 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -300,7 +300,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     general_args.add_argument(
         "-t",
         "--type",
-        metavar="STRING",
         type=str,
         choices=["fp", "smiles"],
         help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
@@ -324,7 +323,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     )
     general_args.add_argument(
         "--fpType",
-        metavar="STR",
         type=str,
         choices=["topological", "MACCS"],
         help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.",
@@ -366,7 +364,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     )
     autoencoder_args.add_argument(
         "--aeType",
-        metavar="STRING",
         type=str,
         choices=["variational", "deterministic"],
         help="Autoencoder type, variational or deterministic.",
@@ -388,7 +385,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     )
     autoencoder_args.add_argument(
         "--aeActivationFunction",
-        metavar="STRING",
         type=str,
         choices=["relu", "selu"],
         help="The activation function of the autoencoder.",
@@ -410,7 +406,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     )
     autoencoder_args.add_argument(
         "--aeSplitType",
-        metavar="STRING",
         type=str,
         choices=["scaffold_balanced", "random", "molecular_weight"],
         help="Set how the data is split for the autoencoder",
@@ -433,7 +428,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     # Training Configuration
     training_args.add_argument(
         "--split_type",
-        metavar="STRING",
         type=str,
         choices=["scaffold_balanced", "random", "molecular_weight"],
         help="Set how the data is split for the feedforward neural network",
@@ -457,7 +451,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     training_args.add_argument(
         "-v",
         "--verbose",
-        metavar="INT",
         type=int,
         choices=[0, 1, 2],
         help="Verbosity level. O: No additional output, "
@@ -501,7 +494,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     # TODO CHECK IF ALL LOSSES MAKE SENSE HERE
     training_args.add_argument(
         "--lossFunction",
-        metavar="STRING",
         type=str,
         choices=["mse", "bce", "focal"],
         help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy.",
@@ -510,7 +502,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS?
     training_args.add_argument(
         "--optimizer",
-        metavar="STRING",
         type=str,
         choices=["Adam", "SGD"],
         help="Optimizer of the FNN.",
@@ -553,7 +544,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     )
     training_args.add_argument(
         "--activationFunction",
-        metavar="STRING",
         type=str,
         choices=["relu", "selu"],
         help="The activation function of the FNN.",
@@ -633,7 +623,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
     general_args.add_argument(
         "-t",
         "--type",
-        metavar="STR",
         type=str,
         choices=["fp", "smiles"],
         help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
@@ -642,7 +631,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
     general_args.add_argument(
         "-k",
         "--fpType",
-        metavar="STR",
         type=str,
         choices=["topological", "MACCS"],
         help="The type of fingerprint to be generated/used in input file.",
@@ -680,7 +668,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
     (
         general_args.add_argument(
             "--aeType",
-            metavar="STRING",
             type=str,
             choices=["variational", "deterministic"],
             help="Autoencoder type, variational or deterministic.",
@@ -699,7 +686,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     uncertainty_args.add_argument(
         "--uncertainty_method",
         type=str,
-        metavar="STRING",
         choices=[
             "mve",
             "ensemble",
@@ -717,7 +703,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     uncertainty_args.add_argument(
         "--calibration_method",
         type=str,
-        metavar="STRING",
         choices=[
             "zscaling",
             "tscaling",
@@ -949,7 +934,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     data_args.add_argument(
         "--dataset_type",
         type=str,
-        metavar="STRING",
         choices=["classification", "regression", "multiclass"],
         help="Type of dataset, e.g. classification or regression."
         "This determines the loss function used during training.",
@@ -965,7 +949,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     data_args.add_argument(
         "--split_type",
         type=str,
-        metavar="STRING",
         default="random",
         choices=[
             "random",
@@ -1075,7 +1058,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     model_args.add_argument(
         "--activation",
         type=str,
-        metavar="STRING",
         default="ReLU",
         choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"],
         help="Activation function",
@@ -1226,7 +1208,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     training_args.add_argument(
         "--loss_function",
         type=str,
-        metavar="STRING",
         choices=[
             "mse",
             "bounded_mse",
@@ -1246,7 +1227,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     training_args.add_argument(
         "--metric",
         type=str,
-        metavar="STRING",
         default=None,
         choices=[
             "auc",

From ebaaacaebbb4e73a72101e95b7f5b11583b4d925 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Mon, 11 Mar 2024 13:42:55 +0100
Subject: [PATCH 14/19] make Literal-typed fields Optional since they default to None

---
 dfpl/options.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index 5db0c837..d9834092 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -134,14 +134,14 @@ class PredictGnnOptions(PredictArgs):
     calibration_atom_descriptors_path: str = None
     calibration_features_path: str = None
     calibration_interval_percentile: float = 95
-    calibration_method: Literal[
+    calibration_method: Optional[Literal[
         "zscaling",
         "tscaling",
         "zelikman_interval",
         "mve_weighting",
         "platt",
         "isotonic",
-    ] = None
+    ]] = None
     calibration_path: str = None
     calibration_phase_features_path: str = None
     drop_extra_columns: bool = False
@@ -151,10 +151,10 @@ class PredictGnnOptions(PredictArgs):
     # no_features_scaling: bool = True
     individual_ensemble_predictions: bool = False
     preds_path: str = None
-    regression_calibrator_metric: Literal["stdev", "interval"] = None
+    regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None
     test_path: str = None
     uncertainty_dropout_p: float = 0.1
-    uncertainty_method: Literal[
+    uncertainty_method: Optional[Literal[
         "mve",
         "ensemble",
         "evidential_epistemic",
@@ -162,7 +162,7 @@ class PredictGnnOptions(PredictArgs):
         "evidential_total",
         "classification",
         "dropout",
-    ] = None
+    ]] = None
 
     @classmethod
     def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None):

From d6090a9fdf9841ccfe5c7d2ec241687aa0bba136 Mon Sep 17 00:00:00 2001
From: Kyriakos Soulios <kyriakos.soulios@ufz.de>
Date: Mon, 11 Mar 2024 13:50:35 +0100
Subject: [PATCH 15/19] applied black

---
 dfpl/options.py | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index d9834092..2009fa76 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -134,14 +134,16 @@ class PredictGnnOptions(PredictArgs):
     calibration_atom_descriptors_path: str = None
     calibration_features_path: str = None
     calibration_interval_percentile: float = 95
-    calibration_method: Optional[Literal[
-        "zscaling",
-        "tscaling",
-        "zelikman_interval",
-        "mve_weighting",
-        "platt",
-        "isotonic",
-    ]] = None
+    calibration_method: Optional[
+        Literal[
+            "zscaling",
+            "tscaling",
+            "zelikman_interval",
+            "mve_weighting",
+            "platt",
+            "isotonic",
+        ]
+    ] = None
     calibration_path: str = None
     calibration_phase_features_path: str = None
     drop_extra_columns: bool = False
@@ -154,15 +156,17 @@ class PredictGnnOptions(PredictArgs):
     regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None
     test_path: str = None
     uncertainty_dropout_p: float = 0.1
-    uncertainty_method: Optional[Literal[
-        "mve",
-        "ensemble",
-        "evidential_epistemic",
-        "evidential_aleatoric",
-        "evidential_total",
-        "classification",
-        "dropout",
-    ]] = None
+    uncertainty_method: Optional[
+        Literal[
+            "mve",
+            "ensemble",
+            "evidential_epistemic",
+            "evidential_aleatoric",
+            "evidential_total",
+            "classification",
+            "dropout",
+        ]
+    ] = None
 
     @classmethod
     def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None):

From cb3fa01599f9889092df0b3ece618b1c66497061 Mon Sep 17 00:00:00 2001
From: Matthias Bernt <m.bernt@ufz.de>
Date: Tue, 9 Apr 2024 13:11:29 +0200
Subject: [PATCH 16/19] rename some variables

needed because of the specifics of the Galaxy tool generator
---
 dfpl/options.py | 416 ++++++++++++++++++++++++------------------------
 1 file changed, 207 insertions(+), 209 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index 85e245bc..d599215d 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -240,11 +240,11 @@ def createCommandlineParser() -> argparse.ArgumentParser:
     parser_train.set_defaults(method="train")
     parseInputTrain(parser_train)
 
-    parser_predict = subparsers.add_parser(
+    parser_input_predict = subparsers.add_parser(
         "predict", help="Predict your data with existing models"
     )
-    parser_predict.set_defaults(method="predict")
-    parseInputPredict(parser_predict)
+    parser_input_predict.set_defaults(method="predict")
+    parseInputPredict(parser_input_predict)
 
     parser_convert = subparsers.add_parser(
         "convert", help="Convert known data files to pickle serialization files"
@@ -254,20 +254,20 @@ def createCommandlineParser() -> argparse.ArgumentParser:
     return parser
 
 
-def parseInputTrain(parser: argparse.ArgumentParser) -> None:
+def parseInputTrain(parser_train: argparse.ArgumentParser) -> None:
     """
     Parse the input arguments.
 
     :return: A namespace object built up from attributes parsed out of the cmd line.
     """
     # Create argument groups
-    general_args = parser.add_argument_group("Model Configuration")
-    autoencoder_args = parser.add_argument_group("Autoencoder Configuration")
-    training_args = parser.add_argument_group("Training Configuration")
-    tracking_args = parser.add_argument_group("Tracking Configuration")
+    input_tain_general_args = parser_train.add_argument_group("Model Configuration")
+    input_tain_autoencoder_args = parser_train.add_argument_group("Autoencoder Configuration")
+    input_tain_training_args = parser_train.add_argument_group("Training Configuration")
+    input_tain_tracking_args = parser_train.add_argument_group("Tracking Configuration")
 
     # Model Configuration
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "-f",
         "--configFile",
         metavar="FILE",
@@ -275,7 +275,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Input JSON file that contains all information for training/predicting.",
         default="example/train.json",
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "-i",
         "--inputFile",
         metavar="FILE",
@@ -284,7 +284,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         "comma separated CSV format.The first column should be smiles.",
         default="tests/data/smiles.csv",
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "-o",
         "--outputDir",
         metavar="DIR",
@@ -295,7 +295,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     )
 
     # TODO CHECK WHAT IS TYPE DOING?
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "-t",
         "--type",
         metavar="STRING",
@@ -304,7 +304,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
         default="fp",
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "-thr",
         "--threshold",
         type=float,
@@ -312,7 +312,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Threshold for binary classification.",
         default=0.5,
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "-gpu",
         "--gpu",
         metavar="INT",
@@ -320,7 +320,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Select which gpu to use by index. If not available, leave empty",
         default=None,
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "--fpType",
         metavar="STR",
         type=str,
@@ -328,26 +328,26 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.",
         default="topological",
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "--fpSize",
         type=int,
         help="Length of the fingerprint that should be generated.",
         default=2048,
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "--compressFeatures",
         action="store_true",
         help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.",
         default=False,
     )
-    general_args.add_argument(
+    input_tain_general_args.add_argument(
         "--enableMultiLabel",
         action="store_true",
         help="Train multi-label classification model in addition to the individual models.",
         default=False,
     )
     # Autoencoder Configuration
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "-a",
         "--ecWeightsFile",
         type=str,
@@ -355,14 +355,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="The .hdf5 file of a trained encoder",
         default="",
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--ecModelDir",
         type=str,
         metavar="DIR",
         help="The directory where the full model of the encoder will be saved",
         default="example/results_train/AE_encoder/",
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--aeType",
         metavar="STRING",
         type=str,
@@ -370,21 +370,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Autoencoder type, variational or deterministic.",
         default="deterministic",
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--aeEpochs",
         metavar="INT",
         type=int,
         help="Number of epochs for autoencoder training.",
         default=100,
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--aeBatchSize",
         metavar="INT",
         type=int,
         help="Batch size in autoencoder training.",
         default=512,
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--aeActivationFunction",
         metavar="STRING",
         type=str,
@@ -392,21 +392,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="The activation function for the hidden layers in the autoencoder.",
         default="relu",
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--aeLearningRate",
         metavar="FLOAT",
         type=float,
         help="Learning rate for autoencoder training.",
         default=0.001,
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--aeLearningRateDecay",
         metavar="FLOAT",
         type=float,
         help="Learning rate decay for autoencoder training.",
         default=0.96,
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--aeSplitType",
         metavar="STRING",
         type=str,
@@ -414,7 +414,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Set how the data is going to be split for the autoencoder",
         default="random",
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "-d",
         "--encFPSize",
         metavar="INT",
@@ -422,14 +422,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Size of encoded fingerprint (z-layer of autoencoder).",
         default=256,
     )
-    autoencoder_args.add_argument(
+    input_tain_autoencoder_args.add_argument(
         "--visualizeLatent",
         action="store_true",
         help="UMAP the latent space for exploration",
         default=False,
     )
     # Training Configuration
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--split_type",
         metavar="STRING",
         type=str,
@@ -437,14 +437,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="Set how the data is going to be split for the feedforward neural network",
         default="random",
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--testSize",
         metavar="FLOAT",
         type=float,
         help="Fraction of the dataset that should be used for testing. Value in [0,1].",
         default=0.2,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "-K",
         "--kFolds",
         metavar="INT",
@@ -452,7 +452,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help="K that is used for K-fold cross-validation in the training procedure.",
         default=1,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "-v",
         "--verbose",
         metavar="INT",
@@ -462,19 +462,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         + "1: Some additional output, 2: full additional output",
         default=2,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--trainAC",
         action="store_true",
         help="Choose to train or not, the autoencoder based on the input file",
         default=False,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--trainFNN",
         action="store_false",
         help="When called it deactivates the training.",
         default=True,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--sampleFractionOnes",
         metavar="FLOAT",
         type=float,
@@ -482,14 +482,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         "only works if --sampleDown is enabled",
         default=0.5,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--sampleDown",
         metavar="BOOL",
         type=bool,
         help="Enable automatic down sampling of the 0 valued samples.",
         default=False,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "-e",
         "--epochs",
         metavar="INT",
@@ -498,7 +498,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         default=100,
     )
     # TODO CHECK IF ALL LOSSES MAKE SENSE HERE
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--lossFunction",
         metavar="STRING",
         type=str,
@@ -507,7 +507,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         default="bce",
     )
     # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS?
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--optimizer",
         metavar="STRING",
         type=str,
@@ -515,42 +515,42 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         help='Optimizer to use for backpropagation in the FNN. Possible values: "Adam", "SGD"',
         default="Adam",
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--batchSize",
         metavar="INT",
         type=int,
         help="Batch size in FNN training.",
         default=128,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--l2reg",
         metavar="FLOAT",
         type=float,
         help="Value for l2 kernel regularizer.",
         default=0.001,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--dropout",
         metavar="FLOAT",
         type=float,
         help="The fraction of data that is dropped out in each dropout layer.",
         default=0.2,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--learningRate",
         metavar="FLOAT",
         type=float,
         help="Learning rate size in FNN training.",
         default=0.000022,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--learningRateDecay",
         metavar="FLOAT",
         type=float,
         help="Learning rate size in FNN training.",
         default=0.96,
     )
-    training_args.add_argument(
+    input_tain_training_args.add_argument(
         "--activationFunction",
         metavar="STRING",
         type=str,
@@ -559,21 +559,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
         default="relu",
     )
     # Tracking Configuration
-    tracking_args.add_argument(
+    input_tain_tracking_args.add_argument(
         "--aeWabTracking",
         metavar="BOOL",
         type=bool,
         help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.",
         default=False,
     )
-    tracking_args.add_argument(
+    input_tain_tracking_args.add_argument(
         "--wabTracking",
         metavar="BOOL",
         type=bool,
         help="Track FNN performance via Weights & Biases, see https://wandb.ai.",
         default=False,
     )
-    tracking_args.add_argument(
+    input_tain_tracking_args.add_argument(
         "--wabTarget",
         metavar="STRING",
         type=str,
@@ -583,23 +583,23 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
     )
 
 
-def parseInputPredict(parser: argparse.ArgumentParser) -> None:
+def parseInputPredict(parser_input_predict: argparse.ArgumentParser) -> None:
     """
     Parse the input arguments.
 
     :return: A namespace object built up from attributes parsed out of the cmd line.
     """
 
-    general_args = parser.add_argument_group("General Configuration")
-    files_args = parser.add_argument_group("Files")
-    files_args.add_argument(
+    input_predict_general_args = parser_input_predict.add_argument_group("General Configuration")
+    input_predict_files_args = parser_input_predict.add_argument_group("Files")
+    input_predict_files_args.add_argument(
         "-f",
         "--configFile",
         metavar="FILE",
         type=str,
         help="Input JSON file that contains all information for training/predicting.",
     )
-    files_args.add_argument(
+    input_predict_files_args.add_argument(
         "-i",
         "--inputFile",
         metavar="FILE",
@@ -614,7 +614,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         "A header is expected and respective column names are used.",
         default="tests/data/smiles.csv",
     )
-    files_args.add_argument(
+    input_predict_files_args.add_argument(
         "-o",
         "--outputDir",
         metavar="DIR",
@@ -623,7 +623,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         "with --outputFile.",
         default="example/results_predict/",
     )
-    files_args.add_argument(
+    input_predict_files_args.add_argument(
         "--outputFile",
         metavar="FILE",
         type=str,
@@ -632,7 +632,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         default="results.csv",
     )
     # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES?
-    general_args.add_argument(
+    input_predict_general_args.add_argument(
         "-t",
         "--type",
         metavar="STR",
@@ -641,7 +641,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
         default="fp",
     )
-    general_args.add_argument(
+    input_predict_general_args.add_argument(
         "-k",
         "--fpType",
         metavar="STR",
@@ -650,7 +650,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.",
         default="topological",
     )
-    files_args.add_argument(
+    input_predict_files_args.add_argument(
         "--ecModelDir",
         type=str,
         metavar="DIR",
@@ -658,14 +658,14 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         "loaded from (if trainAE=False). Provide a full path here.",
         default="",
     )
-    files_args.add_argument(
+    input_predict_files_args.add_argument(
         "--ecWeightsFile",
         type=str,
         metavar="STR",
         help="The file  where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.",
         default="",
     )
-    files_args.add_argument(
+    input_predict_files_args.add_argument(
         "--fnnModelDir",
         type=str,
         metavar="DIR",
@@ -673,29 +673,27 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
         "Provide a full path here.",
         default="example/results_train/AR_saved_model",
     )
-    general_args.add_argument(
+    input_predict_general_args.add_argument(
         "-c", "--compressFeatures", action="store_true", default=False
     )
-    (
-        general_args.add_argument(
-            "--aeType",
-            metavar="STRING",
-            type=str,
-            choices=["variational", "deterministic"],
-            help="Autoencoder type, variational or deterministic.",
-            default="deterministic",
-        )
-    )
-
-
-def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
-    general_args = parser.add_argument_group("General Configuration")
-    data_args = parser.add_argument_group("Data Configuration")
-    files_args = parser.add_argument_group("Files")
-    model_args = parser.add_argument_group("Model arguments")
-    training_args = parser.add_argument_group("Training Configuration")
-    uncertainty_args = parser.add_argument_group("Uncertainty Configuration")
-    uncertainty_args.add_argument(
+    input_predict_general_args.add_argument(
+        "--aeType",
+        metavar="STRING",
+        type=str,
+        choices=["variational", "deterministic"],
+        help="Autoencoder type, variational or deterministic.",
+        default="deterministic",
+    )
+
+
+def parseTrainGnn(parser_train_gnn: argparse.ArgumentParser) -> None:
+    train_gnn_general_args = parser_train_gnn.add_argument_group("General Configuration")
+    train_gnn_data_args = parser_train_gnn.add_argument_group("Data Configuration")
+    train_gnn_files_args = parser_train_gnn.add_argument_group("Files")
+    train_gnn_model_args = parser_train_gnn.add_argument_group("Model arguments")
+    train_gnn_training_args = parser_train_gnn.add_argument_group("Training Configuration")
+    train_gnn_uncertainty_args = parser_train_gnn.add_argument_group("Uncertainty Configuration")
+    train_gnn_uncertainty_args.add_argument(
         "--uncertainty_method",
         type=str,
         metavar="STRING",
@@ -713,7 +711,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         default="none",
     )
     # Uncertainty arguments
-    uncertainty_args.add_argument(
+    train_gnn_uncertainty_args.add_argument(
         "--calibration_method",
         type=str,
         metavar="STRING",
@@ -728,7 +726,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         help="Method to use for calibration",
         default="none",
     )
-    uncertainty_args.add_argument(
+    train_gnn_uncertainty_args.add_argument(
         "--calibration_path",
         type=str,
         metavar="FILE",
@@ -736,47 +734,47 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     )
 
     # General arguments
-    general_args.add_argument("--split_key_molecule", type=int)
-    general_args.add_argument("--pytorch_seed", type=int)
-    general_args.add_argument("--cache_cutoff", type=float)
-    general_args.add_argument("--save_preds", type=bool)
-    general_args.add_argument("--wabTracking", action="store_true", default=False)
-    general_args.add_argument(
+    train_gnn_general_args.add_argument("--split_key_molecule", type=int)
+    train_gnn_general_args.add_argument("--pytorch_seed", type=int)
+    train_gnn_general_args.add_argument("--cache_cutoff", type=float)
+    train_gnn_general_args.add_argument("--save_preds", type=bool)
+    train_gnn_general_args.add_argument("--wabTracking", action="store_true", default=False)
+    train_gnn_general_args.add_argument(
         "--cuda", action="store_true", default=False, help="Turn on cuda"
     )
-    # general_args.add_argument(
+    # train_gnn_general_args.add_argument(
     #     "--save_smiles_splits",
     #     action="store_true",
     #     default=False,
     #     help="Save smiles for each train/val/test splits for prediction convenience later",
     # )
-    general_args.add_argument(
+    train_gnn_general_args.add_argument(
         "--test",
         action="store_true",
         default=False,
         help="Whether to skip training and only test the model",
     )
-    general_args.add_argument(
+    train_gnn_general_args.add_argument(
         "--gpu",
         type=int,
         choices=list(range(torch.cuda.device_count())),
         help="Which GPU to use",
     )
-    general_args.add_argument("--save", type=bool)
-    general_args.add_argument(
+    train_gnn_general_args.add_argument("--save", type=bool)
+    train_gnn_general_args.add_argument(
         "--quiet",
         action="store_true",
         default=False,
         help="Skip non-essential print statements",
     )
-    general_args.add_argument(
+    train_gnn_general_args.add_argument(
         "--log_frequency",
         type=int,
         metavar="INT",
         default=10,
         help="The number of batches between each logging of the training loss",
     )
-    general_args.add_argument(
+    train_gnn_general_args.add_argument(
         "--no_cache",
         action="store_true",
         default=False,
@@ -784,21 +782,21 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     )
 
     # FILES ARGUMENTS
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "-f",
         "--configFile",
         metavar="FILE",
         type=str,
         help="Input JSON file that contains all information for training/predicting.",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--save_dir",
         type=str,
         metavar="DIR",
         default="./ckpt/",
         help="Directory where model checkpoints will be saved",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--checkpoint_dir",
         type=str,
         metavar="DIR",
@@ -806,14 +804,14 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         help="Directory from which to load model checkpoints"
         "(walks directory and ensembles all models that are found)",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--checkpoint_path",
         type=str,
         metavar="FILE",
         default=None,
         help="Path to model checkpoint (.pt file)",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--checkpoint_paths",
         type=str,
         metavar="FILE",
@@ -821,73 +819,73 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         default=None,
         help="Path to model checkpoint (.pt file)",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_val_path",
         type=str,
         metavar="FILE",
         help="Path to separate val set, optional",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_val_features_path",
         type=str,
         metavar="FILE",
         nargs="*",
         help="Path to file with features for separate val set",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_test_path",
         type=str,
         metavar="FILE",
         help="Path to separate test set, optional",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_test_features_path",
         type=str,
         metavar="FILE",
         nargs="*",
         help="Path to file with features for separate test set",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--folds_file",
         type=str,
         metavar="FILE",
         default=None,
         help="Optional file of fold labels",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--val_fold_index",
         type=int,
         metavar="INT",
         default=None,
         help="Which fold to use as val for cross val",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--test_fold_index",
         type=int,
         metavar="INT",
         default=None,
         help="Which fold to use as test for cross val",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--crossval_index_dir",
         type=str,
         metavar="DIR",
         help="Directory in which to find cross validation index files",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--crossval_index_file",
         type=str,
         metavar="FILE",
         help="Indices of files to use as train/val/test"
         "Overrides --num_folds and --seed.",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--data_weights_path",
         type=str,
         metavar="FILE",
         help="Path where the data weight are saved",
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--features_path",
         type=str,
         metavar="FILE",
@@ -895,47 +893,47 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         help="Path to features to use in FNN (instead of features_generator)",
     )
 
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_val_phase_features_path", type=str, metavar="FILE"
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_test_phase_features_path", type=str, metavar="FILE"
     )
 
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_val_atom_descriptors_path", type=str, metavar="FILE"
     )
-    files_args.add_argument(
+    train_gnn_files_args.add_argument(
         "--separate_test_atom_descriptors_path", type=str, metavar="FILE"
     )
     # Data related arguments
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--data_path",
         type=str,
         metavar="FILE",
         help="Path to data CSV file",
         default="",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--use_compound_names",
         action="store_true",
         default=False,
         help="Use when test data file contains compound names in addition to SMILES strings",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--max_data_size",
         type=int,
         metavar="INT",
         help="Maximum number of data points to load",
     )
 
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--features_only",
         action="store_true",
         default=False,
         help="Use only the additional features in an FFN, no graph network",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--dataset_type",
         type=str,
         metavar="STRING",
@@ -944,14 +942,14 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         "This determines the loss function used during training.",
         default="regression",
     )  # classification
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--multiclass_num_classes",
         type=int,
         metavar="INT",
         default=3,
         help="Number of classes when running multiclass classification",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--split_type",
         type=str,
         metavar="STRING",
@@ -965,7 +963,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         ],
         help="Method of splitting the data into train/val/test",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--split_sizes",
         type=float,
         metavar="FLOAT",
@@ -974,7 +972,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         help="Split proportions for train/validation/test sets",
     )
 
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--seed",
         type=int,
         default=0,
@@ -982,42 +980,42 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         "When `num_folds` > 1, the first fold uses this seed and all"
         "subsequent folds add 1 to the seed.",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--smiles_columns",
         type=str,
         metavar="STRING",
         help="Name of the smiles columns",
     )
 
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--target_columns",
         type=str,
         metavar="STRING",
         help="Name of the target columns",
     )
 
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--ignore_columns",
         type=str,
         metavar="STRING",
         help="Names of the columns to ignore",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--num_tasks", type=int, metavar="INT", help="NUmber of tasks"
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--no_features_scaling",
         action="store_true",
         default=False,
         help="Turn off scaling of features",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--features_scaling",
         action="store_true",
         default=False,
         help="Turn on scaling of features",
     )
-    data_args.add_argument(
+    train_gnn_data_args.add_argument(
         "--use_input_features",
         type=str,
         metavar="STRING",
@@ -1025,41 +1023,41 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     )
 
     # Model arguments
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--ensemble_size",
         type=int,
         metavar="INT",
         default=1,
         help="Number of models in ensemble",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--hidden_size",
         type=int,
         metavar="INT",
         default=300,
         help="Dimensionality of hidden layers in MPN",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--bias",
         action="store_true",
         default=False,
         help="Whether to add bias to linear layers",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--depth",
         type=int,
         metavar="INT",
         default=3,
         help="Number of message passing steps",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--dropout",
         type=float,
         metavar="FLOAT",
         default=0.0,
         help="Dropout probability",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--activation",
         type=str,
         metavar="STRING",
@@ -1067,81 +1065,81 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"],
         help="Activation function",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--undirected",
         action="store_true",
         default=False,
         help="Undirected edges (always sum the two relevant bond vectors)",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--ffn_hidden_size",
         type=int,
         metavar="INT",
         default=2,
         help="Hidden dim for higher-capacity FFN (defaults to hidden_size)",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--ffn_num_layers",
         type=int,
         metavar="INT",
         default=2,
         help="Number of layers in FFN after MPN encoding",
     )
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--atom_messages",
         action="store_true",
         default=False,
         help="Use messages on atoms instead of messages on bonds",
     )
 
-    model_args.add_argument(
+    train_gnn_model_args.add_argument(
         "--num_lrs",
         type=int,
         metavar="INT",
         default=2,
         help="Number of layers in FFN after MPN encoding",
     )
-    model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING")
+    train_gnn_model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING")
 
     # Model arguments
-    model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL")
-    model_args.add_argument(
+    train_gnn_model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL")
+    train_gnn_model_args.add_argument(
         "--show_individual_scores",
         action="store_true",
         default=True,
         help="Show all scores for individual targets, not just average, at the end",
     )
-    model_args.add_argument("--aggregation", choices=["mean", "sum", "norm"])
-    model_args.add_argument("--aggregation_norm", type=int)
-    model_args.add_argument("--explicit_h", type=bool, metavar="BOOL")
-    model_args.add_argument("--adding_h", type=bool, metavar="BOOL")
+    train_gnn_model_args.add_argument("--aggregation", choices=["mean", "sum", "norm"])
+    train_gnn_model_args.add_argument("--aggregation_norm", type=int)
+    train_gnn_model_args.add_argument("--explicit_h", type=bool, metavar="BOOL")
+    train_gnn_model_args.add_argument("--adding_h", type=bool, metavar="BOOL")
     # Training arguments
-    model_args.add_argument("--class_balance", type=bool, metavar="BOOL")
-    model_args.add_argument("--evidential_regularization", type=float, metavar="FLOAT")
-    model_args.add_argument(
+    train_gnn_model_args.add_argument("--class_balance", type=bool, metavar="BOOL")
+    train_gnn_model_args.add_argument("--evidential_regularization", type=float, metavar="FLOAT")
+    train_gnn_model_args.add_argument(
         "--overwrite_default_atom_features", type=bool, metavar="BOOL"
     )
-    model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL")
-    model_args.add_argument(
+    train_gnn_model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL")
+    train_gnn_model_args.add_argument(
         "--overwrite_default_bond_features", type=bool, metavar="BOOL"
     )
-    model_args.add_argument("--frzn_ffn_layers", type=int, metavar="INT")
-    model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL")
+    train_gnn_model_args.add_argument("--frzn_ffn_layers", type=int, metavar="INT")
+    train_gnn_model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL")
     # Training arguments
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run"
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--total_epochs",
         type=int,
         metavar="INT",
         default=30,
         help="Number of total epochs to run",
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--batch_size", type=int, metavar="INT", default=50, help="Batch size"
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--warmup_epochs",
         type=int,
         metavar="INT",
@@ -1150,35 +1148,35 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         "init_lr to max_lr. Afterwards, learning rate decreases exponentially"
         "from max_lr to final_lr.",
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--init_lr",
         type=float,
         metavar="FLOAT",
         default=1e-4,
         help="Initial learning rate",
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--max_lr",
         type=float,
         metavar="FLOAT",
         default=1e-3,
         help="Maximum learning rate",
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--final_lr",
         type=float,
         metavar="FLOAT",
         default=1e-4,
         help="Final learning rate",
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--extra_metrics",
         type=str,
         metavar="STRING",
         nargs="*",
         help="Extra metrics to use",
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--loss_function",
         type=str,
         metavar="STRING",
@@ -1195,8 +1193,8 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
             "dirichlet",
         ],
     )
-    training_args.add_argument("--grad_clip", type=float)
-    training_args.add_argument(
+    train_gnn_training_args.add_argument("--grad_clip", type=float)
+    train_gnn_training_args.add_argument(
         "--metric",
         type=str,
         metavar="STRING",
@@ -1216,7 +1214,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
         "(loss is determined by the `dataset_type` argument)."
         'Note: Defaults to "auc" for classification and "rmse" for regression.',
     )
-    training_args.add_argument(
+    train_gnn_training_args.add_argument(
         "--num_folds",
         type=int,
         metavar="INT",
@@ -1225,23 +1223,23 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None:
     )
 
 
-def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
-    general_args = parser.add_argument_group("General Configuration")
-    files_args = parser.add_argument_group("Files")
-    uncertainty_args = parser.add_argument_group("Uncertainty Configuration")
+def parsePredictGnn(parser_predict_gnn: argparse.ArgumentParser) -> None:
+    predict_gnn_general_args = parser_predict_gnn.add_argument_group("General Configuration")
+    predict_gnn_files_args = parser_predict_gnn.add_argument_group("Files")
+    predict_gnn_uncertainty_args = parser_predict_gnn.add_argument_group("Uncertainty Configuration")
 
-    general_args.add_argument(
+    predict_gnn_general_args.add_argument(
         "--checkpoint_path",
         type=str,
         metavar="FILE",
         help="Path to model checkpoint (.pt file)"
     )
-    # general_args.add_argument(
+    # predict_gnn_general_args.add_argument(
     #     "--no_features_scaling",
     #     action="store_true",
     #     help="Turn on scaling of features",
     # )
-    files_args.add_argument(
+    predict_gnn_files_args.add_argument(
         "-f",
         "--configFile",
         type=str,
@@ -1249,46 +1247,46 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         help="Path to a .json file containing arguments. Any arguments present in the config"
         "file will override arguments specified via the command line or by the defaults.",
     )
-    files_args.add_argument(
+    predict_gnn_files_args.add_argument(
         "--test_path",
         type=str,
         help="Path to CSV file containing testing data for which predictions will be made.",
     )
-    files_args.add_argument(
+    predict_gnn_files_args.add_argument(
         "--preds_path",
         type=str,
         help="Path to CSV or PICKLE file where predictions will be saved.",
     )
-    files_args.add_argument(
+    predict_gnn_files_args.add_argument(
         "--calibration_path",
         type=str,
         help="Path to data file to be used for uncertainty calibration.",
     )
-    files_args.add_argument(
+    predict_gnn_files_args.add_argument(
         "--calibration_features_path",
         type=str,
         nargs="+",
         help="Path to features data to be used with the uncertainty calibration dataset.",
     )
-    files_args.add_argument("--calibration_phase_features_path", type=str, help="")
-    files_args.add_argument(
+    predict_gnn_files_args.add_argument("--calibration_phase_features_path", type=str, help="")
+    predict_gnn_files_args.add_argument(
         "--calibration_atom_descriptors_path",
         type=str,
         help="Path to the extra atom descriptors.",
     )
-    files_args.add_argument(
+    predict_gnn_files_args.add_argument(
         "--calibration_bond_descriptors_path",
         type=str,
         help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.",
     )
 
-    general_args.add_argument(
+    predict_gnn_general_args.add_argument(
         "--drop_extra_columns",
         action="store_true",
         help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.",
     )
 
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--uncertainty_method",
         type=str,
         choices=[
@@ -1304,7 +1302,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         ],
         help="The method of calculating uncertainty.",
     )
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--calibration_method",
         type=str,
         nargs="+",
@@ -1318,40 +1316,40 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
         ],
         help="Methods used for calibrating the uncertainty calculated with uncertainty method.",
     )
-    uncertainty_args.add_argument("--individual_ensemble_predictions",
+    predict_gnn_uncertainty_args.add_argument("--individual_ensemble_predictions",
         action="store_true",
         default=False,
         help="Whether to save individual ensemble predictions.")
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--evaluation_methods",
         type=str,
         nargs="+",
         help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.",
     )
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--evaluation_scores_path",
         type=str,
         help="Location to save the results of uncertainty evaluations.",
     )
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--uncertainty_dropout_p",
         type=float,
         default=0.1,
         help="The probability to use for Monte Carlo dropout uncertainty estimation.",
     )
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--dropout_sampling_size",
         type=int,
         default=10,
         help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.",
     )
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--calibration_interval_percentile",
         type=float,
         default=95,
         help="Sets the percentile used in the calibration methods. Must be in the range (1,100).",
     )
-    uncertainty_args.add_argument(
+    predict_gnn_uncertainty_args.add_argument(
         "--regression_calibrator_metric",
         type=str,
         choices=["stdev", "interval"],
@@ -1359,24 +1357,24 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None:
     )
 
 
-def parseInterpretGnn(parser: argparse.ArgumentParser) -> None:
-    files_args = parser.add_argument_group("Files")
-    interpret_args = parser.add_argument_group("Interpretation Configuration")
-    files_args.add_argument(
+def parseInterpretGnn(parser_interpret_gnn: argparse.ArgumentParser) -> None:
+    interpret_gnn_files_args = parser_interpret_gnn.add_argument_group("Files")
+    interpret_gnn_interpret_args = parser_interpret_gnn.add_argument_group("Interpretation Configuration")
+    interpret_gnn_files_args.add_argument(
         "-f",
         "--configFile",
         metavar="FILE",
         type=str,
         help="Input JSON file that contains all information for interpretation.",
     )
-    files_args.add_argument(
+    interpret_gnn_files_args.add_argument(
         "--preds_path",
         type=str,
         metavar="FILE",
         help="Path to CSV file where predictions will be saved",
         default="",
     )
-    files_args.add_argument(
+    interpret_gnn_files_args.add_argument(
         "--checkpoint_dir",
         type=str,
         metavar="DIR",
@@ -1384,46 +1382,46 @@ def parseInterpretGnn(parser: argparse.ArgumentParser) -> None:
         "(walks directory and ensembles all models that are found)",
         default="./ckpt",
     )
-    files_args.add_argument(
+    interpret_gnn_files_args.add_argument(
         "--checkpoint_path",
         type=str,
         metavar="DIR",
         help="Path to model checkpoint (.pt file)",
     )
-    files_args.add_argument(
+    interpret_gnn_files_args.add_argument(
         "--data_path",
         type=str,
         metavar="FILE",
         help="Path to CSV file containing testing data for which predictions will be made",
     )
-    interpret_args.add_argument(
+    interpret_gnn_interpret_args.add_argument(
         "--max_atoms",
         type=int,
         metavar="INT",
         help="Maximum number of atoms to use for interpretation",
     )
 
-    interpret_args.add_argument(
+    interpret_gnn_interpret_args.add_argument(
         "--min_atoms",
         type=int,
         metavar="INT",
         help="Minimum number of atoms to use for interpretation",
     )
 
-    interpret_args.add_argument(
+    interpret_gnn_interpret_args.add_argument(
         "--prop_delta",
         type=float,
         metavar="FLOAT",
         help="The minimum change in the property of interest that is considered significant",
     )
-    interpret_args.add_argument(
+    interpret_gnn_interpret_args.add_argument(
         "--property_id",
         type=int,
         metavar="INT",
         help="The index of the property of interest",
     )
     # write the argument for rollouts
-    interpret_args.add_argument(
+    interpret_gnn_interpret_args.add_argument(
         "--rollout",
         type=int,
         metavar="INT",
@@ -1431,13 +1429,13 @@ def parseInterpretGnn(parser: argparse.ArgumentParser) -> None:
     )
 
 
-def parseInputConvert(parser: argparse.ArgumentParser) -> None:
+def parseInputConvert(parser_convert: argparse.ArgumentParser) -> None:
     """
     Parse the input arguments.
 
     :return: A namespace object built up from attributes parsed out of the cmd line.
     """
-    parser.add_argument(
+    parser_convert.add_argument(
         "-f",
         metavar="FILE",
         type=str,

From e87be1bf8161a66d9588873832ff47f126847210 Mon Sep 17 00:00:00 2001
From: Matthias Bernt <m.bernt@ufz.de>
Date: Tue, 9 Apr 2024 15:27:12 +0200
Subject: [PATCH 17/19] fixup

---
 dfpl/options.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index fe3e1bb9..a9e56102 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -676,7 +676,6 @@ def parseInputPredict(parser_input_predict: argparse.ArgumentParser) -> None:
         help="Autoencoder type, variational or deterministic.",
         default="deterministic",
     )
-    )
 
 
 def parseTrainGnn(parser_train_gnn: argparse.ArgumentParser) -> None:

From 6b02f284637258b25dd4c76986480e19ee8b3fc2 Mon Sep 17 00:00:00 2001
From: Matthias Bernt <m.bernt@ufz.de>
Date: Tue, 11 Jun 2024 10:54:45 +0200
Subject: [PATCH 18/19] remove redundant info

---
 dfpl/options.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index a9e56102..baa25fda 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -306,7 +306,7 @@ def parseInputTrain(parser_train: argparse.ArgumentParser) -> None:
         "--type",
         type=str,
         choices=["fp", "smiles"],
-        help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
+        help="Type of the chemical representation.",
         default="fp",
     )
     input_tain_general_args.add_argument(
@@ -629,7 +629,7 @@ def parseInputPredict(parser_input_predict: argparse.ArgumentParser) -> None:
         "--type",
         type=str,
         choices=["fp", "smiles"],
-        help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
+        help="Type of the chemical representation.",
         default="fp",
     )
     input_predict_general_args.add_argument(

From a6dedfdd18050cabd7e60ccb13e86b5d8beffeea Mon Sep 17 00:00:00 2001
From: Matthias Bernt <m.bernt@ufz.de>
Date: Tue, 9 Jul 2024 10:04:02 +0200
Subject: [PATCH 19/19] check if options from json are used

instead of defaults
---
 dfpl/options.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dfpl/options.py b/dfpl/options.py
index baa25fda..dd455c11 100644
--- a/dfpl/options.py
+++ b/dfpl/options.py
@@ -19,12 +19,12 @@ class Options:
     """
 
     configFile: str = None
-    inputFile: str = "tests/data/smiles.csv"
-    outputDir: str = "example/results_train/"  # changes according to mode
+    inputFile: str = ""
+    outputDir: str = ""  # changes according to mode
     outputFile: str = "results.csv"
     ecWeightsFile: str = ""
-    ecModelDir: str = "example/results_train/AE_encoder/"
-    fnnModelDir: str = "example/results_train/AR_saved_model/"
+    ecModelDir: str = ""
+    fnnModelDir: str = ""
     type: str = "smiles"
     fpType: str = "topological"  # also "MACCS", "atompairs"
     epochs: int = 100
@@ -85,8 +85,8 @@ class GnnOptions(TrainArgs):
 
     total_epochs: int = 30
     save: bool = True
-    configFile: str = "./example/traingnn.json"
-    data_path: str = "./example/data/tox21.csv"
+    configFile: str = ""
+    data_path: str = ""
     use_compound_names: bool = False
     save_dir: str = ""
     no_cache: bool = False