selene/selene_sdk/utils/config_utils.py at df6e53a5874a8be2b5473db905501c7c80130a89 · FunctionLab/selene · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
"""
Utilities for loading configurations, instantiating Python objects, and
running operations in _Selene_.

"""
import os
import importlib
import sys
from time import strftime
import types
import random
import shutil, yaml

import numpy as np
import torch

from . import _is_lua_trained_model
from . import instantiate
from . import load_path

from selene_sdk import version

def class_instantiate(classobj):
    """Not used currently, but might be useful later for recursive
    class instantiation
    """
    for attr, obj in classobj.__dict__.items():
        is_module = getattr(obj, '__module__', None)
        if is_module and "selene_sdk" in is_module and attr is not "model":
            class_instantiate(obj)
    classobj.__init__(**classobj.__dict__)


def module_from_file(path):
    """
    Load a module created based on a Python file path.

    Parameters
    ----------
    path : str
        Path to the model architecture file.

    Returns
    -------
    The loaded module

    """
    parent_path, module_file = os.path.split(path)
    loader = importlib.machinery.SourceFileLoader(
        module_file[:-3], path)
    module = types.ModuleType(loader.name)
    loader.exec_module(module)
    return module


def module_from_dir(path):
    """
    This method expects that you pass in the path to a valid Python module,
    where the `__init__.py` file already imports the model class,
    `criterion`, and `get_optimizer` methods from the appropriate file
    (e.g. `__init__.py` contains the line `from <model_class_file> import
    <ModelClass>`).

    Parameters
    ----------
    path : str
        Path to the Python module containing the model class.

    Returns
    -------
    The loaded module
    """
    parent_path, module_dir = os.path.split(path)
    sys.path.insert(0, parent_path)
    return importlib.import_module(module_dir)


def initialize_model(model_configs, train=True, lr=None):
    """
    Initialize model (and associated criterion, optimizer)

    Parameters
    ----------
    model_configs : dict
        Model-specific configuration
    train : bool, optional
        Default is True. If `train`, returns the user-specified optimizer
        and optimizer class that can be found within the input model file.
    lr : float or None, optional
        If `train`, a learning rate must be specified. Otherwise, None.

    Returns
    -------
    model, criterion : tuple(torch.nn.Module, torch.nn._Loss) or \
            model, criterion, optim_class, optim_kwargs : \
                tuple(torch.nn.Module, torch.nn._Loss, torch.optim, dict)

        * `torch.nn.Module` - the model architecture
        * `torch.nn._Loss` - the loss function associated with the model
        * `torch.optim` - the optimizer associated with the model
        * `dict` - the optimizer arguments

        The optimizer and its arguments are only returned if `train` is
        True.

    Raises
    ------
    ValueError
        If `train` but the `lr` specified is not a float.

    """
    import_model_from = model_configs["path"]
    model_class_name = model_configs["class"]

    module = None
    if os.path.isdir(import_model_from):
        import_model_from = import_model_from.rstrip(os.sep)
        module = module_from_dir(import_model_from)
    else:
        module = module_from_file(import_model_from)
    model_class = getattr(module, model_class_name)

    model = model_class(**model_configs["class_args"])
    if "non_strand_specific" in model_configs:
        from selene_sdk.utils import NonStrandSpecific
        model = NonStrandSpecific(
            model, mode=model_configs["non_strand_specific"])

    _is_lua_trained_model(model)
    criterion = module.criterion()
    if train and isinstance(lr, float):
        optim_class, optim_kwargs = module.get_optimizer(lr)
        return model, criterion, optim_class, optim_kwargs
    elif train:
        raise ValueError("Learning rate must be specified as a float "
                         "but was {0}".format(lr))
    return model, criterion


def execute(operations, configs, output_dir):
    """
    Execute operations in _Selene_.

    Parameters
    ----------
    operations : list(str)
        The list of operations to carry out in _Selene_.
    configs : dict or object
        The loaded configurations from a YAML file.
    output_dir : str or None
        The path to the directory where all outputs will be saved.
        If None, this means that an `output_dir` was not specified
        in the top-level configuration keys. `output_dir` must be
        specified in each class's individual configuration wherever
        it is required.

    Returns
    -------
    None
        Executes the operations listed and outputs any files
        to the dirs specified in each operation's configuration.

    Raises
    ------
    ValueError
        If an expected key in configuration is missing.

    """
    model = None
    train_model = None
    for op in operations:
        if op == "train":
            model, loss, optim, optim_kwargs = initialize_model(
                configs["model"], train=True, lr=configs["lr"])

            sampler_info = configs["sampler"]
            if output_dir is not None:
                sampler_info.bind(output_dir=output_dir)
            sampler = instantiate(sampler_info)
            train_model_info = configs["train_model"]
            train_model_info.bind(model=model,
                                  data_sampler=sampler,
                                  loss_criterion=loss,
                                  optimizer_class=optim,
                                  optimizer_kwargs=optim_kwargs)
            if output_dir is not None:
                train_model_info.bind(output_dir=output_dir)
            if "random_seed" in configs:
                train_model_info.bind(deterministic=True)

            train_model = instantiate(train_model_info)
            # TODO: will find a better way to handle this in the future
            if "load_test_set" in configs and configs["load_test_set"] and \
                    "evaluate" in operations:
                train_model.create_test_set()
            train_model.train_and_validate()

        elif op == "evaluate":
            if train_model is not None:
                train_model.evaluate()

            if not model:
                model, loss = initialize_model(
                    configs["model"], train=False)
            if "evaluate_model" in configs:
                sampler_info = configs["sampler"]
                sampler = instantiate(sampler_info)
                evaluate_model_info = configs["evaluate_model"]
                evaluate_model_info.bind(
                    model=model,
                    criterion=loss,
                    data_sampler=sampler)
                if output_dir is not None:
                    evaluate_model_info.bind(output_dir=output_dir)

                evaluate_model = instantiate(evaluate_model_info)
                evaluate_model.evaluate()

        elif op == "analyze":
            if not model:
                model, _ = initialize_model(
                    configs["model"], train=False)
            analyze_seqs_info = configs["analyze_sequences"]
            analyze_seqs_info.bind(model=model)

            analyze_seqs = instantiate(analyze_seqs_info)
            if "variant_effect_prediction" in configs:
                vareff_info = configs["variant_effect_prediction"]
                if "vcf_files" not in vareff_info:
                    raise ValueError("variant effect prediction requires "
                                     "as input a list of 1 or more *.vcf "
                                     "files ('vcf_files').")
                for filepath in vareff_info.pop("vcf_files"):
                    analyze_seqs.variant_effect_prediction(
                        filepath, **vareff_info)
            if "in_silico_mutagenesis" in configs:
                ism_info = configs["in_silico_mutagenesis"]
                if "sequence" in ism_info:
                    analyze_seqs.in_silico_mutagenesis(**ism_info)
                elif "input_path" in ism_info:
                    analyze_seqs.in_silico_mutagenesis_from_file(**ism_info)
                elif "fa_files" in ism_info:
                    for filepath in ism_info.pop("fa_files"):
                        analyze_seqs.in_silico_mutagenesis_from_file(
                            filepath, **ism_info)
                else:
                    raise ValueError("in silico mutagenesis requires as input "
                                     "the path to the FASTA file "
                                     "('input_path') or a sequence "
                                     "('input_sequence') or a list of "
                                     "FASTA files ('fa_files'), but found "
                                     "neither.")
            if "prediction" in configs:
                predict_info = configs["prediction"]
                analyze_seqs.get_predictions(**predict_info)


def parse_configs_and_run(configs,
                          create_subdirectory=True,
                          lr=None):
    """
    Method to parse the configuration YAML file and run each operation
    specified.

    Parameters
    ----------
    configs : str or dict
        If it is a str,  then configs is the name of the configuration YAML file, from which we will read
                         nested configuration parameters.
        If it is a dict, then configs is a dict storing nested configuration parameters.
        Will look for the following top-level parameters:

            * `ops`: A list of 1 or more of the values \
            {"train", "evaluate", "analyze"}. The operations specified\
            determine what objects and information we expect to parse\
            in order to run these operations. This is required.
            * `output_dir`: Output directory to use for all the operations.\
            If no `output_dir` is specified, assumes that all constructors\
            that will be initialized (which have their own configurations\
            in `configs`) have their own `output_dir` specified.\
            Optional.
            * `random_seed`: A random seed set for `torch` and `torch.cuda`\
            for reproducibility. Optional.
            * `lr`: The learning rate, if one of the operations in the list is\
            "train".
            * `load_test_set`: If `ops: [train, evaluate]`, you may set\
               this parameter to True if you would like to load the test\
               set into memory ahead of time--and therefore save the test\
               data to a .bed file at the start of training. This is only\
               useful if you have a machine that can support a large increase\
               (on the order of GBs) in memory usage and if you want to\
               create a test dataset early-on because you do not know if your\
               model will finish training and evaluation within the allotted\
               time that your job is run.

    create_subdirectory : bool, optional
        Default is True. If `create_subdirectory`, will create a directory
        within `output_dir` with the name formatted as "%Y-%m-%d-%H-%M-%S",
        the date/time this method was run.
    lr : float or None, optional
        Default is None. If "lr" (learning rate) is already specified as a
        top-level key in `configs`, there is no need to set `lr` to a value
        unless you want to override the value in `configs`. Otherwise,
        set `lr` to the desired learning rate if "train" is one of the
        operations to be executed.

    Returns
    -------
    None
        Executes the operations listed and outputs any files
        to the dirs specified in each operation's configuration.

    """
    if isinstance(configs, str):
        configs_file = configs
        if not os.path.isfile(configs_file):
            print("The configuration YAML file {} does not exist!".format(configs_file))
            return
        configs = load_path(configs_file, instantiate=False)
    operations = configs["ops"]

    #print selene_sdk version
    if "selene_sdk_version" not in configs:
        configs["selene_sdk_version"] = version.__version__
        print("Running with selene_sdk version {0}".format(version.__version__))

    if "train" in operations and "lr" not in configs and lr != None:
        configs["lr"] = float(lr)
    elif "train" in operations and "lr" in configs and lr != None:
        print("Warning: learning rate specified in both the "
              "configuration dict and this method's `lr` parameter. "
              "Using the `lr` value input to `parse_configs_and_run` "
              "({0}, not {1}).".format(lr, configs["lr"]))
    elif "train" in operations and "lr" not in configs and lr == None:
        raise ValueError("Learning rate not specified, cannot "
                         "fit model. Exiting.")

    current_run_output_dir = None
    if "output_dir" not in configs and \
            ("train" in operations or "evaluate" in operations):
        print("No top-level output directory specified. All constructors "
              "to be initialized (e.g. Sampler, TrainModel) that require "
              "this parameter must have it specified in their individual "
              "parameter configuration.")
    elif "output_dir" in configs:
        current_run_output_dir = configs["output_dir"]
        os.makedirs(current_run_output_dir, exist_ok=True)
        if "create_subdirectory" in configs:
            create_subdirectory = configs["create_subdirectory"]
        if create_subdirectory:
            rand_str = str(random.random())[2:]
            current_run_output_dir = os.path.join(
                current_run_output_dir, '{}-{}'.format(strftime("%Y-%m-%d-%H-%M-%S"), rand_str))
            os.makedirs(current_run_output_dir)
        print("Outputs and logs saved to {0}".format(
            current_run_output_dir))

    if "random_seed" in configs:
        seed = configs["random_seed"]
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        #torch.backends.cudnn.deterministic = True
        #torch.backends.cudnn.benchmark = False
        print("Setting random seed = {0}".format(seed))
    else:
        print("Warning: no random seed specified in config file. "
              "Using a random seed ensures results are reproducible.")

    if current_run_output_dir:
        # write configs to output directory
        with open('{}/{}'.format(current_run_output_dir,'configs.yaml'), 'w') as f:
             yaml.dump(configs, f, default_flow_style=None)
        # copy model file or directory to output
        model_input = configs['model']['path']
        if os.path.isdir(model_input): # copy the directory
            shutil.copytree (model_input,
                             os.path.join(current_run_output_dir, os.path.basename(import_model_from)),
                             dirs_exist_ok=True)
        else:
            shutil.copy     (model_input, current_run_output_dir)


    execute(operations, configs, current_run_output_dir)