Source code for alframework.ml_interfaces.neurochem_interface

import numpy as np
import os
import shutil

import anitraintools as alt

from ase_interface import aniensloader
from ase_interface import ANIENS, batchedensemblemolecule

from alframework.tools.tools import build_input_dict

from parsl import python_app, bash_app

import time


[docs]
class NeuroChemTrainer:
    """Interface between ALF and the NeuroChem/ANI ensemble trainer (``anitraintools``)."""

    def __init__(self, ensemble_size, gpuids, force_training=True, periodic=False, rmhighe=False, rmhighf=False,
                 build_test=True, remove_existing=False):
        """
        Args:
            ensemble_size (int): Number of NN to form the ensemble.
            gpuids (list): GPUs identification.
            force_training (bool): If True also trains the NN for forces.
            periodic (bool): True if PBC are considered.
            rmhighe (float or bool): Remove from training configurations with energy higher than 'rmhighe'.
            rmhighf (float or bool): Remove from training configurations with forces higher than 'rmhighf'.
            build_test (bool): If True build a held out test set.
            remove_existing (bool): If True deletes the previous model before training the new one.
        """
        self.ensemble_size = ensemble_size
        self.force_training = force_training
        self.periodic = periodic
        self.gpuids = gpuids
        self.rmhighe = rmhighe
        self.rmhighf = rmhighf
        self.build_test = build_test
        self.remove_existing = remove_existing

    def _prepare_model_dir(self, path):
        """Create an empty model directory at ``path``.

        An already existing directory is wiped when ``self.remove_existing`` is
        set; otherwise it is treated as an error so a trained model is never
        overwritten silently.

        Raises:
            RuntimeError: If ``path`` is an existing directory and
                ``self.remove_existing`` is False.
        """
        if os.path.isdir(path):
            if not self.remove_existing:
                raise RuntimeError("model directory already exists: " + path)
            shutil.rmtree(path)
        # makedirs (rather than mkdir) so that a missing parent directory does
        # not abort the training task with FileNotFoundError.
        os.makedirs(path)

    def train_models(self, tparam):
        """Trains an ensemble of ANI networks.

        Args:
            tparam (dict): Dictionary containing training parameters. The keys read here are
                'ensemble_path', 'data_store', 'seed', 'aev_params' (with 'elements', 'NRrad',
                'Rradcut', 'NArad', 'NAang', 'Aradcut', 'x0'), 'input_params' and 'layers'.

        Returns:
            (tuple): Tuple (all_nets, completed) as returned by ``anitraintools.get_train_stats``: the first
                     element contains training information of the networks and the second element tells us
                     whether training was successfully completed or not.

        Raises:
            RuntimeError: If the model directory already exists and ``remove_existing`` is False.
        """
        print('Trainer:')
        print(tparam['ensemble_path'], tparam['data_store'], tparam['seed'])

        ndir = tparam['ensemble_path']
        self._prepare_model_dir(ndir)

        # Setup AEV parameters and write the parameter file into the model directory
        aevparams = tparam['aev_params']
        prm = alt.anitrainerparamsdesigner(aevparams['elements'],
                                           aevparams['NRrad'],
                                           aevparams['Rradcut'],
                                           aevparams['NArad'],
                                           aevparams['NAang'],
                                           aevparams['Aradcut'],
                                           aevparams['x0'])
        prm.create_params_file(ndir)

        # Input parameters: two fixed entries first, then the user supplied ones
        # (which may therefore override them). Values are passed as strings.
        ipt = alt.anitrainerinputdesigner()
        ipt.set_parameter('atomEnergyFile', 'sae_linfit.dat')
        ipt.set_parameter('sflparamsfile', prm.get_filename())
        for key, value in tparam['input_params'].items():
            ipt.set_parameter(key, str(value))

        # Set network layers, element by element
        for element_key, element_layers in tparam['layers'].items():
            for layer_params in element_layers:
                ipt.add_layer(element_key, layer_params)

        netdict = {'cnstfile': ndir + '/' + prm.get_filename(),
                   'saefile': ndir + '/sae_linfit.dat',
                   'iptsize': prm.get_aev_size(),
                   'atomtyp': prm.params['elm']}

        # NOTE: this reseeds numpy's *global* RNG on purpose; it is left in place because
        # code called below may draw from the global state. Only the first of the two
        # derived seeds is consumed here.
        np.random.seed(tparam['seed'])
        local_seeds = np.random.randint(0, 2 ** 32, size=2)
        print('local seeds:', local_seeds)

        # Declare the training class for all ranks
        ens = alt.alaniensembletrainer(ndir + '/',
                                       netdict,
                                       ipt,
                                       tparam['data_store'],
                                       self.ensemble_size, random_seed=local_seeds[0])

        # Build training cache
        # NOTE(review): the leading 16, 1, 1 are hard-coded; they should probably be exposed to the user.
        ens.build_strided_training_cache(16, 1, 1, build_test=self.build_test, Ekey='energy',
                                         forces=self.force_training, grad=False, Fkey='forces',
                                         dipole=False,
                                         rmhighf=self.rmhighf, rmhighe=self.rmhighe, pbc=self.periodic)

        ens.train_ensemble(self.gpuids, self.remove_existing)

        all_nets, completed = alt.get_train_stats(self.ensemble_size, ndir + '/')

        return all_nets, completed




[docs]
def NeuroChemCalculator(model_details):
    """Build an ASE calculator backed by an ensemble of ANI neural networks.

    Args:
        model_details (dict): Dictionary containing the details of the model. The keys read are
            'model_path' (directory holding the trained ensemble), 'Nn' and 'gpu'.

    Returns:
        (ase_interface.ANIENS): Calculator wrapping the ``batchedensemblemolecule`` built from the model directory.

    Raises:
        IndexError: If no file whose name contains '.params' exists in the model directory.

    """
    model_dir = model_details['model_path']

    # The constants file is the first directory entry whose name contains '.params'.
    params_candidates = []
    for entry in os.listdir(model_dir):
        if '.params' in entry:
            params_candidates.append(entry)
    constants_name = params_candidates[0]

    ensemble_size = model_details['Nn']

    # Restrict CUDA to the requested device before the ensemble is constructed.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(model_details['gpu'])

    # NOTE(review): the trailing 0 is presumably a device index relative to the
    # visible devices set above -- confirm against ase_interface.
    molecule = batchedensemblemolecule(
        model_dir + '/' + constants_name,
        model_dir + '/' + 'sae_linfit.dat',
        model_dir,
        ensemble_size,
        0,
    )
    return ANIENS(molecule)


#Gen
#configuration,h5_train_dir,ensemble_path,ensemble_index,remove_existing=False,h5_test_dir=None)
@python_app(executors=['alf_ML_executor'])
def train_ANI_model_task(ML_config, h5_dir, model_path, current_training_id, gpus_per_node, remove_existing=False, h5_test_dir=None):
    """ANI executor task.

    Args:
        ML_config (dict): ANI parameters as defined in the ML config json. A shallow copy is taken, so the
            caller's top-level keys are not modified.
        h5_dir (str): Path to the directory containing the h5 files.
        model_path (str): Path of the ML models; a format string that is filled with ``current_training_id``.
        current_training_id (int): Current model number.
        gpus_per_node (int): Number of GPUs per node; GPU ids ``0 .. gpus_per_node - 1`` are handed to the trainer.
        remove_existing (bool): If True deletes the previous model before training the new one. When False the
            value found in ``ML_config`` (if any) or the trainer default applies.
        h5_test_dir (str): Path to the directory containing the h5 test data. Currently unused by this task.

    Returns:
        (tuple): Tuple whose first element tell us if the training was successfully completed and second element the
                 id of the ensemble model.

    """
    configuration = ML_config.copy()

    # Collect the NeuroChemTrainer constructor arguments from the GPU list and the ML configuration.
    nct_input = build_input_dict(NeuroChemTrainer.__init__, [{"gpuids": list(range(gpus_per_node))}, configuration])
    trainer_kwargs = dict(nct_input)
    if remove_existing:
        # Honour the explicit request of the caller; previously this argument was silently ignored.
        trainer_kwargs['remove_existing'] = True
    nct = NeuroChemTrainer(**trainer_kwargs)

    # The trainer reads these run-specific entries from the same dictionary as the ML parameters.
    configuration['ensemble_path'] = model_path.format(current_training_id)
    configuration['data_store'] = h5_dir
    # A fresh seed is drawn for every training task.
    configuration['seed'] = np.random.randint(10 ** 8)

    # The per-network training information is not needed by the caller.
    _all_nets, completed = nct.train_models(configuration)

    return completed, current_training_id