Source code for chemper.smirksify

#!/usr/bin/env python

#=============================================================================================
# MODULE DOCSTRING
#=============================================================================================

"""
smirksify.py

In this script, we start with a set of clustered molecular fragments with specified
indexed atoms as those you would use to build a ClusterGraph.
We then build cluster Graphs to create the initial SMIRKS patterns and check
that the generated SMIRKS patterns retain the typing from the input cluster.
Next we run a series of iterations removing SMIRKS decorators.
If this "move" doesn't change the way the molecules are typed then the change is accepted.

This class takes inspiration from the tool SMIRKY previously published by the
Open Force Field Initiative:
github.com/openforcefield/smarty

In theory, it is possible this process of removing decorators could be more
systematic/deterministic, however this is a first approach to see
if extracted SMIRKS patterns can do better than SMIRKY.
Also, this approach will be more general since the input clusters do not
rely on a reference force field.
"""
#=============================================================================================
# GLOBAL IMPORTS
#=============================================================================================

import copy

from chemper.graphs.environment import ChemicalEnvironment as CE
from chemper.mol_toolkits import mol_toolkit
from chemper.chemper_utils import ImproperDict, ValenceDict, \
    get_typed_molecules, match_reference
from chemper.graphs.cluster_graph import ClusterGraph

import numpy as np


# =============================================================================================
# private subroutines
# =============================================================================================

[docs]def print_smirks(smirks_list):
    """
    Prints out the provided smirks list
    in a table like format with label | SMIRKS

    Parameters
    -----------
    smirks_list : list of tuples
        list in the form [ (label, SMIRKS), ...]
    """
    str_form = " {0:<20} | {1:} "

    print()
    print(str_form.format("Label", "SMIRKS"))
    print('='*80)
    for label, smirks in smirks_list:
        print(str_form.format(label, smirks))
        print('-'*80)
    print()


[docs]class ClusteringError(Exception):
    """
    Exception for when the SMIRKSifier is unable to create
    a list of SMIRKS to maintain the input clusters.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg


# =============================================================================================
# SMIRKSifier
# =============================================================================================

[docs]class SMIRKSifier(object):
    """
    Generates complex SMIRKS for a given cluster of substructures
    and then reduces the decorators in those smirks
    """
    def __init__(self, molecules, cluster_list,
                 max_layers=5, verbose=True, strict_smirks=True):
        """
        Parameters
        ----------
        molecules : list of Mols
            These can be chemper Mols or molecules from any supported toolkit
            (currently OpenEye or RDKit)

        cluster_list : list of labels and smirks_atom_lists
            For each label the user should provide a list tuples for atom indices
            in each molecule you want included in that cluster.

            For example, if you wanted all atoms with indices (0,1) and (1,2) to be in cluster 'c1'
            and atoms (2,3) in cluster 'c2' for each of two molecules then cluster_list would be

            [ ('c1', [ (0,1), (1,2) ], [ (0,1), (1,2) ]),
              ('c2', [ (2,3)        ], [ (2,3)        ]) ]

            To see an example of this in action checkout
            https://github.com/MobleyLab/chemper/tree/master/examples

        max_layers : int (optional)
            default = 5
            how many atoms away from the indexed atoms should
            we consider at the maximum

        verbose : boolean (optional)
            default = True
            If true information is printed to the command line during reducing

        strict_smirks : boolean (optional)
            default = True
            If False it will not raise an error when incapable of making SMIRKS
            This setting is not recommended unless you are a master user
            or developer trying to test current behavior.
            The variable SMIRKSifier.checks will tell you if the SMIRKS
            generation failed when strict_smirks = False
        """
        self.molecules = [mol_toolkit.Mol(m) for m in molecules]
        self.intermediate_smirks = dict()
        self.cluster_list = cluster_list
        self.verbose = verbose
        self.max_layers = max_layers
        self.strict_smirks = strict_smirks

        # determine the type of SMIRKS for symmetry in indices purposes
        # This is done by making a test SMIRKS
        graph = ClusterGraph(self.molecules, cluster_list[0][1], 0)
        test_smirks = graph.as_smirks(compress=True)
        env = CE(test_smirks)
        if env.get_type() is None:
            # corresponds to an unknown chemical pattern
            self.dict_type = dict
        elif env.get_type().lower() == 'impropertorsion':
            self.dict_type = ImproperDict
        else:
            self.dict_type = ValenceDict

        # Convert input "smirks_atom_list" into a dictionary with the form:
        # {mol_idx: {(atom indices): label, ...}, ... }
        self.cluster_dict = dict()
        self.ref_labels = set()
        self.total = 0
        # form of cluster_list is [(label, [for each mol [ (tuples of atom indices)] ) ]
        for label, mol_list in self.cluster_list:
            self.ref_labels.add(label)
            # [for each mol [ (tuples of atom indices)]
            for mol_idx, atom_indice_tuples in enumerate(mol_list):
                if mol_idx not in self.cluster_dict:
                    self.cluster_dict[mol_idx] = self.dict_type()
                for atom_tuple in atom_indice_tuples:
                    self.total += 1
                    self.cluster_dict[mol_idx][atom_tuple] = label

        # make SMIRKS patterns for input clusters
        self.current_smirks, self.layers = self.make_smirks()
        if self.verbose: print_smirks(self.current_smirks)
        # check SMIRKS and save the matches to input clusters
        self.type_matches, self.checks = self.types_match_reference()

        if not self.checks:
            msg = """
                      SMIRKSifier was not able to create SMIRKS for the provided
                      clusters with %i layers. Try increasing the number of layers
                      or changing your clusters
                      """ % self.max_layers
            if self.strict_smirks:
                raise ClusteringError(msg)
            else:
                print("WARNING!", msg)

[docs]    def make_smirks(self):
        """
        Create a list of SMIRKS patterns for the input clusters.
        This includes a determining how far away from the indexed atom should
        be included in the SMIRKS (or the number of max_layers is reached)

        Returns
        -------
        smirks_list : list of tuples
            list of tuples in the form (label, smirks)
        layers : int
            number of layers actually used to specify the set clusters
        """
        layers = 0
        # try generating smirks with no layers
        smirks_list = self._make_cluster_graphs(layers)
        # store intermediate smirks and check them
        self.intermediate_smirks[layers] = smirks_list
        _, checks = self.types_match_reference(current_types=smirks_list)

        while not checks and (layers < self.max_layers):
            layers += 1
            smirks_list = self._make_cluster_graphs(layers)
            # store intermediate smirks
            self.intermediate_smirks[layers] = smirks_list
            # check current smirks patterns
            _, checks = self.types_match_reference(current_types=smirks_list)

        return smirks_list, layers

    def _make_cluster_graphs(self, layers):
        """
        Creates a list of SMIRKS using the stored
        molecules and clusters with the specified number
        of layers (atoms away from the indexed atoms)

        Parameters
        -----------
        layers : int
            number of layers (atoms away from indexed atoms) to
            include in this round of graphs

        Returns
        --------
        smirks_list : list of two tuples
            SMIRKS list in the form [ (label: SMIRKS), ...]
        """
        smirks_list = list()

        # loop through the list of fragment clusters
        for label, smirks_atom_list in self.cluster_list:
            # make a ClusterGraph for that label
            graph = ClusterGraph(self.molecules, smirks_atom_list, layers)

            # extract and save the SMIRKS for the cluster
            smirks = graph.as_smirks(compress=True)
            smirks_list.append(('zz_'+str(label), smirks))

        return smirks_list

[docs]    def types_match_reference(self, current_types=None):
        """
        Determine best match for each parameter with reference types

        Parameters
        ----------
        current_types : list of tuples with form [ (label, smirks), ]

        Returns
        -------
        type_matches : list of tuples (current_label, reference_label, counts)
            pair of current and reference labels with the number of fragments that match it
        """
        if current_types is None:
            current_types = self.current_smirks

        if len(current_types) != len(self.ref_labels):
            return set(), False

        current_assignments = get_typed_molecules(current_types, self.molecules)

        type_matches, checks_pass = match_reference(current_assignments, self.cluster_dict)
        return type_matches, checks_pass

[docs]    def reduce(self, max_its=1000, verbose=None):
        """
        Reduce the SMIRKS decorators for a number of iterations

        Parameters
        ----------
        max_its : int
            default = 1000
            The specified number of iterations
        verbose : boolean
            default = None
            will set the verboseness while running
            (if None, the current verbose variable will be used)

        Returns
        ----------
        final_smirks : list of tuples
            list of final smirks patterns after reducing in the form
            [(label, smirks)]
        """
        if not self.checks:
            print("Cannot reduce since unable to create reliable SMIRKS")
            return self.current_smirks

        red = Reducer(self.current_smirks, self.molecules, verbose)
        self.current_smirks = red.run(max_its)
        return self.current_smirks


# ==================================================================================
# Reducer class
# ==================================================================================

[docs]class Reducer():
    """
    Reducer starts with any list of SMIRKS and removes unnecessary decorators
    while maintaining typing on input molecules.
    This was created to be used as a part of the SMIRKSifier.reduce function.
    However, if you have complex SMIRKS and a list of molecules you can
    also reduce those patterns independently.

    Attributes
    ----------
    current_smirks : list of tuples
                    current SMIRKS patterns in the form (label, smirks)
    mols : list of chemper molecules
          molecules being used to reduce the input SMIRKS
    cluster_dict : dictionary
                  Dictionary specifying typing using current SMIRKS in the form:
                  {mol_idx:
                        { (tuple of atom indices): label } }
    """
    def __init__(self, smirks_list, mols, verbose=False):
        """
        Parameters
        ----------
        smirks_list : list of tuples
            set of SMIRKS patterns in the form (label, smirks)
        mols : list of molecules
            Any chemper compatible molecules accepted (ChemPer, OpenEye, or RDKit)
        """
        self.verbose = verbose
        self.current_smirks = copy.deepcopy(smirks_list)

        # make reference clusters
        ref_smirks = [('ref_%s' % l, smirks) for l, smirks in smirks_list]
        self.molecules = [mol_toolkit.Mol(m) for m in mols]
        self.cluster_dict = get_typed_molecules(ref_smirks, self.molecules)

[docs]    def remove_one_sub_dec(self, input_ors, ref_idx):
        """
        Remove one OR decorator from the specified index
        # i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#6, [+0]), (#7, [X3]) ]

        Parameters
        -----------
        input_ors : list of two tuples
            OR decorators in the form from ChemicalEnvironments
            that is [ (base, [decorators, ]), ... ]
        ref_idx : int
            The index from this list to use when removing one sub-decorator

        Returns
        --------
        new_ors : list of two tuples
            New OR decorators
        """
        new_ors = copy.deepcopy(input_ors)
        # remove one or decorator from a single type
        ref_or = new_ors[ref_idx]
        decs = ref_or[1]
        decs.remove(np.random.choice(decs))
        new_ors[ref_idx] = (ref_or[0], decs)
        return new_ors

[docs]    def remove_ref_sub_decs(self, input_ors, ref_idx):
        """
        Remove all of the ORdecorators at the specified index
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#6, []), (#7, [X3]) ]

        Parameters
        -----------
        input_ors : list of two tuples
            OR decorators in the form from ChemicalEnvironments
            that is [ (base, [decorators, ]), ... ]
        ref_idx : int
            The index from this list to use when removing one set of sub-decorators

        Returns
        --------
        new_ors : list of two tuples
            New OR decorators
        """
        new_ors = copy.deepcopy(input_ors)
        # remove all decorators on one OR type
        ref_or = new_ors[ref_idx]
        new_ors[ref_idx] = (ref_or[0], list())
        return new_ors

[docs]    def remove_ref(self, input_ors, ref_idx):
        """
        Remove the decorator at the referenced index
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#7, [X3])]

        Parameters
        -----------
        input_ors : list of two tuples
            OR decorators in the form from ChemicalEnvironments
            that is [ (base, [decorators, ]), ... ]
        ref_idx : int
            The OR decorators at ref_idx will be removed entirely

        Returns
        --------
        new_ors : list of two tuples
            New OR decorators
        """
        new_ors = copy.deepcopy(input_ors)
        # remove the single OR type at or_idx
        ref = new_ors[ref_idx]
        new_ors.remove(ref)
        return new_ors

[docs]    def remove_all_bases(self, input_ors):
        """
        convert all bases to [*]
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(*, [X4, +0]), (*, [X3]) ]

        Parameters
        -----------
        input_ors : list of two tuples
            OR decorators in the form from ChemicalEnvironments
            that is [ (base, [decorators, ]), ... ]

        Returns
        --------
        new_ors : list of two tuples
            New OR decorators
        """
        new_all_ors = [('*', d) for b, d in input_ors]
        return new_all_ors

[docs]    def remove_all_dec_type(self, input_ors):
        """
        remove all decorators of the same type, like all 'X' decorators
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#6, [+0]), (#7, []) ]

        Parameters
        -----------
        input_ors : list of two tuples
            OR decorators in the form from ChemicalEnvironments
            that is [ (base, [decorators, ]), ... ]

        Returns
        --------
        new_ors : list of two tuples
            New OR decorators
        """
        all_decs = set([d for b,decs in input_ors for d in decs])
        remove_dec = np.random.choice(list(all_decs))

        # we start by defining a criteria for when the decorator
        # should not be removed.
        def criteria(x): return remove_dec[0] not in x

        if '+' in remove_dec or '-' in remove_dec:
            def criteria(x): return not ('+' in x or '-' in x)
        elif remove_dec in ['a', 'A']:
            def criteria(x): return x.lower() != 'a'
        elif 'r' in remove_dec:
            def criteria(x): return 'r' not in x

        new_ors = list()
        for b, decs in input_ors:
            new_decs = [d for d in decs if criteria(d)]
            new_ors.append((b, new_decs))
        return new_ors

[docs]    def remove_or_atom(self, input_all_ors, or_idx):
        """
        makes specific types of changes based on atom OR decorators

        Parameters
        ----------
        input_all_ors: list of OR decorators
            [ (base, [decs]), ...]
        or_idx: index that should be used to guide changes

        Returns
        --------
        new_ors : list of two tuples
            new or decorators
        """
        ref_or = input_all_ors[or_idx]

        # Start by checking what removal choices are available with the
        # current list of OR decorators
        # ---------------------------------------------------------------------
        # one choice is to remove all decorators (0)
        # it is always an option
        choices = ['all']
        # are there non-generic bases? If so removing all bases is an option
        if len(set([b for b,ds in input_all_ors])) > 1:
            choices.append('gen_base')
        # there are two options if the ref type has OR decorators
        if len(ref_or[1]) > 0:
            # remove all OR decorators (#6, [X4, +0]) --> (#6, [])
            choices.append('all_ref_decs')
            if len(ref_or[1]) > 1:
                # remove one OR decorator (#6, [X4, +0]) --> (#6, [+0])
                choices.append('one_dec')
        # if there are more than one or type
        if len(input_all_ors) > 1:
            # you can remove just one ORtype (the reference)
            choices.append('remove_ref')
            # check that there are decorators
            all_decs = set([d for b, decs in input_all_ors for d in decs])
            if len(all_decs) > 0:
                # you can remove 1 type of decorator, i.e. all 'Xn' decorators
                choices.append('all_one_dec')
        # ---------------------------------------------------------------------

        # Make a random choice from the available change options
        change = np.random.choice(choices)

        # Based on the option chosen call the appropriate method
        if change == 'all':
            # remove ALL OR decorators
            # i.e.[(  # 6, [X4, +0]), (#7, [X3]) ] --> []
            return list()

        if change == 'remove_ref':
            return self.remove_ref(input_all_ors, or_idx)

        if change == 'all_ref_decs':
            return self.remove_ref_sub_decs(input_all_ors, or_idx)

        if change == 'one_dec':
            return self.remove_one_sub_dec(input_all_ors, or_idx)

        if change == 'gen_base':
            return self.remove_all_bases(input_all_ors)

        # change = 'all_one_dec'
        return self.remove_all_dec_type(input_all_ors)

[docs]    def remove_or(self, input_all_ors, is_bond=False):
        """
        Changes the OR decorators by removing some of them

        Parameters
        -----------
        input_all_ors : list of tuples
            these are the OR decorators for an atom or bond
            from a ChemicalEnvironment
        is_bond : boolean
            are these decorators from from a bond (False for atom)

        Returns
        --------
        new_ors : list of two tuples
            new OR decorators
        """
        if len(input_all_ors) == 0:
            return input_all_ors, False

        # chose a set of or decorators to change
        # these come in the form [base, (decorators)]
        all_ors = copy.deepcopy(input_all_ors)
        or_idx = np.random.randint(len(all_ors))

        # atoms have more OR decorators and therefore more options for
        # how they can be removed
        if not is_bond:
            return self.remove_or_atom(all_ors, or_idx), True

        # For a bond, either one OR type is removed
        # or they are all removed.
        if np.random.rand() > 0.5:
            # remove just one or type
            return self.remove_ref(all_ors, or_idx), True
        # remove all ors
        return list(), True

[docs]    def remove_and(self, input_all_ands):
        """
        removes a decorator that is AND'd in the original SMIRKS

        Parameters
        -----------
        input_all_ands : list
            List of AND decorators

        Returns
        --------
        new_ands : list
            List of new AND decorators
        """
        # if there are no ands return with no changes
        if len(input_all_ands) == 0:
            return input_all_ands, False

        if np.random.rand() > 0.5:
            # otherwise randomly remove an AND decorator
            all_ands = copy.deepcopy(input_all_ands)
            all_ands.remove(np.random.choice(all_ands))
        else:
            all_ands = list()
        return all_ands, True

    def _get_item_and_remove_options(self, env):
        """
        This function chooses and Atom or Bond
        which will then have some of its decorators removed.
        It also determines what part of the component can be
        removed: OR decorators(0), AND decorators(1), the whole atom(2)

        Parameters
        ----------
        env: ChemicalEnvironment

        Returns
        -------
        item: an Atom or Bond from the Chemical Environment
        dec_opts: list
            0 = has OR decorators to remove
            1 = has AND decorators to remove
            2 = is an atom that could be removed
        """
        items = list()
        odds = list()
        for a_b in env.get_atoms() + env.get_bonds():
            count = len(a_b.and_types)
            for o in a_b.or_types:
                if o[0] != '*':
                    count += 1
                count += len(o[1])

            # a wild card, atom with no index should be considered ([*])
            # should also be on this list so it can be removed
            if isinstance(a_b, CE.Atom) and not isinstance(a_b, CE.Bond) and env.is_unindexed(a_b):
                count += 1

            items.append(a_b)
            odds.append(count)

        # normalize odds to weights
        odds = np.array(odds)

        # if the odds for changing all atoms and bonds is zero return None
        if odds.sum() == 0:
            return None, list()

        weights = odds / odds.sum()

        # choose an atom or bond with the probabilities:
        item = np.random.choice(items, p=weights)
        dec_opts = list()
        if len(item.or_types) > 0:
            dec_opts.append('remove_ors')
        if len(item.and_types) > 0:
            dec_opts.append('remove_ands')

        if not isinstance(item, CE.Bond): # then it is an atom
            if env.get_valence(item) == 1 and env.is_unindexed(item):
                dec_opts.append('remove_atom')

        return item, dec_opts

[docs]    def remove_decorator(self, smirks):
        """
        Chose an atom or bond in the input smirks pattern
        and then remove one decorator from it.

        Parameters
        -----------
        smirks : str
            A SMIRKS string which should be reduced

        Returns
        --------
        new_smirks : str
            A new SMIRKS pattern
        is_changed : bool
            True if some of the decorators were successfully removed
        """
        env = CE(smirks)
        sub, dec_opts = self._get_item_and_remove_options(env)

        # note this should be impossible
        if sub is None or len(dec_opts) == 0:
            return smirks, False

        change = np.random.choice(dec_opts)
        if change == 'remove_ors':
            new_or_types, changed = self.remove_or(sub.or_types, isinstance(sub, CE.Bond))
            if not changed:
                return smirks, False
            sub.or_types = new_or_types
        elif change == 'remove_ands':
            new_and_types, changed = self.remove_and(sub.and_types)
            if not changed:
                return smirks, False
            sub.and_types = new_and_types
        else: # change == 'remove_atom'
            remove = env.remove_atom(sub)
            if not remove:
                return smirks, False

        return env.as_smirks(), True

[docs]    def run(self, max_its=1000, verbose=None):
        """
        Reduce the SMIRKS decorators for a number of iterations

        Parameters
        ----------
        max_its : int
            The specified number of iterations
        verbose : boolean
            will set the verboseness while running
            (if None, the current verbose variable will be used)

        Returns
        ----------
        final_smirks : list of tuples
            list of final smirks patterns after reducing in the form
            [(label, smirks)]
        """
        temp_verbose = self.verbose
        if verbose is not None:
            self.verbose = verbose

        for it in range(max_its):
            if self.verbose: print("Iteration: ", it)

            proposed_smirks = copy.deepcopy(self.current_smirks)
            # chose a SMIRKS to change
            change_idx = np.random.randint(len(proposed_smirks))
            change_entry = proposed_smirks[change_idx]

            # generate new smirks
            new_smirks, changed = self.remove_decorator(change_entry[1])
            if self.verbose: print("Attempting to change SMIRKS #%i\n%s  -->  %s"\
                                   % (change_idx, change_entry[1], new_smirks))
            if not changed:
                if self.verbose:
                    print("Rejected!\nNo change made to SMIRKS\n%s" % change_entry[1])
                    print('-'*90)
                continue

            # update proposed list and check if it still matches the reference
            proposed_smirks[change_idx] = (change_entry[0], new_smirks)

            current_assignments = get_typed_molecules(proposed_smirks, self.molecules)
            _, proposed_checks = match_reference(current_assignments, self.cluster_dict)

            if proposed_checks:
                if self.verbose: print("Accepted! ")
                self.current_smirks = copy.deepcopy(proposed_smirks)
            else:
                if self.verbose: print("Rejected!\n proposed SMIRKS changed the way fragments are clustered")
            if self.verbose: print('-'*90)

        if self.verbose: print_smirks(self.current_smirks)
        self.verbose = temp_verbose

        return self.current_smirks