Source code for chemper.graphs.cluster_graph

"""
cluster_graph.py

ClusterGraph are a class for tracking all possible smirks decorators in a group
(or cluster) of molecular fragments. Moving forward these will be used to
find the minimum number of smirks decorators that are required to have a
set of smirks patterns that maintain a given clustering of fragments.
"""

import networkx as nx
from functools import total_ordering
from chemper.graphs.single_graph import SingleGraph
from chemper.graphs.environment import ChemicalEnvironment as CE
from chemper.mol_toolkits import mol_toolkit


[docs]@total_ordering
class ClusterGraph(SingleGraph):
    """
    ChemPerGraphs are a graph based class for storing atom and bond information.
    They use the chemper.mol_toolkits Atoms, Bonds, and Mols
    """
[docs]    @total_ordering
    class AtomStorage:
        """
        AtomStorage tracks information about an atom
        """
        def __init__(self, atoms=None, label=None):
            """
            Parameters
            ----------
            atoms : ChemPer Atom or list of ChemPer Atoms
                this is one or more atoms whose information should be stored
            label : int
                SMIRKS index (:n) for writing SMIRKS
                if the value is less than zero it is used for storage purposes
                only as SMIRKS can only be written with positive integer indices
            """
            self.decorators = set()
            if atoms is not None:
                # check if this is a single atom
                if 'Atom' in str(type(atoms)):
                    atoms = [atoms]

                # otherwise it should be iterable
                for atom in atoms:
                    self.decorators.add(self.make_atom_decorators(atom))
            self.label = label

        def __lt__(self, other):
            """
            Overrides the default implementation
            This method was primarily written for making SMIRKS patterns predictable.
            If atoms are sortable, then the SMIRKS patterns are always the same making
            tests easier to write. However, the specific sorting was created to also make SMIRKS
            output as human readable as possible, that is to at least make it easier for a
            human to see how the indexed atoms are related to each other.
            It is typically easier for humans to read SMILES/SMARTS/SMIRKS with less branching (indicated with ()).

            For example in:
            [C:1]([H])([H])~[N:2]([C])~[O:3]
            it is easier to see that the atoms C~N~O are connected in a "line" instead of:
            [C:1]([N:2]([O:3])[C])([H])[H]
            which is equivalent, but with all the () it is hard for a human to read the branching

            Parameters
            ----------
            other : AtomStorage

            Returns
            -------
            is_less_than : boolean
                self is less than other
            """
            # if either smirks index is None, then you can't directly compare
            # make a temporary index that is negative if it was None
            self_index = self.label if self.label is not None else -1000
            other_index = other.label if other.label is not None else -1000
            # if either index is greater than 0, the one that is largest should go at the end of the list
            if self_index > 0 or other_index > 0:
                return self_index < other_index

            # Both SMIRKS indices are not positive or None so compare the SMIRKS patterns instead
            return self.as_smirks() < other.as_smirks()

        def __eq__(self, other): return self.as_smirks() == other.as_smirks() and self.label == other.label

        def __hash__(self): return id(self)

        def __str__(self): return self.as_smirks()

[docs]        def make_atom_decorators(self, atom):
            """
            extract information from a ChemPer Atom that would be useful in a smirks

            parameters
            ----------
            atom : ChemPer atom object

            returns
            -------
            decorators : tuple of str
                tuple of all possible decorators for this atom
            """
            aromatic = 'a' if atom.is_aromatic() else 'A'
            charge = atom.formal_charge()
            if charge >= 0:
                charge = '+%i' % charge
            else:
                charge = '%i' % charge
            min_ring_size = atom.min_ring_size()
            if min_ring_size == 0:
                ring = '!r'
            else:
                ring = 'r%i' % min_ring_size

            return (
                '#%i' % atom.atomic_number(),
                'H%i' % atom.hydrogen_count(),
                'X%i' % atom.connectivity(),
                'x%i' % atom.ring_connectivity(),
                ring,
                charge,
                aromatic,
                )

[docs]        def as_smirks(self, compress=False):
            """
            Parameters
            ----------
            compress : boolean
                should decorators common to all sets be combined
                for example '#6X4,#7X3;+0!r...'

            Returns
            -------
            smirks : str
                how this atom would be represented in a SMIRKS string
                with the minimal combination of SMIRKS decorators
            """
            if len(self.decorators) == 0:
                if self.label is None or self.label <= 0:
                    return '[*]'
                return '[*:%i]' % self.label

            if compress and len(self.decorators) > 1:
                base_smirks = self._compress_smirks()
            else:
                base_smirks = ','.join(sorted([''.join(l) for l in self.decorators]))

            if self.label is None or self.label <= 0:
                return '[%s]' % base_smirks

            return '[%s:%i]' % (base_smirks, self.label)

        def _sort_decs(self, dec_set, wild=True):
            """
            Parameters
            ----------
            dec_set : list like
                single set of atom decorators
            wild : boolean
                insert * for decorator lists with no #n decorator

            Returns
            -------
            sorted_dec_set : list
                same set of decorators sorted with atomic number or * first
            """
            temp_dec_set = list(dec_set)
            atom_num = [i for i in temp_dec_set if '#' in i]
            if len(atom_num) == 0 and wild:
                atom_num = ["*"]

            temp_dec_set = set(temp_dec_set) - set(atom_num)

            aro = [i for i in temp_dec_set if 'a' in i.lower()]
            temp_dec_set = set(temp_dec_set) - set(aro)

            return atom_num + sorted(list(temp_dec_set)) + aro

        def _compress_smirks(self):
            """
            Returns
            -------
            smirks : str
                This SMIRKS is compressed with all common decorators and'd to
                the end of the pattern
            """
            set_decs = [set(d) for d in self.decorators]
            ands = set_decs[0]

            for d_set in set_decs:
                ands = ands & d_set

            # check for atomic number in the "ands"
            atomic = [a for a in ands if '#' in a]
            if len(atomic) == 1:
                # remove from and
                ands.remove(atomic[0])
                # put in all sets
                for s in set_decs:
                    s.add(atomic[0])

            or_sets = [self._sort_decs(d.difference(ands)) for d in set_decs]
            ors = [''.join(o) for o in or_sets]

            # add commas between ors
            base = ','.join(sorted(ors))
            # add and decorators
            if len(ands) > 0:
                base += ';'+ ';'.join(self._sort_decs(ands, wild=False))
            return base

[docs]        def add_atom(self, atom):
            """
            Expand current AtomStorage by adding information about
            a new ChemPer Atom

            Parameters
            ----------
            atom : ChemPer Atom
            """
            self.decorators.add(self.make_atom_decorators(atom))

[docs]        def compare_atom(self, atom):
            """
            Compares decorators in this AtomStorage with the provided
            ChemPer atom. The decorators are compared separately and
            the highest score is returned. For example,
            if this storage had two sets of decorators
                - #7H1X3x0!r+0A
                - #6H1X4x0!r+0A
            and the input atom would have the decorators:
                - #6H1X3x2!r+0a

            The score is calculated by finding the number of decorators
            in common which would be
                - #7H1X3x0!r+0A and #6H1X3x2r6+0a
                    have 3 decorators in common (H1,X3,+0)
                - #6H1X4x0!r+0A and #6H1X3x2r6+0a
                    also have 3 decorators in common (#6, H1, +0)
            However, we weight atoms with the same atomic number as more
            similar by dividing the score by 10 if the atomic numbers do
            not agree. Therefore the final scores will be:
                - 0.3 for #7H1X3x0!r+0A
                - 3 for #6H1X4x0!r+0A

            The highest score for any set of decorators is returned so
            3 is the returned score in this example.

            Parameters
            ----------
            atom : ChemPer Atom

            Returns
            -------
            score : float
                A score describing how similar the input atom is to any set of
                decorators currently in this storage, based on its SMIRKS decorators.
                This score ranges from 0 to 7. 7 comes from the number of decorators
                on any atom, if this atom matches perfectly with one of the
                current decorator sets then 7 decorators agree.However, if the atomic
                number doesn't agree, then that set of decorators is considered
                less ideal, thus if the atomic numbers don't agree, then the score
                is given by the number other decorators divided by 10.
                If the current storage is empty, then the score is given as 7
                since any atom matches a wildcard atom.
            """
            # If decorators is empty (no known atom information, return 7 (current max)
            if len(self.decorators) == 0:
                return 7

            score = 0
            decs = self.make_atom_decorators(atom)

            for ref in self.decorators:
                # get atomic number for this set of decorators
                current = len(set(ref) & set(decs))

                # if atomic numbers don't agree, get the number of common decorators / 10
                # if there are no matching atomic numbers, priority should still be given
                # when the current atom matches stored decorators most closely
                if ref[0] != decs[0]:
                    current = current / 10.0

                if current > score:
                    score = current

            return score

[docs]    @total_ordering
    class BondStorage:
        """
        BondStorage tracks information about a bond
        """
        def __init__(self, bonds=None, label=None):
            """
            Parameters
            ----------
            bonds : list of ChemPer Bonds
                this is one or more bonds whose information should be stored
            label : a label for the object, it can be anything
                unlike atoms, bonds in smirks don't have labels
                so this is only used for labeling the object if wanted
            """
            self.order = set()
            self.ring = set()
            self.order_dict = {1:'-', 1.5:':', 2:'=', 3:'#'}
            if bonds is not None:
                if 'Bond' in str(type(bonds)):
                    bonds = [bonds]
                for bond in bonds:
                    self.order.add(bond.get_order())
                    self.ring.add(bond.is_ring())

            self.label = label

        def __str__(self): return self.as_smirks()

        def __lt__(self, other):
            if self.as_smirks() == other.as_smirks():
                return self.label < other.label
            return self.as_smirks() < other.as_smirks()

        def __eq__(self, other):
            return self.label == other.label and self.as_smirks() == other.as__smirks()

        def __hash__(self): return id(self)
        
[docs]        def as_smirks(self):
            """
            Returns
            -------
            smirks : str
                how this bond would be represented in a SMIRKS string
                using only the required number of
            """
            if len(self.order) == 0:
                order = '~'
            else:
                order = ','.join([self.order_dict.get(o, '~') for o in sorted(list(self.order))])

            # the ring set has booleans, if the length of the set is 1 then only ring (@) or non-ring (!@)
            # bonds haven been added to this storage and we AND that decorator to the end of the bond
            if len(self.ring) == 1:
                if list(self.ring)[0]:
                    return order+';@'
                else:
                    return order+';!@'

            return order

[docs]        def add_bond(self, bond):
            """
            Expand current BondStorage by adding information about
            a new ChemPer Bond

            Parameters
            ----------
            bond : ChemPer Bond
            """
            self.order.add(bond.get_order())
            self.ring.add(bond.is_ring())

[docs]        def compare_bond(self, bond):
            """

            Parameters
            ----------
            bond : ChemPer Bond
                bond you want to compare to the current storage

            Returns
            -------
            score : int (0,1,2)
                A score describing how similar the input bond is to any set of decorators currently
                in this storage, based on its SMIRKS decorators.

                1 for the bond order +
                1 base on if this is a ring bond
            """
            score = 0
            if bond.get_order() in self.order or len(self.order) == 0:
                score += 1

            # the ring set has booleans, if the length of the set is 1 then only ring or non-ring
            # bonds haven been added to this storage. That is the only time the ring contributes to the score
            if len(self.ring) == 1 and list(self.ring)[0] == bond.is_ring():
                score += 1

            return score

    # Initiate ClusterGraph
    def __init__(self, mols=None, smirks_atoms_lists=None, layers=0):
        """
        Initialize a SingleGraph from a molecule and list of indexed atoms

        For the example, imagine we wanted to get a SMIRKS that
        would match the carbon-carbon bonds in ethane and propane.
        The carbon atoms are have indices (0,1) in ethane and (0,1) and (1,2)
        in propane. For this example, we will assume we also want to include
        the atoms one bond away from the indexed atoms (1 layer away).

        Parameters
        ----------
        mols : list of molecules (optional)
            default = None (makes an empty graph)
            these can be ChemPer Mols or molecule objects from
            any supported toolkit (currently OpenEye or RDKit)

        smirks_atoms_lists : list of list of tuples (optional)
            default = None (must be paired with mols=None)
            There is a list of tuples for each molecule, where each tuple specifies
            a molecular fragment using the atoms' indices.
            In the ethane and propane example, the `smirks_atoms_lists` would be
                [ [ (0,1) ], [ (0,1), (1,2) ] ]
            with one carbon-carbon bond in ethane and two carbon-carbon bonds in propane

        layers : int (optional)
            default = 0
            layers specifies how many bonds away from the indexed atoms should be included in the
            the SMIRKS patterns.
            Instead of an int, the string 'all' would lead to all atoms in the molecules
            being included in the SMIRKS (not recommended)
        """
        SingleGraph.__init__(self)

        self.mols = list()
        self.smirks_atoms_lists = list()
        self.layers = layers
        self._symmetry_funct = self._no_symmetry

        if mols is not None:
            temp_mols = [mol_toolkit.Mol(m) for m in mols]
            if len(temp_mols) != len(smirks_atoms_lists):
                raise Exception('Number of molecules and smirks dictionaries should be equal')

            for idx, mol in enumerate(temp_mols):
                self.add_mol(mol, smirks_atoms_lists[idx])

[docs]    def as_smirks(self, compress=False):
        """
        Parameters
        ----------
        compress : boolean
            returns the shorter version of atom SMIRKS patterns
            that is atoms have decorators "anded" to the end rather than listed
            in each set that are OR'd together.
            For example "[#6AH2X3x0!r+0,#6AH1X3x0!r+0:1]-;!@[#1AH0X1x0!r+0]"
            compresses to: "[#6H2,#6H1;AX3x0!r+0:1]-;!@[#1AH0X1x0!r+0]"

        Returns
        -------
        SMIRKS : str
            a SMIRKS string matching the exact atom and bond information stored
        """
        # The atom compression is different, but otherwise this is the
        # same function as the parent class (SingleGraph)
        return SingleGraph.as_smirks(self, compress)

[docs]    def get_symmetry_funct(self, sym_label):
        """
        Determine the symmetry function that should be used
        when adding atoms to this graph.

        For example, imagine a user is trying to make a
        SMIRKS for all of the C-H bonds in methane. In most
        toolkits the index for the carbon is 0 and the hydrogens are 1,2,3,4.
        The final SMIRKS should have the form [#6AH4X4x0!r+0:1]-;!@[#1AH0X1x0!r+0]
        no matter what order the atoms are input into ClusterGraph.
        So if the user provides (0,1), (0,2), (3,0), (4,0) ClusterGraph
        should figure out that the carbons in (3,0) and (4,0) should be in
        the atom index :1 place like they were in the first set of atoms.

        Bond atoms in (1,2) or (2,1) are symmetric, for angles its (1,2,3) or (3,2,1)
        for proper torsions (1,2,3,4) or (4,3,2,1) and for
        improper torsions (1,2,3,4), (3,2,1,4), (4,2,1,3).
        For any other fragment type the atoms will be added to the graph in
        the order they are provided since the symmetry function is unknown.

        # TODO: In theory you could generalize this for generic linear fragments
        # where those with an odd number of atoms behave like angles and an
        # even number behave like proper torsions, however I think that is
        # going to be outside the scope of ChemPer for the foreseeable future.

        Parameters
        ----------
        sym_label : str or None
            type of symmetry, options which will change the way symmetry is
            handled in the graph are "bond", "angle", "ProperTorsion", and "ImproperTorsion"

        Returns
        -------
        symmetry_funct : function
            returns the function that should be used to handle the appropriate symmetry
        """
        if sym_label is None:
            return self._no_symmetry
        if sym_label.lower() == 'bond':
            return self._bond_symmetry
        if sym_label.lower() == 'angle':
            return self._angle_symmetry
        if sym_label.lower() == 'propertorsion':
            return self._proper_torsion_symmetry
        if sym_label.lower() == 'impropertorsion':
            return self._improper_torsion_symmetry
        return self._no_symmetry

[docs]    def add_mol(self, input_mol, smirks_atoms_list):
        """
        Expand the information in this graph by adding a new molecule

        Parameters
        ----------
        input_mol : ChemPer Mol
        smirks_atoms_list : list of tuples
            This is a list of tuples with atom indices [ (indices), ... ]
        """
        mol = mol_toolkit.Mol(input_mol)

        if len(smirks_atoms_list) == 0:
            return

        if len(self.mols) == 0:
            self._add_first_smirks_atoms(mol, smirks_atoms_list[0])
            self._symmetry_funct = self.get_symmetry_funct(CE(self.as_smirks()).get_type())
            self._add_mol(mol, smirks_atoms_list[1:])
        else:
            self._add_mol(mol, smirks_atoms_list)

        self.mols.append(mol)
        self.smirks_atoms_lists.append(smirks_atoms_list)

    def _add_first_smirks_atoms(self, mol, smirks_atoms):
        """
        private function for adding the first molecule to an empty ClusterGraph
        add_mol calls this if the graph is empty

        Parameters
        ----------
        mol : ChemPer Mol
        smirks_atoms : tuple
            tuple of atom indices for the first atoms to add to the graph. i.e. (0, 1)
        """
        atom_dict = dict()
        for key, atom_index in enumerate(smirks_atoms, 1):
            atom_dict[atom_index] = key

            atom1 = mol.get_atom_by_index(atom_index)
            new_atom_storage = self.AtomStorage([atom1], key)
            self._graph.add_node(new_atom_storage)
            self.atom_by_label[key] = new_atom_storage

            # Check for bonded atoms already in the graph
            for neighbor_key in range(len(smirks_atoms), 0, -1):
                if neighbor_key not in self.atom_by_label:
                    continue

                # check if atoms are already connected on the graph
                neighbor_storage = self.atom_by_label[neighbor_key]
                if nx.has_path(self._graph, new_atom_storage, neighbor_storage):
                    continue

                # check if atoms are connected in the molecule
                atom2 = mol.get_atom_by_index(smirks_atoms[neighbor_key-1])
                bond = mol.get_bond_by_atoms(atom1, atom2)

                if bond is not None: # Atoms are connected add edge
                    bond_smirks = tuple(sorted([neighbor_key, key]))
                    bond_storage = self.BondStorage([bond], bond_smirks)
                    self.bond_by_label[bond_smirks] = bond_storage
                    self._graph.add_edge(new_atom_storage,
                                         neighbor_storage,
                                         bond=bond_storage)

        # for each indexed atoms add unindexed atoms for the number of specified layers
        for atom_label, atom_index in enumerate(smirks_atoms, 1):
            atom = mol.get_atom_by_index(atom_index)
            storage = self.atom_by_label[atom_label]
            self._add_layers(mol, atom, storage, self.layers, atom_dict, is_first=True)

    def _add_layers(self, mol, atom, storage, layers, idx_dict, is_first=False):
        """
        Parameters
        ----------
        mol : ChemPer Mol
            molecule containing provided atom
        atom : ChemPer Atom
        storage: AtomStorage
            corresponding to the ChemPer Atom provided
        layers : int or 'all'
            number of layers left to add (or all)
        idx_dict : dict
            form {atom index: label} for this smirks_list in this molecule
        """
        # if layers is 0 there are no more atoms to add so end the recursion
        if layers == 0:
            return

        # find atom neighbors that are not already included in SMIRKS indexed atoms
        atom_neighbors = [(a, mol.get_bond_by_atoms(a,atom)) for a in atom.get_neighbors() \
                          if a.get_index() not in idx_dict]

        # get the smirks indices already added to the storage
        # This includes all previous layers since the idx_dict is updated as you go
        storage_labels = [e for k,e in idx_dict.items()]

        # similar to atoms find neighbors already in the graph that haven't already been used
        storage_neighbors = [(s, self.get_connecting_bond(s, storage)) for s in self.get_neighbors(storage) \
                             if s.label not in storage_labels]

        new_pairs = list()
        # if this is the first set of atoms added, just make a new
        # storage for all neighboring atoms
        if is_first:
            min_smirks = storage.label * 10
            if min_smirks > 0:
                min_smirks = min_smirks * -1

            for a, b in atom_neighbors:
                new_bond_smirks = tuple(sorted([storage.label, min_smirks]))

                adding_new_storage = self.add_atom(a,b,storage,
                                                   min_smirks, new_bond_smirks)

                idx_dict[a.get_index()] = min_smirks
                self.atom_by_label[min_smirks] = adding_new_storage
                min_smirks -= 1
                new_pairs.append((a, adding_new_storage))

        else: # this isn't the first set of atoms so you need to
            # pair up the atoms with their storage
            pairs = self.find_pairs(atom_neighbors, storage_neighbors)
            for new_atom, new_bond, new_storage_atom, new_storage_bond in pairs:
                # if no storage is paired to this atom skip it
                if new_storage_atom is None:
                    continue
                # if there is no atom paired to a storage remove that branch
                if new_atom is None:
                    self.remove_atom(new_storage_atom)
                    continue
                # add atom and bond information to the storage
                new_storage_atom.add_atom(new_atom)
                new_storage_bond.add_bond(new_bond)
                new_pairs.append((new_atom, new_storage_atom))
                idx_dict[new_atom.get_index()] = new_storage_atom.label

        # Repeat for the extra layers
        if layers == 'all':
            new_layers = 'all'
        else:
            new_layers = layers - 1
            if new_layers == 0:
                return

        for new_atom, new_storage in new_pairs:
            self._add_layers(mol, new_atom, new_storage, new_layers, idx_dict, is_first)

[docs]    def find_pairs(self, atoms_and_bonds, storages):
        """
        Find pairs is used to determine which current AtomStorage from storages
        atoms should be paired with.
        This function takes advantage of the maximum scoring function in networkx
        to find the pairing with the highest "score".
        Scores are determined using functions in the atom and bond storage objects
        that compare those storages to the new atom or bond.

        If there are less atoms than storages then the atoms with the lowest pair are
        assigned a None pairing.

        Parameters
        ----------
        atoms_and_bonds : list of tuples in form (ChemPer Atom, ChemPer Bond, ...)
        storages: list of tuples in form (AtomStorage, BondStorage, ...)

        Tuples can be of any length as long as they are the same, so for example, in
        a bond you might only care about the outer atoms for comparison so you would compare
        (atom1,) and (atom2,) with (atom_storage1,) and (atom_storage2,)
        However, in a torsion, you might want the atoms and bonds for each outer bond
        so in that case you would compare
        (atom1, bond1, atom2) and (atom4, bond3, atom3)
        with the corresponding storage objects.

        Returns
        -------
        pairs : list of lists
            pairs of atoms and storage objects that are most similar,
            these lists always come in the form (all atom/bonds, all storage objects)
            for the bond example above you might get
            [ [atom1, storage1], [atom2, storage2] ]
            for the torsion example you might get
            [ [atom4, bond4, atom3, atom_storage1, bond_storage1, atom_storage2],
              [atom1, bond1, atom2, atom_storage4, bond_storage3, atom_storage3]

        """
        # store paired stets of atoms/bonds and corresponding storages
        pairs = list()
        # check for odd cases
        combo = atoms_and_bonds + storages
        # 1. both lists are empty
        if len(combo) == 0:
            return pairs

        nones = [None] * len(combo[0])
        # 2. no atom/bond storage
        if len(atoms_and_bonds) == 0:
            for storage_set in storages:
                pairs.append(nones + list(storage_set))
            return pairs

        # 3. no storages
        if len(storages) == 0:
            for atom_set in atoms_and_bonds:
                pairs.append(list(atom_set) + nones)
            return pairs

        g = nx.Graph()

        atom_dict = dict()
        storage_dict = dict()

        # create a bipartite graph with atoms/bonds on one side
        for idx, atom_set in enumerate(atoms_and_bonds):
            g.add_node(idx+1, bipartite=0)
            atom_dict[idx+1] = atom_set
        # and atom/bond storage objects on the other
        for idx, storage_set in enumerate(storages):
            g.add_node((idx*-1)-1, bipartite=1)
            storage_dict[(idx*-1)-1] = storage_set

        # Fill in the weight on each edge of the graph using the compare_atom/bond functions
        for a_idx, atom_set in atom_dict.items():
            for s_idx, storage_set in storage_dict.items():
                # sum up score for every entry in the atom and storage set
                score = 0
                for sa, a in zip(storage_set, atom_set):
                    if isinstance(sa, self.BondStorage):
                        score += sa.compare_bond(a)
                    else:
                        score += sa.compare_atom(a)
                # score can't be zero so save score+1
                g.add_edge(a_idx,s_idx,weight=score+1)

        # calculate maximum matching, that is the pairing of atoms/bonds to
        # storage objects that leads the the highest overall score
        matching = nx.algorithms.max_weight_matching(g,maxcardinality=False)
        # track the atoms assigned a paired storage object
        pair_set = set()

        # store all pairs
        for idx_1, idx_2 in matching:
            pair_set.add(idx_1)
            pair_set.add(idx_2)
            if idx_1 in atom_dict:
                atom_set = atom_dict[idx_1]
                storage_set = storage_dict[idx_2]
            else:
                atom_set = atom_dict[idx_2]
                storage_set = storage_dict[idx_1]
            pairs.append(list(atom_set) + list(storage_set))

        # check for missing atom storages
        for a_idx, atom_set in atom_dict.items():
            if a_idx not in pair_set:
                pairs.append(list(atom_set) + nones)

        # check for missing atoms
        for s_idx, storage_set in storage_dict.items():
            if s_idx not in pair_set:
                pairs.append(nones + list(storage_set))

        return pairs

    def _add_mol(self, mol, smirks_atoms_list):
        """
        private function for adding a new molecule
        This is used by add_mol if the graph is not empty, allowing the user to
        not have to track if the graph already has information before adding molecules

        Parameters
        ----------
        mol : any Mol
        smirks_atoms_list : list of dicts
            This is a list of dictionaries of the form [{smirks index: atom index}]
            each atom (by index) in the dictionary will be added the relevant
            AtomStorage by smirks index
        """
        for smirks_atoms in smirks_atoms_list:
            atom_dict = dict()
            sorted_smirks_atoms = self._symmetry_funct(mol, smirks_atoms)
            for key, atom_index in enumerate(sorted_smirks_atoms, 1):
                atom_dict[atom_index] = key
                atom1 = mol.get_atom_by_index(atom_index)
                self.atom_by_label[key].add_atom(atom1)

                for neighbor_key, neighbor_index in enumerate(sorted_smirks_atoms, 1):
                    # check for connecting bond
                    atom2 = mol.get_atom_by_index(neighbor_index)
                    bond = mol.get_bond_by_atoms(atom1, atom2)
                    if bond is not None and (neighbor_key, key) in self.bond_by_label:
                        bond_smirks = tuple(sorted([neighbor_key, key]))
                        self.bond_by_label[bond_smirks].add_bond(bond)

            for atom_label, atom_index in enumerate(sorted_smirks_atoms, 1):
                atom = mol.get_atom_by_index(atom_index)
                storage = self.atom_by_label[atom_label]
                self._add_layers(mol, atom, storage, self.layers, atom_dict)

    def _no_symmetry(self, mol, smirks_atoms):
        """
        No change is made to the atom order for this molecule
        """
        return smirks_atoms

    def _bond_symmetry(self, mol, smirks_atoms):
        """
        Returns a tuple of two atom indices in the order that
        leads to the atoms that match with previously stored atoms.

        Parameters
        -----------
        mol : ChemPer Mol
        smirks_atoms : two tuple
            tuple of atom indices

        Returns
        --------
        ordered_smirks_atoms : two tuple
            tuple of atom indices as they should be added to the graph
        """
        # pair atoms and bonds
        atom1 = mol.get_atom_by_index(smirks_atoms[0])
        atom2 = mol.get_atom_by_index(smirks_atoms[1])
        # Find potential storages for those atoms and bonds
        atoms_and_bonds = [(atom1,), (atom2,)]
        storages = [
            (self.atom_by_label[1],),
            (self.atom_by_label[2],)
        ]
        pairs = self.find_pairs(atoms_and_bonds, storages)
        ordered_smirks_atoms = [p[0].get_index() for p in sorted(pairs, key=lambda x: x[1].label)]
        return tuple(ordered_smirks_atoms)

    def _angle_symmetry(self, mol, smirks_atoms):
        """
        Returns a tuple of three atom indices in the order that
        leads to the atoms that match with previously stored atoms.

        Parameters
        -----------
        mol : ChemPer Mol
        smirks_atoms : three tuple
            tuple of atom indices

        Returns
        --------
        ordered_smirks_atoms : three tuple
            tuple of atom indices as they should be added to the graph
        """
        # get all three atoms
        atom1 = mol.get_atom_by_index(smirks_atoms[0])
        atom2 = mol.get_atom_by_index(smirks_atoms[1])
        atom3 = mol.get_atom_by_index(smirks_atoms[2])
        # get both bonds
        bond1 = mol.get_bond_by_atoms(atom1, atom2)
        bond2 = mol.get_bond_by_atoms(atom2, atom3)
        if None in (bond1, bond2):
            return smirks_atoms
        # save atom and bond pairs that could be reordered
        atoms_and_bonds = [(atom1, bond1), (atom3, bond2)]
        # find current atom and bond storage
        storages = [
            (self.atom_by_label[1], self.bond_by_label[(1,2)]),
            (self.atom_by_label[3], self.bond_by_label[(2,3)])
        ]
        pairs = self.find_pairs(atoms_and_bonds, storages)
        order = [p[0].get_index() for p in sorted(pairs, key=lambda x: x[2].label)]
        return tuple((order[0], smirks_atoms[1], order[1]))

    def _proper_torsion_symmetry(self, mol, smirks_atoms):
        """
        Returns a tuple of four atom indices for a proper torsion
        reordered to match with previously stored atoms.

        Parameters
        -----------
        mol : ChemPer Mol
        smirks_atoms : four tuple
            tuple of atom indices

        Returns
        --------
        ordered_smirks_atoms : four tuple
            tuple of atom indices as they should be added to the graph
        """
        # get all four atoms
        atom1 = mol.get_atom_by_index(smirks_atoms[0])
        atom2 = mol.get_atom_by_index(smirks_atoms[1])
        atom3 = mol.get_atom_by_index(smirks_atoms[2])
        atom4 = mol.get_atom_by_index(smirks_atoms[3])
        # get two relevant bonds
        bond1 = mol.get_bond_by_atoms(atom1, atom2)
        bond3 = mol.get_bond_by_atoms(atom3, atom4)
        if None in (bond1, bond3):
            return smirks_atoms
        # make pairs
        atoms_and_bonds = [ (atom2, bond1, atom1), (atom3, bond3, atom4) ]
        # get atom and bond storages
        storages = [
            (self.atom_by_label[2], self.bond_by_label[(1,2)], self.atom_by_label[1]),
            (self.atom_by_label[3], self.bond_by_label[(3,4)], self.atom_by_label[4])
        ]
        pairs = self.find_pairs(atoms_and_bonds, storages)
        order = [p[0].get_index() for p in sorted(pairs, key=lambda x: x[3].label)]
        if order[0] == smirks_atoms[1]:
            return smirks_atoms
        temp = list(smirks_atoms)
        temp.reverse()
        return tuple(temp)

    def _improper_torsion_symmetry(self, mol, smirks_atoms):
        """
        Returns a tuple of four atom indices for an improper torsion
        reordered to match with previously stored atoms.

        Parameters
        -----------
        mol : ChemPer Mol
        smirks_atoms : four tuple
            tuple of atom indices

        Returns
        --------
        ordered_smirks_atoms : four tuple
            tuple of atom indices as they should be added to the graph
        """
        # get all four atoms
        atom1 = mol.get_atom_by_index(smirks_atoms[0])
        atom2 = mol.get_atom_by_index(smirks_atoms[1])
        atom3 = mol.get_atom_by_index(smirks_atoms[2])
        atom4 = mol.get_atom_by_index(smirks_atoms[3])
        # get all three bonds
        bond1 = mol.get_bond_by_atoms(atom1, atom2)
        bond2 = mol.get_bond_by_atoms(atom2, atom3)
        bond3 = mol.get_bond_by_atoms(atom2, atom4)
        if None in (bond1, bond2, bond3):
            return smirks_atoms
        # make pairs of atoms and bonds to be reordered
        atoms_and_bonds = [
            (atom1, bond1), (atom3, bond2), (atom4, bond3)
        ]
        # find current atom and bond storages
        storages = [
            (self.atom_by_label[1], self.bond_by_label[(1,2)]),
            (self.atom_by_label[3], self.bond_by_label[(2,3)]),
            (self.atom_by_label[4], self.bond_by_label[(2,4)])
        ]
        pairs = self.find_pairs(atoms_and_bonds, storages)
        order = [p[0].get_index() for p in sorted(pairs, key=lambda x: x[2].label)]
        return tuple((order[0], smirks_atoms[1], order[1], order[2]))