Trương Gia Bảo committed
Commit a3ea5d3 · 1 Parent(s): 6c75a42

Initial commit

.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.iter-685000 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,111 @@
+ import torch
+ import streamlit as st
+ import sys, os
+ import rdkit
+ import rdkit.Chem as Chem
+ from rdkit.Chem.Draw import MolToImage
+ from rdkit.Chem import Descriptors
+ import sascorer
+ import networkx as nx
+
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+
+ sys.path.append('%s/fast_jtnn/' % os.path.dirname(os.path.realpath(__file__)))
+ from mol_tree import Vocab, MolTree
+ from jtprop_vae import JTPropVAE
+ from molbloom import buy
+
+ lg = rdkit.RDLogger.logger()
+ lg.setLevel(rdkit.RDLogger.CRITICAL)
+
+ st.header('Junction Tree Variational Autoencoder for Molecular Graph Generation (JTVAE)')
+ st.subheader('Wengong Jin, Regina Barzilay, Tommi Jaakkola')
+ descrip = '''
+ We seek to automate the design of molecules based on specific chemical properties. In computational terms, this task involves continuous embedding and generation of molecular graphs. Our primary contribution is the direct realization of molecular graphs, a task previously approached by generating linear SMILES strings instead of graphs. Our junction tree variational autoencoder generates molecular graphs in two phases, by first generating a tree-structured scaffold over chemical substructures, and then combining them into a molecule with a graph message passing network. This approach allows us to incrementally expand molecules while maintaining chemical validity at every step. We evaluate our model on multiple tasks ranging from molecular generation to optimization. Across these tasks, our model outperforms previous state-of-the-art baselines by a significant margin.
+
+ [https://arxiv.org/abs/1802.04364](https://arxiv.org/abs/1802.04364)'''
+
+ with st.expander('About'):
+     st.markdown(descrip)
+
+ st.text_input('Enter a SMILES string:', 'CNC(=O)C1=NC=CC(=C1)OC2=CC=C(C=C2)NC(=O)NC3=CC(=C(C=C3)Cl)C(F)(F)F', key='smiles')
+
+ def penalized_logp_standard(mol):
+     logP_mean = 2.4399606244103639873799239
+     logP_std = 0.9293197802518905481505840
+     SA_mean = -2.4485512208785431553792478
+     SA_std = 0.4603110476923852334429910
+     cycle_mean = -0.0307270378623088931402396
+     cycle_std = 0.2163675785228087178335699
+
+     log_p = Descriptors.MolLogP(mol)
+     SA = -sascorer.calculateScore(mol)
+
+     # cycle score
+     cycle_list = nx.cycle_basis(nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
+     if len(cycle_list) == 0:
+         cycle_length = 0
+     else:
+         cycle_length = max([len(j) for j in cycle_list])
+     if cycle_length <= 6:
+         cycle_length = 0
+     else:
+         cycle_length = cycle_length - 6
+     cycle_score = -cycle_length
+
+     standardized_log_p = (log_p - logP_mean) / logP_std
+     standardized_SA = (SA - SA_mean) / SA_std
+     standardized_cycle = (cycle_score - cycle_mean) / cycle_std
+     return standardized_log_p + standardized_SA + standardized_cycle
+
+ mol = Chem.MolFromSmiles(st.session_state.smiles)
+ if mol is None:
+     st.write('SMILES is invalid. Please enter a valid SMILES.')
+ else:
+     st.write('Molecule:')
+     st.image(MolToImage(mol, size=(300, 300)))
+     score = penalized_logp_standard(mol)
+     st.write('Penalized logP score: %.5f' % (score))
+
+ if mol is not None:
+     st.slider('Choose learning rate: ', 0.0, 10.0, 0.4, key='lr')
+     st.slider('Choose similarity cutoff: ', 0.0, 3.0, 0.4, key='sim_cutoff')
+     st.slider('Choose number of iterations: ', 1, 100, 80, key='n_iter')
+     vocab = [x.strip("\r\n ") for x in open('./vocab.txt')]
+     vocab = Vocab(vocab)
+     if st.button('Optimize'):
+         st.write('Testing')
+
+         model = JTPropVAE(vocab, 450, 56, 20, 3)
+         model.load_state_dict(torch.load('./model.iter-685000', map_location=torch.device('cpu')))
+
+         new_smiles, sim = model.optimize(st.session_state.smiles, sim_cutoff=st.session_state.sim_cutoff, lr=st.session_state.lr, num_iter=st.session_state.n_iter)
+
+         del model
+         if new_smiles is None:
+             st.write('Cannot optimize.')
+         else:
+             st.write('New SMILES:')
+             st.code(new_smiles)
+             new_mol = Chem.MolFromSmiles(new_smiles)
+             if new_mol is None:
+                 st.write('New SMILES is invalid.')
+             else:
+                 st.write('New SMILES molecule:')
+                 st.image(MolToImage(new_mol, size=(300, 300)))
+                 new_score = penalized_logp_standard(new_mol)
+                 st.write('New penalized logP score: %.5f' % (new_score))
+                 st.write('Caching ZINC20 if necessary...')
+                 if buy(new_smiles, catalog='zinc20', canonicalize=True):
+                     st.write('This molecule exists.')
+                     st.caption('Checked by molbloom.')
+                 else:
+                     st.write('THIS MOLECULE DOES NOT EXIST!')
+                     st.caption('Checked by molbloom.')
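The Space is launched with `streamlit run app.py`. For reference, a minimal sketch of driving the same optimization path without the UI; it reuses exactly the files and calls shown above (`vocab.txt`, the LFS checkpoint `model.iter-685000`, `JTPropVAE.optimize`), and the input SMILES is an arbitrary placeholder:

```python
# Sketch only: mirrors app.py's model setup and optimize call.
import sys, os, torch

sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'fast_jtnn'))
from mol_tree import Vocab
from jtprop_vae import JTPropVAE

vocab = Vocab([x.strip("\r\n ") for x in open('./vocab.txt')])
model = JTPropVAE(vocab, 450, 56, 20, 3)  # same positional arguments as app.py
model.load_state_dict(torch.load('./model.iter-685000', map_location=torch.device('cpu')))
new_smiles, sim = model.optimize('CCO', sim_cutoff=0.4, lr=0.4, num_iter=80)  # arbitrary input SMILES
print(new_smiles, sim)
```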
fast_jtnn/__init__.py ADDED
@@ -0,0 +1,9 @@
+ # import sys
+ # sys.path.append('./')
+ from mol_tree import Vocab, MolTree
+ from jtnn_vae import JTNNVAE
+ from jtnn_enc import JTNNEncoder
+ from jtmpn import JTMPN
+ from mpn import MPN
+ from nnutils import create_var
+ from datautils import MolTreeFolder, PairTreeFolder, MolTreeDataset
fast_jtnn/chemutils.py ADDED
@@ -0,0 +1,429 @@
+ import rdkit
+ import rdkit.Chem as Chem
+ from scipy.sparse import csr_matrix
+ from scipy.sparse.csgraph import minimum_spanning_tree
+ from collections import defaultdict
+ from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
+ from vocab import Vocab
+
+ MST_MAX_WEIGHT = 100
+ MAX_NCAND = 2000
+
+ def set_atommap(mol, num=0):
+     for atom in mol.GetAtoms():
+         atom.SetAtomMapNum(num)
+
+ def get_mol(smiles):
+     mol = Chem.MolFromSmiles(smiles)
+     if mol is None:
+         return None
+     Chem.Kekulize(mol, clearAromaticFlags=True)
+     return mol
+
+ def get_smiles(mol):
+     return Chem.MolToSmiles(mol, kekuleSmiles=True)
+
+ def decode_stereo(smiles2D):
+     mol = Chem.MolFromSmiles(smiles2D)
+     dec_isomers = list(EnumerateStereoisomers(mol))
+
+     dec_isomers = [Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=True)) for mol in dec_isomers]
+     smiles3D = [Chem.MolToSmiles(mol, isomericSmiles=True) for mol in dec_isomers]
+
+     chiralN = [atom.GetIdx() for atom in dec_isomers[0].GetAtoms() if int(atom.GetChiralTag()) > 0 and atom.GetSymbol() == "N"]
+     if len(chiralN) > 0:
+         for mol in dec_isomers:
+             for idx in chiralN:
+                 mol.GetAtomWithIdx(idx).SetChiralTag(Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
+             smiles3D.append(Chem.MolToSmiles(mol, isomericSmiles=True))
+
+     return smiles3D
+
+ def sanitize(mol):
+     try:
+         smiles = get_smiles(mol)
+         mol = get_mol(smiles)
+     except Exception as e:
+         return None
+     return mol
+
+ def copy_atom(atom):
+     new_atom = Chem.Atom(atom.GetSymbol())
+     new_atom.SetFormalCharge(atom.GetFormalCharge())
+     new_atom.SetAtomMapNum(atom.GetAtomMapNum())
+     return new_atom
+
+ def copy_edit_mol(mol):
+     new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
+     for atom in mol.GetAtoms():
+         new_atom = copy_atom(atom)
+         new_mol.AddAtom(new_atom)
+     for bond in mol.GetBonds():
+         a1 = bond.GetBeginAtom().GetIdx()
+         a2 = bond.GetEndAtom().GetIdx()
+         bt = bond.GetBondType()
+         new_mol.AddBond(a1, a2, bt)
+     return new_mol
+
+ def get_clique_mol(mol, atoms):
+     smiles = Chem.MolFragmentToSmiles(mol, atoms, kekuleSmiles=True)
+     new_mol = Chem.MolFromSmiles(smiles, sanitize=False)
+     new_mol = copy_edit_mol(new_mol).GetMol()
+     new_mol = sanitize(new_mol)  # We assume this is not None
+     return new_mol
+
+ def tree_decomp(mol):
+     n_atoms = mol.GetNumAtoms()
+     if n_atoms == 1:  # special case
+         return [[0]], []
+
+     cliques = []
+     for bond in mol.GetBonds():
+         a1 = bond.GetBeginAtom().GetIdx()
+         a2 = bond.GetEndAtom().GetIdx()
+         if not bond.IsInRing():
+             cliques.append([a1, a2])
+
+     ssr = [list(x) for x in Chem.GetSymmSSSR(mol)]
+     cliques.extend(ssr)
+
+     nei_list = [[] for i in range(n_atoms)]
+     for i in range(len(cliques)):
+         for atom in cliques[i]:
+             nei_list[atom].append(i)
+
+     # Merge rings with intersection > 2 atoms
+     for i in range(len(cliques)):
+         if len(cliques[i]) <= 2: continue
+         for atom in cliques[i]:
+             for j in nei_list[atom]:
+                 if i >= j or len(cliques[j]) <= 2: continue
+                 inter = set(cliques[i]) & set(cliques[j])
+                 if len(inter) > 2:
+                     cliques[i].extend(cliques[j])
+                     cliques[i] = list(set(cliques[i]))
+                     cliques[j] = []
+
+     cliques = [c for c in cliques if len(c) > 0]
+     nei_list = [[] for i in range(n_atoms)]
+     for i in range(len(cliques)):
+         for atom in cliques[i]:
+             nei_list[atom].append(i)
+
+     # Build edges and add singleton cliques
+     edges = defaultdict(int)
+     for atom in range(n_atoms):
+         if len(nei_list[atom]) <= 1:
+             continue
+         cnei = nei_list[atom]
+         bonds = [c for c in cnei if len(cliques[c]) == 2]
+         rings = [c for c in cnei if len(cliques[c]) > 4]
+         if len(bonds) > 2 or (len(bonds) == 2 and len(cnei) > 2):  # In general, if len(cnei) >= 3, a singleton should be added, but 1 bond + 2 rings is currently not dealt with.
+             cliques.append([atom])
+             c2 = len(cliques) - 1
+             for c1 in cnei:
+                 edges[(c1, c2)] = 1
+         elif len(rings) > 2:  # Multiple (n > 2) complex rings
+             cliques.append([atom])
+             c2 = len(cliques) - 1
+             for c1 in cnei:
+                 edges[(c1, c2)] = MST_MAX_WEIGHT - 1
+         else:
+             for i in range(len(cnei)):
+                 for j in range(i + 1, len(cnei)):
+                     c1, c2 = cnei[i], cnei[j]
+                     inter = set(cliques[c1]) & set(cliques[c2])
+                     if edges[(c1, c2)] < len(inter):
+                         edges[(c1, c2)] = len(inter)  # cnei[i] < cnei[j] by construction
+
+     edges = [u + (MST_MAX_WEIGHT - v,) for u, v in edges.items()]
+     if len(edges) == 0:
+         return cliques, edges
+
+     # Compute maximum spanning tree
+     row, col, data = zip(*edges)
+     n_clique = len(cliques)
+     clique_graph = csr_matrix((data, (row, col)), shape=(n_clique, n_clique))
+     junc_tree = minimum_spanning_tree(clique_graph)
+     row, col = junc_tree.nonzero()
+     edges = [(row[i], col[i]) for i in range(len(row))]
+     return (cliques, edges)
+
+ def atom_equal(a1, a2):
+     return a1.GetSymbol() == a2.GetSymbol() and a1.GetFormalCharge() == a2.GetFormalCharge()
+
+ # Bond type not considered because all aromatic (so SINGLE matches DOUBLE)
+ def ring_bond_equal(b1, b2, reverse=False):
+     b1 = (b1.GetBeginAtom(), b1.GetEndAtom())
+     if reverse:
+         b2 = (b2.GetEndAtom(), b2.GetBeginAtom())
+     else:
+         b2 = (b2.GetBeginAtom(), b2.GetEndAtom())
+     return atom_equal(b1[0], b2[0]) and atom_equal(b1[1], b2[1])
+
+ def attach_mols(ctr_mol, neighbors, prev_nodes, nei_amap):
+     prev_nids = [node.nid for node in prev_nodes]
+     for nei_node in prev_nodes + neighbors:
+         nei_id, nei_mol = nei_node.nid, nei_node.mol
+         amap = nei_amap[nei_id]
+         for atom in nei_mol.GetAtoms():
+             if atom.GetIdx() not in amap:
+                 new_atom = copy_atom(atom)
+                 amap[atom.GetIdx()] = ctr_mol.AddAtom(new_atom)
+
+         if nei_mol.GetNumBonds() == 0:
+             nei_atom = nei_mol.GetAtomWithIdx(0)
+             ctr_atom = ctr_mol.GetAtomWithIdx(amap[0])
+             ctr_atom.SetAtomMapNum(nei_atom.GetAtomMapNum())
+         else:
+             for bond in nei_mol.GetBonds():
+                 a1 = amap[bond.GetBeginAtom().GetIdx()]
+                 a2 = amap[bond.GetEndAtom().GetIdx()]
+                 if ctr_mol.GetBondBetweenAtoms(a1, a2) is None:
+                     ctr_mol.AddBond(a1, a2, bond.GetBondType())
+                 elif nei_id in prev_nids:  # father node overrides
+                     ctr_mol.RemoveBond(a1, a2)
+                     ctr_mol.AddBond(a1, a2, bond.GetBondType())
+     return ctr_mol
+
+ def local_attach(ctr_mol, neighbors, prev_nodes, amap_list):
+     ctr_mol = copy_edit_mol(ctr_mol)
+     nei_amap = {nei.nid: {} for nei in prev_nodes + neighbors}
+
+     for nei_id, ctr_atom, nei_atom in amap_list:
+         nei_amap[nei_id][nei_atom] = ctr_atom
+
+     ctr_mol = attach_mols(ctr_mol, neighbors, prev_nodes, nei_amap)
+     return ctr_mol.GetMol()
+
+ # This version records idx mapping between ctr_mol and nei_mol
+ def enum_attach(ctr_mol, nei_node, amap, singletons):
+     nei_mol, nei_idx = nei_node.mol, nei_node.nid
+     att_confs = []
+     black_list = [atom_idx for nei_id, atom_idx, _ in amap if nei_id in singletons]
+     ctr_atoms = [atom for atom in ctr_mol.GetAtoms() if atom.GetIdx() not in black_list]
+     ctr_bonds = [bond for bond in ctr_mol.GetBonds()]
+
+     if nei_mol.GetNumBonds() == 0:  # neighbor singleton
+         nei_atom = nei_mol.GetAtomWithIdx(0)
+         used_list = [atom_idx for _, atom_idx, _ in amap]
+         for atom in ctr_atoms:
+             if atom_equal(atom, nei_atom) and atom.GetIdx() not in used_list:
+                 new_amap = amap + [(nei_idx, atom.GetIdx(), 0)]
+                 att_confs.append(new_amap)
+
+     elif nei_mol.GetNumBonds() == 1:  # neighbor is a bond
+         bond = nei_mol.GetBondWithIdx(0)
+         bond_val = int(bond.GetBondTypeAsDouble())
+         b1, b2 = bond.GetBeginAtom(), bond.GetEndAtom()
+
+         for atom in ctr_atoms:
+             # Optimize if atom is carbon (other atoms may change valence)
+             if atom.GetAtomicNum() == 6 and atom.GetTotalNumHs() < bond_val:
+                 continue
+             if atom_equal(atom, b1):
+                 new_amap = amap + [(nei_idx, atom.GetIdx(), b1.GetIdx())]
+                 att_confs.append(new_amap)
+             elif atom_equal(atom, b2):
+                 new_amap = amap + [(nei_idx, atom.GetIdx(), b2.GetIdx())]
+                 att_confs.append(new_amap)
+     else:
+         # intersection is an atom
+         for a1 in ctr_atoms:
+             for a2 in nei_mol.GetAtoms():
+                 if atom_equal(a1, a2):
+                     # Optimize if atom is carbon (other atoms may change valence)
+                     if a1.GetAtomicNum() == 6 and a1.GetTotalNumHs() + a2.GetTotalNumHs() < 4:
+                         continue
+                     new_amap = amap + [(nei_idx, a1.GetIdx(), a2.GetIdx())]
+                     att_confs.append(new_amap)
+
+         # intersection is a bond
+         if ctr_mol.GetNumBonds() > 1:
+             for b1 in ctr_bonds:
+                 for b2 in nei_mol.GetBonds():
+                     if ring_bond_equal(b1, b2):
+                         new_amap = amap + [(nei_idx, b1.GetBeginAtom().GetIdx(), b2.GetBeginAtom().GetIdx()), (nei_idx, b1.GetEndAtom().GetIdx(), b2.GetEndAtom().GetIdx())]
+                         att_confs.append(new_amap)
+
+                     if ring_bond_equal(b1, b2, reverse=True):
+                         new_amap = amap + [(nei_idx, b1.GetBeginAtom().GetIdx(), b2.GetEndAtom().GetIdx()), (nei_idx, b1.GetEndAtom().GetIdx(), b2.GetBeginAtom().GetIdx())]
+                         att_confs.append(new_amap)
+     return att_confs
+
+ # Try rings first: speed-up
+ def enum_assemble(node, neighbors, prev_nodes=[], prev_amap=[]):
+     all_attach_confs = []
+     singletons = [nei_node.nid for nei_node in neighbors + prev_nodes if nei_node.mol.GetNumAtoms() == 1]
+
+     def search(cur_amap, depth):
+         if len(all_attach_confs) > MAX_NCAND:
+             return
+         if depth == len(neighbors):
+             all_attach_confs.append(cur_amap)
+             return
+
+         nei_node = neighbors[depth]
+         cand_amap = enum_attach(node.mol, nei_node, cur_amap, singletons)
+         cand_smiles = set()
+         candidates = []
+         for amap in cand_amap:
+             cand_mol = local_attach(node.mol, neighbors[:depth + 1], prev_nodes, amap)
+             cand_mol = sanitize(cand_mol)
+             if cand_mol is None:
+                 continue
+             smiles = get_smiles(cand_mol)
+             if smiles in cand_smiles:
+                 continue
+             cand_smiles.add(smiles)
+             candidates.append(amap)
+
+         if len(candidates) == 0:
+             return
+
+         for new_amap in candidates:
+             search(new_amap, depth + 1)
+
+     search(prev_amap, 0)
+     cand_smiles = set()
+     candidates = []
+     aroma_score = []
+     for amap in all_attach_confs:
+         cand_mol = local_attach(node.mol, neighbors, prev_nodes, amap)
+         cand_mol = Chem.MolFromSmiles(Chem.MolToSmiles(cand_mol))
+         smiles = Chem.MolToSmiles(cand_mol)
+         if smiles in cand_smiles or check_singleton(cand_mol, node, neighbors) == False:
+             continue
+         cand_smiles.add(smiles)
+         candidates.append((smiles, amap))
+         aroma_score.append(check_aroma(cand_mol, node, neighbors))
+
+     return candidates, aroma_score
+
+ def check_singleton(cand_mol, ctr_node, nei_nodes):
+     rings = [node for node in nei_nodes + [ctr_node] if node.mol.GetNumAtoms() > 2]
+     singletons = [node for node in nei_nodes + [ctr_node] if node.mol.GetNumAtoms() == 1]
+     if len(singletons) > 0 or len(rings) == 0: return True
+
+     n_leaf2_atoms = 0
+     for atom in cand_mol.GetAtoms():
+         nei_leaf_atoms = [a for a in atom.GetNeighbors() if not a.IsInRing()]  # a.GetDegree() == 1
+         if len(nei_leaf_atoms) > 1:
+             n_leaf2_atoms += 1
+
+     return n_leaf2_atoms == 0
+
+ def check_aroma(cand_mol, ctr_node, nei_nodes):
+     rings = [node for node in nei_nodes + [ctr_node] if node.mol.GetNumAtoms() >= 3]
+     if len(rings) < 2: return 0  # Only multi-ring systems need to be checked
+
+     get_nid = lambda x: 0 if x.is_leaf else x.nid
+     benzynes = [get_nid(node) for node in nei_nodes + [ctr_node] if node.smiles in Vocab.benzynes]
+     penzynes = [get_nid(node) for node in nei_nodes + [ctr_node] if node.smiles in Vocab.penzynes]
+     if len(benzynes) + len(penzynes) == 0:
+         return 0  # No specific aromatic rings
+
+     n_aroma_atoms = 0
+     for atom in cand_mol.GetAtoms():
+         if atom.GetAtomMapNum() in benzynes + penzynes and atom.GetIsAromatic():
+             n_aroma_atoms += 1
+
+     if n_aroma_atoms >= len(benzynes) * 4 + len(penzynes) * 3:
+         return 1000
+     else:
+         return -0.001
+
+ # Only used for debugging purposes
+ def dfs_assemble(cur_mol, global_amap, fa_amap, cur_node, fa_node):
+     fa_nid = fa_node.nid if fa_node is not None else -1
+     prev_nodes = [fa_node] if fa_node is not None else []
+
+     children = [nei for nei in cur_node.neighbors if nei.nid != fa_nid]
+     neighbors = [nei for nei in children if nei.mol.GetNumAtoms() > 1]
+     neighbors = sorted(neighbors, key=lambda x: x.mol.GetNumAtoms(), reverse=True)
+     singletons = [nei for nei in children if nei.mol.GetNumAtoms() == 1]
+     neighbors = singletons + neighbors
+
+     cur_amap = [(fa_nid, a2, a1) for nid, a1, a2 in fa_amap if nid == cur_node.nid]
+     cands, _ = enum_assemble(cur_node, neighbors, prev_nodes, cur_amap)  # enum_assemble returns (candidates, aroma_score)
+
+     cand_smiles, cand_amap = zip(*cands)
+     label_idx = cand_smiles.index(cur_node.label)
+     label_amap = cand_amap[label_idx]
+
+     for nei_id, ctr_atom, nei_atom in label_amap:
+         if nei_id == fa_nid:
+             continue
+         global_amap[nei_id][nei_atom] = global_amap[cur_node.nid][ctr_atom]
+
+     cur_mol = attach_mols(cur_mol, children, [], global_amap)  # father is already attached
+     for nei_node in children:
+         if not nei_node.is_leaf:
+             dfs_assemble(cur_mol, global_amap, label_amap, nei_node, cur_node)
+
+ if __name__ == "__main__":
+     import sys
+     from mol_tree import MolTree
+     lg = rdkit.RDLogger.logger()
+     lg.setLevel(rdkit.RDLogger.CRITICAL)
+
+     smiles = ["O=C1[C@@H]2C=C[C@@H](C=CC2)C1(c1ccccc1)c1ccccc1", "O=C([O-])CC[C@@]12CCCC[C@]1(O)OC(=O)CC2", "ON=C1C[C@H]2CC3(C[C@@H](C1)c1ccccc12)OCCO3", "C[C@H]1CC(=O)[C@H]2[C@@]3(O)C(=O)c4cccc(O)c4[C@@H]4O[C@@]43[C@@H](O)C[C@]2(O)C1", 'Cc1cc(NC(=O)CSc2nnc3c4ccccc4n(C)c3n2)ccc1Br', 'CC(C)(C)c1ccc(C(=O)N[C@H]2CCN3CCCc4cccc2c43)cc1', "O=c1c2ccc3c(=O)n(-c4nccs4)c(=O)c4ccc(c(=O)n1-c1nccs1)c2c34", "O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1"]
+
+     def tree_test():
+         for s in sys.stdin:
+             s = s.split()[0]
+             tree = MolTree(s)
+             print('-------------------------------------------')
+             print(s)
+             for node in tree.nodes:
+                 print(node.smiles, [x.smiles for x in node.neighbors])
+
+     def decode_test():
+         wrong = 0
+         for tot, s in enumerate(sys.stdin):
+             s = s.split()[0]
+             tree = MolTree(s)
+             tree.recover()
+
+             cur_mol = copy_edit_mol(tree.nodes[0].mol)
+             global_amap = [{}] + [{} for node in tree.nodes]
+             global_amap[1] = {atom.GetIdx(): atom.GetIdx() for atom in cur_mol.GetAtoms()}
+
+             dfs_assemble(cur_mol, global_amap, [], tree.nodes[0], None)
+
+             cur_mol = cur_mol.GetMol()
+             cur_mol = Chem.MolFromSmiles(Chem.MolToSmiles(cur_mol))
+             set_atommap(cur_mol)
+             dec_smiles = Chem.MolToSmiles(cur_mol)
+
+             gold_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(s))
+             if gold_smiles != dec_smiles:
+                 print(gold_smiles, dec_smiles)
+                 wrong += 1
+             print(wrong, tot + 1)
+
+     def enum_test():
+         for s in sys.stdin:
+             s = s.split()[0]
+             tree = MolTree(s)
+             tree.recover()
+             tree.assemble()
+             for node in tree.nodes:
+                 if node.label not in node.cands:
+                     print(tree.smiles)
+                     print(node.smiles, [x.smiles for x in node.neighbors])
+                     print(node.label, len(node.cands))
+
+     def count():
+         cnt, n = 0, 0
+         for s in sys.stdin:
+             s = s.split()[0]
+             tree = MolTree(s)
+             tree.recover()
+             tree.assemble()
+             for node in tree.nodes:
+                 cnt += len(node.cands)
+             n += len(tree.nodes)
+         # print(cnt * 1.0 / n)
+
+     count()
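A small usage sketch for the decomposition utilities above (it assumes `fast_jtnn/` is on `sys.path`, matching the flat imports this file itself uses):

```python
# Sketch: junction-tree decomposition of a molecule via chemutils.
from chemutils import get_mol, tree_decomp, get_clique_mol, get_smiles

mol = get_mol('Cc1ccccc1C(=O)O')   # kekulized RDKit mol
cliques, edges = tree_decomp(mol)  # cliques: lists of atom indices
print([get_smiles(get_clique_mol(mol, c)) for c in cliques])  # clique substructures as SMILES
print(edges)                       # junction-tree edges from the maximum spanning tree
```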
fast_jtnn/datautils.py ADDED
@@ -0,0 +1,213 @@
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from mol_tree import MolTree
+ import numpy as np
+ from jtnn_enc import JTNNEncoder
+ from mpn import MPN
+ from jtmpn import JTMPN
+ import pickle
+ import os, random
+
+ class PairTreeFolder(object):
+
+     def __init__(self, data_folder, vocab, batch_size, num_workers=4, shuffle=True, y_assm=True, replicate=None):
+         self.data_folder = data_folder
+         self.data_files = [fn for fn in os.listdir(data_folder)]
+         self.batch_size = batch_size
+         self.vocab = vocab
+         self.num_workers = num_workers
+         self.y_assm = y_assm
+         self.shuffle = shuffle
+
+         if replicate is not None:  # replicate is an int expansion factor
+             self.data_files = self.data_files * replicate
+
+     def __iter__(self):
+         for fn in self.data_files:
+             fn = os.path.join(self.data_folder, fn)
+             with open(fn, 'rb') as f:
+                 data = pickle.load(f)
+
+             if self.shuffle:
+                 random.shuffle(data)  # shuffle data before batching
+
+             batches = [data[i:i + self.batch_size] for i in range(0, len(data), self.batch_size)]
+             if len(batches[-1]) < self.batch_size:
+                 batches.pop()
+
+             dataset = PairTreeDataset(batches, self.vocab, self.y_assm)
+             dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=self.num_workers, collate_fn=lambda x: x[0])
+
+             for b in dataloader:
+                 yield b
+
+             del data, batches, dataset, dataloader
+
+ class MolTreeFolder(object):
+
+     def __init__(self, data_folder, vocab, batch_size, num_workers=4, shuffle=True, assm=True, replicate=None):
+         self.data_folder = data_folder
+         self.data_files = [fn for fn in os.listdir(data_folder)]
+         self.batch_size = batch_size
+         self.vocab = vocab
+         self.num_workers = num_workers
+         self.shuffle = shuffle
+         self.assm = assm
+
+         if replicate is not None:  # replicate is an int expansion factor
+             self.data_files = self.data_files * replicate
+
+     def __iter__(self):
+         for fn in self.data_files:
+             fn = os.path.join(self.data_folder, fn)
+             with open(fn, 'rb') as f:
+                 data = pickle.load(f)
+
+             if self.shuffle:
+                 random.shuffle(data)  # shuffle data before batching
+
+             batches = [data[i:i + self.batch_size] for i in range(0, len(data), self.batch_size)]
+             if len(batches[-1]) < self.batch_size:
+                 batches.pop()
+
+             dataset = MolTreeDataset(batches, self.vocab, self.assm)
+             dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=self.num_workers, collate_fn=lambda x: x[0])
+
+             for b in dataloader:
+                 yield b
+
+             del data, batches, dataset, dataloader
+
+ class PairTreeDataset(Dataset):
+
+     def __init__(self, data, vocab, y_assm):
+         self.data = data
+         self.vocab = vocab
+         self.y_assm = y_assm
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         batch0, batch1 = zip(*self.data[idx])
+         return tensorize(batch0, self.vocab, assm=False), tensorize(batch1, self.vocab, assm=self.y_assm)
+
+ class MolTreeDataset(Dataset):
+
+     def __init__(self, data, vocab, assm=True):
+         self.data = data
+         self.vocab = vocab
+         self.assm = assm
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         return tensorize(self.data[idx], self.vocab, assm=self.assm)
+
+ def tensorize(tree_batch, vocab, assm=True):
+     set_batch_nodeID(tree_batch, vocab)
+     smiles_batch = [tree.smiles for tree in tree_batch]
+     jtenc_holder, mess_dict = JTNNEncoder.tensorize(tree_batch)
+     mpn_holder = MPN.tensorize(smiles_batch)
+
+     if assm is False:
+         return tree_batch, jtenc_holder, mpn_holder
+
+     cands = []
+     batch_idx = []
+     for i, mol_tree in enumerate(tree_batch):
+         for node in mol_tree.nodes:
+             # A leaf node's attachment is determined by the neighboring node's attachment
+             if node.is_leaf or len(node.cands) == 1: continue
+             cands.extend([(cand, mol_tree.nodes, node) for cand in node.cands])
+             batch_idx.extend([i] * len(node.cands))
+
+     jtmpn_holder = JTMPN.tensorize(cands, mess_dict)
+     batch_idx = torch.LongTensor(batch_idx)
+
+     return tree_batch, jtenc_holder, mpn_holder, (jtmpn_holder, batch_idx)
+
+ def set_batch_nodeID(mol_batch, vocab):
+     tot = 0
+     for mol_tree in mol_batch:
+         for node in mol_tree.nodes:
+             node.idx = tot
+             node.wid = vocab.get_index(node.smiles)
+             tot += 1
+
+ class PropMolTreeDataset(Dataset):
+
+     def __init__(self, data, vocab, assm=True):
+         self.data = data
+         self.vocab = vocab
+         self.assm = assm
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         return tensorize_prop(self.data[idx], self.vocab, assm=self.assm)
+
+ class PropMolTreeFolder(object):
+
+     def __init__(self, data_folder, vocab, batch_size, num_workers=4, shuffle=True, assm=True, replicate=None):
+         self.data_folder = data_folder
+         self.data_files = [fn for fn in os.listdir(data_folder)]
+         self.batch_size = batch_size
+         self.vocab = vocab
+         self.num_workers = num_workers
+         self.shuffle = shuffle
+         self.assm = assm
+
+         if replicate is not None:  # replicate is an int expansion factor
+             self.data_files = self.data_files * replicate
+
+     def __iter__(self):
+         for fn in self.data_files:
+             fn = os.path.join(self.data_folder, fn)
+             with open(fn, 'rb') as f:
+                 data = pickle.load(f)
+
+             if self.shuffle:
+                 random.shuffle(data)  # shuffle data before batching
+
+             batches = [data[i:i + self.batch_size] for i in range(0, len(data), self.batch_size)]
+             if len(batches[-1]) < self.batch_size:
+                 batches.pop()
+
+             dataset = PropMolTreeDataset(batches, self.vocab, self.assm)
+             dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=self.num_workers, collate_fn=lambda x: x[0])
+
+             for b in dataloader:
+                 yield b
+
+             del data, batches, dataset, dataloader
+
+ def tensorize_prop(data, vocab, assm=True):
+     tree_batch, prop = list(zip(*data))
+     set_batch_nodeID(tree_batch, vocab)
+     smiles_batch = [tree.smiles for tree in tree_batch]
+     jtenc_holder, mess_dict = JTNNEncoder.tensorize(tree_batch)
+     mpn_holder = MPN.tensorize(smiles_batch)
+
+     if assm is False:
+         return tree_batch, jtenc_holder, mpn_holder
+
+     cands = []
+     batch_idx = []
+     for i, mol_tree in enumerate(tree_batch):
+         for node in mol_tree.nodes:
+             # A leaf node's attachment is determined by the neighboring node's attachment
+             if node.is_leaf or len(node.cands) == 1: continue
+             cands.extend([(cand, mol_tree.nodes, node) for cand in node.cands])
+             batch_idx.extend([i] * len(node.cands))
+
+     jtmpn_holder = JTMPN.tensorize(cands, mess_dict)
+     batch_idx = torch.LongTensor(batch_idx)
+
+     return tree_batch, jtenc_holder, mpn_holder, (jtmpn_holder, batch_idx), prop
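For orientation, a hedged sketch of the intended training-side iteration; the folder name is a placeholder, and each pickle in it must hold a list of preprocessed `MolTree` objects, which is what `__iter__` loads:

```python
# Sketch: stream tensorized batches for training (folder name hypothetical).
from mol_tree import Vocab
from datautils import MolTreeFolder

vocab = Vocab([x.strip("\r\n ") for x in open('vocab.txt')])
loader = MolTreeFolder('train-processed/', vocab, batch_size=32)
for tree_batch, jtenc_holder, mpn_holder, (jtmpn_holder, batch_idx) in loader:
    pass  # each item has the shape returned by tensorize() with assm=True
```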
fast_jtnn/jtmpn.py ADDED
@@ -0,0 +1,138 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from nnutils import create_var, index_select_ND
+ from chemutils import get_mol
+ import rdkit.Chem as Chem
+
+ ELEM_LIST = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'Al', 'I', 'B', 'K', 'Se', 'Zn', 'H', 'Cu', 'Mn', 'unknown']
+
+ ATOM_FDIM = len(ELEM_LIST) + 6 + 5 + 1
+ BOND_FDIM = 5
+ MAX_NB = 15
+
+ def onek_encoding_unk(x, allowable_set):
+     if x not in allowable_set:
+         x = allowable_set[-1]
+     return list(map(lambda s: x == s, allowable_set))
+
+ def atom_features(atom):
+     return torch.Tensor(onek_encoding_unk(atom.GetSymbol(), ELEM_LIST)
+                         + onek_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4, 5])
+                         + onek_encoding_unk(atom.GetFormalCharge(), [-1, -2, 1, 2, 0])
+                         + [atom.GetIsAromatic()])
+
+ def bond_features(bond):
+     bt = bond.GetBondType()
+     return torch.Tensor([bt == Chem.rdchem.BondType.SINGLE, bt == Chem.rdchem.BondType.DOUBLE, bt == Chem.rdchem.BondType.TRIPLE, bt == Chem.rdchem.BondType.AROMATIC, bond.IsInRing()])
+
+ class JTMPN(nn.Module):
+
+     def __init__(self, hidden_size, depth):
+         super(JTMPN, self).__init__()
+         self.hidden_size = hidden_size
+         self.depth = depth
+
+         self.W_i = nn.Linear(ATOM_FDIM + BOND_FDIM, hidden_size, bias=False)
+         self.W_h = nn.Linear(hidden_size, hidden_size, bias=False)
+         self.W_o = nn.Linear(ATOM_FDIM + hidden_size, hidden_size)
+
+     def forward(self, fatoms, fbonds, agraph, bgraph, scope, tree_message):  # tree_message[0] == vec(0)
+         fatoms = create_var(fatoms)
+         fbonds = create_var(fbonds)
+         agraph = create_var(agraph)
+         bgraph = create_var(bgraph)
+
+         binput = self.W_i(fbonds)
+         graph_message = F.relu(binput)
+
+         for i in range(self.depth - 1):
+             message = torch.cat([tree_message, graph_message], dim=0)
+             nei_message = index_select_ND(message, 0, bgraph)
+             nei_message = nei_message.sum(dim=1)  # assuming tree_message[0] == vec(0)
+             nei_message = self.W_h(nei_message)
+             graph_message = F.relu(binput + nei_message)
+
+         message = torch.cat([tree_message, graph_message], dim=0)
+         nei_message = index_select_ND(message, 0, agraph)
+         nei_message = nei_message.sum(dim=1)
+         ainput = torch.cat([fatoms, nei_message], dim=1)
+         atom_hiddens = F.relu(self.W_o(ainput))
+
+         mol_vecs = []
+         for st, le in scope:
+             mol_vec = atom_hiddens.narrow(0, st, le).sum(dim=0) / le
+             mol_vecs.append(mol_vec)
+
+         mol_vecs = torch.stack(mol_vecs, dim=0)
+         return mol_vecs
+
+     @staticmethod
+     def tensorize(cand_batch, mess_dict):
+         fatoms, fbonds = [], []
+         in_bonds, all_bonds = [], []
+         total_atoms = 0
+         total_mess = len(mess_dict) + 1  # must include vec(0) padding
+         scope = []
+
+         for smiles, all_nodes, ctr_node in cand_batch:
+             mol = Chem.MolFromSmiles(smiles)
+             Chem.Kekulize(mol)  # The original jtnn version kekulizes. Need to revisit why it is necessary.
+             n_atoms = mol.GetNumAtoms()
+             ctr_bid = ctr_node.idx
+
+             for atom in mol.GetAtoms():
+                 fatoms.append(atom_features(atom))
+                 in_bonds.append([])
+
+             for bond in mol.GetBonds():
+                 a1 = bond.GetBeginAtom()
+                 a2 = bond.GetEndAtom()
+                 x = a1.GetIdx() + total_atoms
+                 y = a2.GetIdx() + total_atoms
+                 # Here x_nid,y_nid could be 0
+                 x_nid, y_nid = a1.GetAtomMapNum(), a2.GetAtomMapNum()
+                 x_bid = all_nodes[x_nid - 1].idx if x_nid > 0 else -1
+                 y_bid = all_nodes[y_nid - 1].idx if y_nid > 0 else -1
+
+                 bfeature = bond_features(bond)
+
+                 b = total_mess + len(all_bonds)  # bond idx offset by total_mess
+                 all_bonds.append((x, y))
+                 fbonds.append(torch.cat([fatoms[x], bfeature], 0))
+                 in_bonds[y].append(b)
+
+                 b = total_mess + len(all_bonds)
+                 all_bonds.append((y, x))
+                 fbonds.append(torch.cat([fatoms[y], bfeature], 0))
+                 in_bonds[x].append(b)
+
+                 if x_bid >= 0 and y_bid >= 0 and x_bid != y_bid:
+                     if (x_bid, y_bid) in mess_dict:
+                         mess_idx = mess_dict[(x_bid, y_bid)]
+                         in_bonds[y].append(mess_idx)
+                     if (y_bid, x_bid) in mess_dict:
+                         mess_idx = mess_dict[(y_bid, x_bid)]
+                         in_bonds[x].append(mess_idx)
+
+             scope.append((total_atoms, n_atoms))
+             total_atoms += n_atoms
+
+         total_bonds = len(all_bonds)
+         fatoms = torch.stack(fatoms, 0)
+         fbonds = torch.stack(fbonds, 0)
+         agraph = torch.zeros(total_atoms, MAX_NB).long()
+         bgraph = torch.zeros(total_bonds, MAX_NB).long()
+
+         for a in range(total_atoms):
+             for i, b in enumerate(in_bonds[a]):
+                 agraph[a, i] = b
+
+         for b1 in range(total_bonds):
+             x, y = all_bonds[b1]
+             for i, b2 in enumerate(in_bonds[x]):  # b2 is offset by total_mess
+                 if b2 < total_mess or all_bonds[b2 - total_mess][0] != y:
+                     bgraph[b1, i] = b2
+
+         return (fatoms, fbonds, agraph, bgraph, scope)
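The fixed feature widths can be checked directly; a quick sanity sketch:

```python
# Sketch: JTMPN featurization uses fixed-width one-hot blocks.
import rdkit.Chem as Chem
from jtmpn import atom_features, bond_features, ATOM_FDIM, BOND_FDIM

mol = Chem.MolFromSmiles('CCO')
# 23 element symbols + 6 degrees + 5 formal charges + 1 aromatic flag = 35
assert atom_features(mol.GetAtomWithIdx(0)).numel() == ATOM_FDIM
# 4 bond types + 1 ring flag = 5
assert bond_features(mol.GetBondWithIdx(0)).numel() == BOND_FDIM
```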
fast_jtnn/jtnn_dec.py ADDED
@@ -0,0 +1,347 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from mol_tree import Vocab, MolTree, MolTreeNode
+ from nnutils import create_var, GRU
+ from chemutils import enum_assemble, set_atommap
+ import copy
+
+ MAX_NB = 15
+ MAX_DECODE_LEN = 100
+
+ class JTNNDecoder(nn.Module):
+
+     def __init__(self, vocab, hidden_size, latent_size, embedding):
+         super(JTNNDecoder, self).__init__()
+         self.hidden_size = hidden_size
+         self.vocab_size = vocab.size()
+         self.vocab = vocab
+         self.embedding = embedding
+
+         # GRU weights
+         self.W_z = nn.Linear(2 * hidden_size, hidden_size)
+         self.U_r = nn.Linear(hidden_size, hidden_size, bias=False)
+         self.W_r = nn.Linear(hidden_size, hidden_size)
+         self.W_h = nn.Linear(2 * hidden_size, hidden_size)
+
+         # Word prediction weights
+         self.W = nn.Linear(hidden_size + latent_size, hidden_size)
+
+         # Stop prediction weights
+         self.U = nn.Linear(hidden_size + latent_size, hidden_size)
+         self.U_i = nn.Linear(2 * hidden_size, hidden_size)
+
+         # Output weights
+         self.W_o = nn.Linear(hidden_size, self.vocab_size)
+         self.U_o = nn.Linear(hidden_size, 1)
+
+         # Loss functions
+         # self.pred_loss = nn.CrossEntropyLoss(size_average=False)
+         # self.stop_loss = nn.BCEWithLogitsLoss(size_average=False)
+         self.pred_loss = nn.CrossEntropyLoss(reduction='sum')
+         self.stop_loss = nn.BCEWithLogitsLoss(reduction='sum')
+
+     def aggregate(self, hiddens, contexts, x_tree_vecs, mode):
+         if mode == 'word':
+             V, V_o = self.W, self.W_o
+         elif mode == 'stop':
+             V, V_o = self.U, self.U_o
+         else:
+             raise ValueError('aggregate mode is wrong')
+
+         tree_contexts = x_tree_vecs.index_select(0, contexts)
+         input_vec = torch.cat([hiddens, tree_contexts], dim=-1)
+         output_vec = F.relu(V(input_vec))
+         return V_o(output_vec)
+
+     def forward(self, mol_batch, x_tree_vecs):
+         pred_hiddens, pred_contexts, pred_targets = [], [], []
+         stop_hiddens, stop_contexts, stop_targets = [], [], []
+         traces = []
+         for mol_tree in mol_batch:
+             s = []
+             dfs(s, mol_tree.nodes[0], -1)
+             traces.append(s)
+             for node in mol_tree.nodes:
+                 node.neighbors = []
+
+         # Predict root
+         batch_size = len(mol_batch)
+         pred_hiddens.append(create_var(torch.zeros(len(mol_batch), self.hidden_size)))
+         pred_targets.extend([mol_tree.nodes[0].wid for mol_tree in mol_batch])
+         pred_contexts.append(create_var(torch.LongTensor(range(batch_size))))
+
+         max_iter = max([len(tr) for tr in traces])
+         padding = create_var(torch.zeros(self.hidden_size), False)
+         h = {}
+
+         for t in range(max_iter):
+             prop_list = []
+             batch_list = []
+             for i, plist in enumerate(traces):
+                 if t < len(plist):
+                     prop_list.append(plist[t])
+                     batch_list.append(i)
+
+             cur_x = []
+             cur_h_nei, cur_o_nei = [], []
+
+             for node_x, real_y, _ in prop_list:
+                 # Neighbors for message passing (target not included)
+                 cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors if node_y.idx != real_y.idx]
+                 pad_len = MAX_NB - len(cur_nei)
+                 cur_h_nei.extend(cur_nei)
+                 cur_h_nei.extend([padding] * pad_len)
+
+                 # Neighbors for stop prediction (all neighbors)
+                 cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors]
+                 pad_len = MAX_NB - len(cur_nei)
+                 cur_o_nei.extend(cur_nei)
+                 cur_o_nei.extend([padding] * pad_len)
+
+                 # Current clique embedding
+                 cur_x.append(node_x.wid)
+
+             # Clique embedding
+             cur_x = create_var(torch.LongTensor(cur_x))
+             cur_x = self.embedding(cur_x)
+
+             # Message passing
+             cur_h_nei = torch.stack(cur_h_nei, dim=0).view(-1, MAX_NB, self.hidden_size)
+             new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r, self.W_h)
+
+             # Node aggregate
+             cur_o_nei = torch.stack(cur_o_nei, dim=0).view(-1, MAX_NB, self.hidden_size)
+             cur_o = cur_o_nei.sum(dim=1)
+
+             # Gather targets
+             pred_target, pred_list = [], []
+             stop_target = []
+             for i, m in enumerate(prop_list):
+                 node_x, node_y, direction = m
+                 x, y = node_x.idx, node_y.idx
+                 h[(x, y)] = new_h[i]
+                 node_y.neighbors.append(node_x)
+                 if direction == 1:
+                     pred_target.append(node_y.wid)
+                     pred_list.append(i)
+                 stop_target.append(direction)
+
+             # Hidden states for stop prediction
+             cur_batch = create_var(torch.LongTensor(batch_list))
+             stop_hidden = torch.cat([cur_x, cur_o], dim=1)
+             stop_hiddens.append(stop_hidden)
+             stop_contexts.append(cur_batch)
+             stop_targets.extend(stop_target)
+
+             # Hidden states for clique prediction
+             if len(pred_list) > 0:
+                 batch_list = [batch_list[i] for i in pred_list]
+                 cur_batch = create_var(torch.LongTensor(batch_list))
+                 pred_contexts.append(cur_batch)
+
+                 cur_pred = create_var(torch.LongTensor(pred_list))
+                 pred_hiddens.append(new_h.index_select(0, cur_pred))
+                 pred_targets.extend(pred_target)
+
+         # Last stop at root
+         cur_x, cur_o_nei = [], []
+         for mol_tree in mol_batch:
+             node_x = mol_tree.nodes[0]
+             cur_x.append(node_x.wid)
+             cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors]
+             pad_len = MAX_NB - len(cur_nei)
+             cur_o_nei.extend(cur_nei)
+             cur_o_nei.extend([padding] * pad_len)
+
+         cur_x = create_var(torch.LongTensor(cur_x))
+         cur_x = self.embedding(cur_x)
+         cur_o_nei = torch.stack(cur_o_nei, dim=0).view(-1, MAX_NB, self.hidden_size)
+         cur_o = cur_o_nei.sum(dim=1)
+
+         stop_hidden = torch.cat([cur_x, cur_o], dim=1)
+         stop_hiddens.append(stop_hidden)
+         stop_contexts.append(create_var(torch.LongTensor(range(batch_size))))
+         stop_targets.extend([0] * len(mol_batch))
+
+         # Predict next clique
+         pred_contexts = torch.cat(pred_contexts, dim=0)
+         pred_hiddens = torch.cat(pred_hiddens, dim=0)
+         pred_scores = self.aggregate(pred_hiddens, pred_contexts, x_tree_vecs, 'word')
+         pred_targets = create_var(torch.LongTensor(pred_targets))
+
+         pred_loss = self.pred_loss(pred_scores, pred_targets) / len(mol_batch)
+         _, preds = torch.max(pred_scores, dim=1)
+         pred_acc = torch.eq(preds, pred_targets).float()
+         pred_acc = torch.sum(pred_acc) / pred_targets.nelement()
+
+         # Predict stop
+         stop_contexts = torch.cat(stop_contexts, dim=0)
+         stop_hiddens = torch.cat(stop_hiddens, dim=0)
+         stop_hiddens = F.relu(self.U_i(stop_hiddens))
+         stop_scores = self.aggregate(stop_hiddens, stop_contexts, x_tree_vecs, 'stop')
+         stop_scores = stop_scores.squeeze(-1)
+         stop_targets = create_var(torch.Tensor(stop_targets))
+
+         stop_loss = self.stop_loss(stop_scores, stop_targets) / len(mol_batch)
+         stops = torch.ge(stop_scores, 0).float()
+         stop_acc = torch.eq(stops, stop_targets).float()
+         stop_acc = torch.sum(stop_acc) / stop_targets.nelement()
+
+         return pred_loss, stop_loss, pred_acc.item(), stop_acc.item()
+
+     def decode(self, x_tree_vecs, prob_decode):
+         assert x_tree_vecs.size(0) == 1
+
+         stack = []
+         init_hiddens = create_var(torch.zeros(1, self.hidden_size))
+         zero_pad = create_var(torch.zeros(1, 1, self.hidden_size))
+         contexts = create_var(torch.LongTensor(1).zero_())
+
+         # Root prediction
+         root_score = self.aggregate(init_hiddens, contexts, x_tree_vecs, 'word')
+         _, root_wid = torch.max(root_score, dim=1)
+         root_wid = root_wid.item()
+
+         root = MolTreeNode(self.vocab.get_smiles(root_wid))
+         root.wid = root_wid
+         root.idx = 0
+         stack.append((root, self.vocab.get_slots(root.wid)))
+
+         all_nodes = [root]
+         h = {}
+         for step in range(MAX_DECODE_LEN):
+             node_x, fa_slot = stack[-1]
+             cur_h_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors]
+             if len(cur_h_nei) > 0:
+                 cur_h_nei = torch.stack(cur_h_nei, dim=0).view(1, -1, self.hidden_size)
+             else:
+                 cur_h_nei = zero_pad
+
+             cur_x = create_var(torch.LongTensor([node_x.wid]))
+             cur_x = self.embedding(cur_x)
+
+             # Predict stop
+             cur_h = cur_h_nei.sum(dim=1)
+             stop_hiddens = torch.cat([cur_x, cur_h], dim=1)
+             stop_hiddens = F.relu(self.U_i(stop_hiddens))
+             stop_score = self.aggregate(stop_hiddens, contexts, x_tree_vecs, 'stop')
+
+             if prob_decode:
+                 backtrack = (torch.bernoulli(torch.sigmoid(stop_score)).item() == 0)
+             else:
+                 backtrack = (stop_score.item() < 0)
+
+             if not backtrack:  # Forward: predict next clique
+                 new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r, self.W_h)
+                 pred_score = self.aggregate(new_h, contexts, x_tree_vecs, 'word')
+
+                 if prob_decode:
+                     sort_wid = torch.multinomial(F.softmax(pred_score, dim=1).squeeze(), 5)
+                 else:
+                     _, sort_wid = torch.sort(pred_score, dim=1, descending=True)
+                     sort_wid = sort_wid.data.squeeze()
+
+                 next_wid = None
+                 for wid in sort_wid[:5]:
+                     slots = self.vocab.get_slots(wid)
+                     node_y = MolTreeNode(self.vocab.get_smiles(wid))
+                     if have_slots(fa_slot, slots) and can_assemble(node_x, node_y):
+                         next_wid = wid
+                         next_slots = slots
+                         break
+
+                 if next_wid is None:
+                     backtrack = True  # No more children can be added
+                 else:
+                     node_y = MolTreeNode(self.vocab.get_smiles(next_wid))
+                     node_y.wid = next_wid
+                     node_y.idx = len(all_nodes)
+                     node_y.neighbors.append(node_x)
+                     h[(node_x.idx, node_y.idx)] = new_h[0]
+                     stack.append((node_y, next_slots))
+                     all_nodes.append(node_y)
+
+             if backtrack:  # Backtrack; use if instead of else
+                 if len(stack) == 1:
+                     break  # At root, terminate
+
+                 node_fa, _ = stack[-2]
+                 cur_h_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors if node_y.idx != node_fa.idx]
+                 if len(cur_h_nei) > 0:
+                     cur_h_nei = torch.stack(cur_h_nei, dim=0).view(1, -1, self.hidden_size)
+                 else:
+                     cur_h_nei = zero_pad
+
+                 new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r, self.W_h)
+                 h[(node_x.idx, node_fa.idx)] = new_h[0]
+                 node_fa.neighbors.append(node_x)
+                 stack.pop()
+
+         return root, all_nodes
+
+ """
+ Helper Functions:
+ """
+ def dfs(stack, x, fa_idx):
+     for y in x.neighbors:
+         if y.idx == fa_idx: continue
+         stack.append((x, y, 1))
+         dfs(stack, y, x.idx)
+         stack.append((y, x, 0))
+
+ def have_slots(fa_slots, ch_slots):
+     if len(fa_slots) > 2 and len(ch_slots) > 2:
+         return True
+     matches = []
+     for i, s1 in enumerate(fa_slots):
+         a1, c1, h1 = s1
+         for j, s2 in enumerate(ch_slots):
+             a2, c2, h2 = s2
+             if a1 == a2 and c1 == c2 and (a1 != "C" or h1 + h2 >= 4):
+                 matches.append((i, j))
+
+     if len(matches) == 0: return False
+
+     fa_match, ch_match = zip(*matches)
+     if len(set(fa_match)) == 1 and 1 < len(fa_slots) <= 2:  # never remove atom from ring
+         fa_slots.pop(fa_match[0])
+     if len(set(ch_match)) == 1 and 1 < len(ch_slots) <= 2:  # never remove atom from ring
+         ch_slots.pop(ch_match[0])
+
+     return True
+
+ def can_assemble(node_x, node_y):
+     node_x.nid = 1
+     node_x.is_leaf = False
+     set_atommap(node_x.mol, node_x.nid)
+
+     neis = node_x.neighbors + [node_y]
+     for i, nei in enumerate(neis):
+         nei.nid = i + 2
+         nei.is_leaf = (len(nei.neighbors) <= 1)
+         if nei.is_leaf:
+             set_atommap(nei.mol, 0)
+         else:
+             set_atommap(nei.mol, nei.nid)
+
+     neighbors = [nei for nei in neis if nei.mol.GetNumAtoms() > 1]
+     neighbors = sorted(neighbors, key=lambda x: x.mol.GetNumAtoms(), reverse=True)
+     singletons = [nei for nei in neis if nei.mol.GetNumAtoms() == 1]
+     neighbors = singletons + neighbors
+     cands, aroma_scores = enum_assemble(node_x, neighbors)
+     return len(cands) > 0  # and sum(aroma_scores) >= 0
+
+ if __name__ == "__main__":
+     smiles = ["O=C1[C@@H]2C=C[C@@H](C=CC2)C1(c1ccccc1)c1ccccc1", "O=C([O-])CC[C@@]12CCCC[C@]1(O)OC(=O)CC2", "ON=C1C[C@H]2CC3(C[C@@H](C1)c1ccccc12)OCCO3", "C[C@H]1CC(=O)[C@H]2[C@@]3(O)C(=O)c4cccc(O)c4[C@@H]4O[C@@]43[C@@H](O)C[C@]2(O)C1", 'Cc1cc(NC(=O)CSc2nnc3c4ccccc4n(C)c3n2)ccc1Br', 'CC(C)(C)c1ccc(C(=O)N[C@H]2CCN3CCCc4cccc2c43)cc1', "O=c1c2ccc3c(=O)n(-c4nccs4)c(=O)c4ccc(c(=O)n1-c1nccs1)c2c34", "O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1"]
+     for s in smiles:
+         print(s)
+         tree = MolTree(s)
+         for i, node in enumerate(tree.nodes):
+             node.idx = i
+
+         stack = []
+         dfs(stack, tree.nodes[0], -1)
+         for x, y, d in stack:
+             print(x.smiles, y.smiles, d)
+         print('------------------------------')
fast_jtnn/jtnn_enc.py ADDED
@@ -0,0 +1,131 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from collections import deque
+ from mol_tree import Vocab, MolTree
+ from nnutils import create_var, index_select_ND
+
+ class JTNNEncoder(nn.Module):
+
+     def __init__(self, hidden_size, depth, embedding):
+         super(JTNNEncoder, self).__init__()
+         self.hidden_size = hidden_size
+         self.depth = depth
+
+         self.embedding = embedding
+         self.outputNN = nn.Sequential(
+             nn.Linear(2 * hidden_size, hidden_size),
+             nn.ReLU()
+         )
+         self.GRU = GraphGRU(hidden_size, hidden_size, depth=depth)
+
+     def forward(self, fnode, fmess, node_graph, mess_graph, scope):
+         fnode = create_var(fnode)
+         fmess = create_var(fmess)
+         node_graph = create_var(node_graph)
+         mess_graph = create_var(mess_graph)
+         messages = create_var(torch.zeros(mess_graph.size(0), self.hidden_size))
+
+         fnode = self.embedding(fnode)
+         fmess = index_select_ND(fnode, 0, fmess)
+         messages = self.GRU(messages, fmess, mess_graph)
+
+         mess_nei = index_select_ND(messages, 0, node_graph)
+         node_vecs = torch.cat([fnode, mess_nei.sum(dim=1)], dim=-1)
+         node_vecs = self.outputNN(node_vecs)
+
+         max_len = max([x for _, x in scope])
+         batch_vecs = []
+         for st, le in scope:
+             cur_vecs = node_vecs[st]  # Root is the first node
+             batch_vecs.append(cur_vecs)
+
+         tree_vecs = torch.stack(batch_vecs, dim=0)
+         return tree_vecs, messages
+
+     @staticmethod
+     def tensorize(tree_batch):
+         node_batch = []
+         scope = []
+         for tree in tree_batch:
+             scope.append((len(node_batch), len(tree.nodes)))
+             node_batch.extend(tree.nodes)
+
+         return JTNNEncoder.tensorize_nodes(node_batch, scope)
+
+     @staticmethod
+     def tensorize_nodes(node_batch, scope):
+         messages, mess_dict = [None], {}
+         fnode = []
+         for x in node_batch:
+             fnode.append(x.wid)
+             for y in x.neighbors:
+                 mess_dict[(x.idx, y.idx)] = len(messages)
+                 messages.append((x, y))
+
+         node_graph = [[] for i in range(len(node_batch))]
+         mess_graph = [[] for i in range(len(messages))]
+         fmess = [0] * len(messages)
+
+         for x, y in messages[1:]:
+             mid1 = mess_dict[(x.idx, y.idx)]
+             fmess[mid1] = x.idx
+             node_graph[y.idx].append(mid1)
+             for z in y.neighbors:
+                 if z.idx == x.idx: continue
+                 mid2 = mess_dict[(y.idx, z.idx)]
+                 mess_graph[mid2].append(mid1)
+
+         max_len = max([len(t) for t in node_graph] + [1])
+         for t in node_graph:
+             pad_len = max_len - len(t)
+             t.extend([0] * pad_len)
+
+         max_len = max([len(t) for t in mess_graph] + [1])
+         for t in mess_graph:
+             pad_len = max_len - len(t)
+             t.extend([0] * pad_len)
+
+         mess_graph = torch.LongTensor(mess_graph)
+         node_graph = torch.LongTensor(node_graph)
+         fmess = torch.LongTensor(fmess)
+         fnode = torch.LongTensor(fnode)
+         return (fnode, fmess, node_graph, mess_graph, scope), mess_dict
+
+ class GraphGRU(nn.Module):
+
+     def __init__(self, input_size, hidden_size, depth):
+         super(GraphGRU, self).__init__()
+         self.hidden_size = hidden_size
+         self.input_size = input_size
+         self.depth = depth
+
+         self.W_z = nn.Linear(input_size + hidden_size, hidden_size)
+         self.W_r = nn.Linear(input_size, hidden_size, bias=False)
+         self.U_r = nn.Linear(hidden_size, hidden_size)
+         self.W_h = nn.Linear(input_size + hidden_size, hidden_size)
+
+     def forward(self, h, x, mess_graph):
+         mask = torch.ones(h.size(0), 1)
+         mask[0] = 0  # first vector is padding
+         mask = create_var(mask)
+         for it in range(self.depth):
+             h_nei = index_select_ND(h, 0, mess_graph)
+             sum_h = h_nei.sum(dim=1)
+             z_input = torch.cat([x, sum_h], dim=1)
+             z = F.sigmoid(self.W_z(z_input))
+
+             r_1 = self.W_r(x).view(-1, 1, self.hidden_size)
+             r_2 = self.U_r(h_nei)
+             r = F.sigmoid(r_1 + r_2)
+
+             gated_h = r * h_nei
+             sum_gated_h = gated_h.sum(dim=1)
+             h_input = torch.cat([x, sum_gated_h], dim=1)
+             pre_h = F.tanh(self.W_h(h_input))
+             h = (1.0 - z) * sum_h + z * pre_h
+             h = h * mask
+
+         return h
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from mol_tree import Vocab, MolTree
+ from nnutils import create_var, flatten_tensor, avg_pool
+ from jtnn_enc import JTNNEncoder
+ from jtnn_dec import JTNNDecoder
+ from mpn import MPN
+ from jtmpn import JTMPN
+ from datautils import tensorize
+
+ from chemutils import enum_assemble, set_atommap, copy_edit_mol, attach_mols
+ import rdkit
+ import rdkit.Chem as Chem
+ import copy, math
+
+ class JTNNVAE(nn.Module):
+
+     def __init__(self, vocab, hidden_size, latent_size, depthT, depthG):
+         super(JTNNVAE, self).__init__()
+         self.vocab = vocab
+         self.hidden_size = hidden_size
+         self.latent_size = latent_size = int(latent_size / 2)  #tree and mol each take half of the latent vector
+
+         self.jtnn = JTNNEncoder(hidden_size, depthT, nn.Embedding(vocab.size(), hidden_size))
+         self.decoder = JTNNDecoder(vocab, hidden_size, latent_size, nn.Embedding(vocab.size(), hidden_size))
+
+         self.jtmpn = JTMPN(hidden_size, depthG)
+         self.mpn = MPN(hidden_size, depthG)
+
+         self.A_assm = nn.Linear(latent_size, hidden_size, bias=False)
+         # self.assm_loss = nn.CrossEntropyLoss(size_average=False)
+         self.assm_loss = nn.CrossEntropyLoss(reduction='sum')
+
+         self.T_mean = nn.Linear(hidden_size, latent_size)
+         self.T_var = nn.Linear(hidden_size, latent_size)
+         self.G_mean = nn.Linear(hidden_size, latent_size)
+         self.G_var = nn.Linear(hidden_size, latent_size)
+
+     def encode(self, jtenc_holder, mpn_holder):
+         tree_vecs, tree_mess = self.jtnn(*jtenc_holder)
+         mol_vecs = self.mpn(*mpn_holder)
+         return tree_vecs, tree_mess, mol_vecs
+
+     def encode_from_smiles(self, smiles_list):
+         tree_batch = [MolTree(s) for s in smiles_list]
+         _, jtenc_holder, mpn_holder = tensorize(tree_batch, self.vocab, assm=False)
+         tree_vecs, _, mol_vecs = self.encode(jtenc_holder, mpn_holder)
+         return torch.cat([tree_vecs, mol_vecs], dim=-1)
+
+     def encode_latent(self, jtenc_holder, mpn_holder):
+         tree_vecs, _ = self.jtnn(*jtenc_holder)
+         mol_vecs = self.mpn(*mpn_holder)
+         tree_mean = self.T_mean(tree_vecs)
+         mol_mean = self.G_mean(mol_vecs)
+         tree_var = -torch.abs(self.T_var(tree_vecs))
+         mol_var = -torch.abs(self.G_var(mol_vecs))
+         return torch.cat([tree_mean, mol_mean], dim=1), torch.cat([tree_var, mol_var], dim=1)
+
+     def rsample(self, z_vecs, W_mean, W_var):
+         batch_size = z_vecs.size(0)
+         z_mean = W_mean(z_vecs)
+         z_log_var = -torch.abs(W_var(z_vecs))  #Following Mueller et al.
+         kl_loss = -0.5 * torch.sum(1.0 + z_log_var - z_mean * z_mean - torch.exp(z_log_var)) / batch_size
+         epsilon = create_var(torch.randn_like(z_mean))
+         z_vecs = z_mean + torch.exp(z_log_var / 2) * epsilon
+         return z_vecs, kl_loss
+
+     def sample_prior(self, prob_decode=False):
+         z_tree = torch.randn(1, self.latent_size).cuda()
+         z_mol = torch.randn(1, self.latent_size).cuda()
+         return self.decode(z_tree, z_mol, prob_decode)
+
+     def forward(self, x_batch, beta):
+         x_batch, x_jtenc_holder, x_mpn_holder, x_jtmpn_holder = x_batch
+         x_tree_vecs, x_tree_mess, x_mol_vecs = self.encode(x_jtenc_holder, x_mpn_holder)
+         z_tree_vecs, tree_kl = self.rsample(x_tree_vecs, self.T_mean, self.T_var)
+         z_mol_vecs, mol_kl = self.rsample(x_mol_vecs, self.G_mean, self.G_var)
+
+         kl_div = tree_kl + mol_kl
+         word_loss, topo_loss, word_acc, topo_acc = self.decoder(x_batch, z_tree_vecs)
+         assm_loss, assm_acc = self.assm(x_batch, x_jtmpn_holder, z_mol_vecs, x_tree_mess)
+
+         return word_loss + topo_loss + assm_loss + beta * kl_div, kl_div.item(), word_acc, topo_acc, assm_acc
+
+     def assm(self, mol_batch, jtmpn_holder, x_mol_vecs, x_tree_mess):
+         jtmpn_holder, batch_idx = jtmpn_holder
+         fatoms, fbonds, agraph, bgraph, scope = jtmpn_holder
+         batch_idx = create_var(batch_idx)
+
+         cand_vecs = self.jtmpn(fatoms, fbonds, agraph, bgraph, scope, x_tree_mess)
+
+         x_mol_vecs = x_mol_vecs.index_select(0, batch_idx)
+         x_mol_vecs = self.A_assm(x_mol_vecs)  #bilinear
+         scores = torch.bmm(
+             x_mol_vecs.unsqueeze(1),
+             cand_vecs.unsqueeze(-1)
+         ).squeeze()
+
+         cnt, tot, acc = 0, 0, 0
+         all_loss = []
+         for i, mol_tree in enumerate(mol_batch):
+             comp_nodes = [node for node in mol_tree.nodes if len(node.cands) > 1 and not node.is_leaf]
+             cnt += len(comp_nodes)
+             for node in comp_nodes:
+                 label = node.cands.index(node.label)
+                 ncand = len(node.cands)
+                 cur_score = scores.narrow(0, tot, ncand)
+                 tot += ncand
+
+                 if cur_score.data[label] >= cur_score.max().item():
+                     acc += 1
+
+                 label = create_var(torch.LongTensor([label]))
+                 all_loss.append(self.assm_loss(cur_score.view(1, -1), label))
+
+         all_loss = sum(all_loss) / len(mol_batch)
+         return all_loss, acc * 1.0 / cnt
+
+     def decode(self, x_tree_vecs, x_mol_vecs, prob_decode):
+         #batch decoding is currently not supported
+         assert x_tree_vecs.size(0) == 1 and x_mol_vecs.size(0) == 1
+
+         pred_root, pred_nodes = self.decoder.decode(x_tree_vecs, prob_decode)
+         if len(pred_nodes) == 0: return None
+         elif len(pred_nodes) == 1: return pred_root.smiles
+
+         #Mark nid & is_leaf & atommap
+         for i, node in enumerate(pred_nodes):
+             node.nid = i + 1
+             node.is_leaf = (len(node.neighbors) == 1)
+             if len(node.neighbors) > 1:
+                 set_atommap(node.mol, node.nid)
+
+         scope = [(0, len(pred_nodes))]
+         jtenc_holder, mess_dict = JTNNEncoder.tensorize_nodes(pred_nodes, scope)
+         _, tree_mess = self.jtnn(*jtenc_holder)
+         tree_mess = (tree_mess, mess_dict)  #Important: tree_mess is a matrix, mess_dict is a python dict
+
+         x_mol_vecs = self.A_assm(x_mol_vecs).squeeze()  #bilinear
+
+         cur_mol = copy_edit_mol(pred_root.mol)
+         global_amap = [{}] + [{} for node in pred_nodes]
+         global_amap[1] = {atom.GetIdx(): atom.GetIdx() for atom in cur_mol.GetAtoms()}
+
+         cur_mol, _ = self.dfs_assemble(tree_mess, x_mol_vecs, pred_nodes, cur_mol, global_amap, [], pred_root, None, prob_decode, check_aroma=True)
+         if cur_mol is None:
+             cur_mol = copy_edit_mol(pred_root.mol)
+             global_amap = [{}] + [{} for node in pred_nodes]
+             global_amap[1] = {atom.GetIdx(): atom.GetIdx() for atom in cur_mol.GetAtoms()}
+             cur_mol, pre_mol = self.dfs_assemble(tree_mess, x_mol_vecs, pred_nodes, cur_mol, global_amap, [], pred_root, None, prob_decode, check_aroma=False)
+             if cur_mol is None: cur_mol = pre_mol
+
+         if cur_mol is None:
+             return None
+
+         cur_mol = cur_mol.GetMol()
+         set_atommap(cur_mol)
+         cur_mol = Chem.MolFromSmiles(Chem.MolToSmiles(cur_mol))
+         return Chem.MolToSmiles(cur_mol) if cur_mol is not None else None
+
+     def dfs_assemble(self, y_tree_mess, x_mol_vecs, all_nodes, cur_mol, global_amap, fa_amap, cur_node, fa_node, prob_decode, check_aroma):
+         fa_nid = fa_node.nid if fa_node is not None else -1
+         prev_nodes = [fa_node] if fa_node is not None else []
+
+         children = [nei for nei in cur_node.neighbors if nei.nid != fa_nid]
+         neighbors = [nei for nei in children if nei.mol.GetNumAtoms() > 1]
+         neighbors = sorted(neighbors, key=lambda x: x.mol.GetNumAtoms(), reverse=True)
+         singletons = [nei for nei in children if nei.mol.GetNumAtoms() == 1]
+         neighbors = singletons + neighbors
+
+         cur_amap = [(fa_nid, a2, a1) for nid, a1, a2 in fa_amap if nid == cur_node.nid]
+         cands, aroma_score = enum_assemble(cur_node, neighbors, prev_nodes, cur_amap)
+         if len(cands) == 0 or (sum(aroma_score) < 0 and check_aroma):
+             return None, cur_mol
+
+         cand_smiles, cand_amap = zip(*cands)
+         aroma_score = torch.Tensor(aroma_score).cuda()
+         cands = [(smiles, all_nodes, cur_node) for smiles in cand_smiles]
+
+         if len(cands) > 1:
+             jtmpn_holder = JTMPN.tensorize(cands, y_tree_mess[1])
+             fatoms, fbonds, agraph, bgraph, scope = jtmpn_holder
+             cand_vecs = self.jtmpn(fatoms, fbonds, agraph, bgraph, scope, y_tree_mess[0])
+             scores = torch.mv(cand_vecs, x_mol_vecs) + aroma_score
+         else:
+             scores = torch.Tensor([1.0])
+
+         if prob_decode:
+             probs = F.softmax(scores.view(1, -1), dim=1).squeeze() + 1e-7  #prevent prob = 0
+             cand_idx = torch.multinomial(probs, probs.numel())
+         else:
+             _, cand_idx = torch.sort(scores, descending=True)
+
+         backup_mol = Chem.RWMol(cur_mol)
+         pre_mol = cur_mol
+         for i in range(cand_idx.numel()):
+             cur_mol = Chem.RWMol(backup_mol)
+             pred_amap = cand_amap[cand_idx[i].item()]
+             new_global_amap = copy.deepcopy(global_amap)
+
+             for nei_id, ctr_atom, nei_atom in pred_amap:
+                 if nei_id == fa_nid:
+                     continue
+                 new_global_amap[nei_id][nei_atom] = new_global_amap[cur_node.nid][ctr_atom]
+
+             cur_mol = attach_mols(cur_mol, children, [], new_global_amap)  #father is already attached
+             new_mol = cur_mol.GetMol()
+             new_mol = Chem.MolFromSmiles(Chem.MolToSmiles(new_mol))
+
+             if new_mol is None: continue
+
+             has_error = False
+             for nei_node in children:
+                 if nei_node.is_leaf: continue
+                 tmp_mol, tmp_mol2 = self.dfs_assemble(y_tree_mess, x_mol_vecs, all_nodes, cur_mol, new_global_amap, pred_amap, nei_node, cur_node, prob_decode, check_aroma)
+                 if tmp_mol is None:
+                     has_error = True
+                     if i == 0: pre_mol = tmp_mol2
+                     break
+                 cur_mol = tmp_mol
+
+             if not has_error: return cur_mol, cur_mol
+
+         return None, pre_mol
+
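For context, a minimal sketch of loading this class and sampling from the prior. The vocabulary file is the one added in this commit; the checkpoint path is a placeholder, and hidden_size=450, latent_size=56, depthT=20, depthG=3 are the hyperparameters commonly used with the reference JT-VAE code, not values pinned by this diff. Note that sample_prior allocates CUDA tensors as written, so this sketch assumes a GPU:

import torch
from mol_tree import Vocab
from jtnn_vae import JTNNVAE

vocab = Vocab([line.strip() for line in open('vocab.txt')])
model = JTNNVAE(vocab, hidden_size=450, latent_size=56, depthT=20, depthG=3)
model.load_state_dict(torch.load('jtnn_vae.ckpt'))  # hypothetical checkpoint path
model = model.cuda().eval()

with torch.no_grad():
    print(model.sample_prior())  # one decoded SMILES string, or None on failure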
fast_jtnn/jtprop_vae.py ADDED
@@ -0,0 +1,311 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from mol_tree import Vocab, MolTree
+ from nnutils import create_var, flatten_tensor, avg_pool
+ from jtnn_enc import JTNNEncoder
+ from jtnn_dec import JTNNDecoder
+ from mpn import MPN
+ from jtmpn import JTMPN
+ from datautils import tensorize
+
+ from chemutils import enum_assemble, set_atommap, copy_edit_mol, attach_mols
+ import rdkit
+ import rdkit.Chem as Chem
+ from rdkit import DataStructs
+ from rdkit.Chem import AllChem
+ import copy, math
+
+ class JTPropVAE(nn.Module):
+
+     def __init__(self, vocab, hidden_size, latent_size, depthT, depthG):
+         super(JTPropVAE, self).__init__()
+         self.vocab = vocab
+         self.hidden_size = hidden_size
+         self.latent_size = latent_size = int(latent_size / 2)  #tree and mol each take half of the latent vector
+
+         self.jtnn = JTNNEncoder(hidden_size, depthT, nn.Embedding(vocab.size(), hidden_size))
+         self.decoder = JTNNDecoder(vocab, hidden_size, latent_size, nn.Embedding(vocab.size(), hidden_size))
+
+         self.jtmpn = JTMPN(hidden_size, depthG)
+         self.mpn = MPN(hidden_size, depthG)
+
+         self.A_assm = nn.Linear(latent_size, hidden_size, bias=False)
+         # self.assm_loss = nn.CrossEntropyLoss(size_average=False)
+         self.assm_loss = nn.CrossEntropyLoss(reduction='sum')
+
+         self.T_mean = nn.Linear(hidden_size, latent_size)
+         self.T_var = nn.Linear(hidden_size, latent_size)
+         self.G_mean = nn.Linear(hidden_size, latent_size)
+         self.G_var = nn.Linear(hidden_size, latent_size)
+
+         # Prop
+         self.propNN = nn.Sequential(
+             nn.Linear(self.latent_size * 2, self.hidden_size),
+             nn.Tanh(),
+             nn.Linear(self.hidden_size, 1)
+         )
+         self.prop_loss = nn.MSELoss()
+
+     def encode(self, jtenc_holder, mpn_holder):
+         tree_vecs, tree_mess = self.jtnn(*jtenc_holder)
+         mol_vecs = self.mpn(*mpn_holder)
+         return tree_vecs, tree_mess, mol_vecs
+
+     def encode_from_smiles(self, smiles_list):
+         tree_batch = [MolTree(s) for s in smiles_list]
+         _, jtenc_holder, mpn_holder = tensorize(tree_batch, self.vocab, assm=False)
+         tree_vecs, _, mol_vecs = self.encode(jtenc_holder, mpn_holder)
+         return torch.cat([tree_vecs, mol_vecs], dim=-1)
+
+     def encode_latent(self, jtenc_holder, mpn_holder):
+         tree_vecs, _ = self.jtnn(*jtenc_holder)
+         mol_vecs = self.mpn(*mpn_holder)
+         tree_mean = self.T_mean(tree_vecs)
+         mol_mean = self.G_mean(mol_vecs)
+         tree_var = -torch.abs(self.T_var(tree_vecs))
+         mol_var = -torch.abs(self.G_var(mol_vecs))
+         return torch.cat([tree_mean, mol_mean], dim=1), torch.cat([tree_var, mol_var], dim=1)
+
+     def rsample(self, z_vecs, W_mean, W_var):
+         batch_size = z_vecs.size(0)
+         z_mean = W_mean(z_vecs)
+         z_log_var = -torch.abs(W_var(z_vecs))  #Following Mueller et al.
+         kl_loss = -0.5 * torch.sum(1.0 + z_log_var - z_mean * z_mean - torch.exp(z_log_var)) / batch_size
+         epsilon = create_var(torch.randn_like(z_mean))
+         z_vecs = z_mean + torch.exp(z_log_var / 2) * epsilon
+         return z_vecs, kl_loss
+
+     def sample_prior(self, prob_decode=False):
+         z_tree = torch.randn(1, self.latent_size).cuda()
+         z_mol = torch.randn(1, self.latent_size).cuda()
+         return self.decode(z_tree, z_mol, prob_decode)
+
+     def forward(self, x_batch, beta):
+         x_batch, x_jtenc_holder, x_mpn_holder, x_jtmpn_holder, prop_batch = x_batch
+         x_tree_vecs, x_tree_mess, x_mol_vecs = self.encode(x_jtenc_holder, x_mpn_holder)
+         z_tree_vecs, tree_kl = self.rsample(x_tree_vecs, self.T_mean, self.T_var)
+         z_mol_vecs, mol_kl = self.rsample(x_mol_vecs, self.G_mean, self.G_var)
+
+         kl_div = tree_kl + mol_kl
+         word_loss, topo_loss, word_acc, topo_acc = self.decoder(x_batch, z_tree_vecs)
+         assm_loss, assm_acc = self.assm(x_batch, x_jtmpn_holder, z_mol_vecs, x_tree_mess)
+
+         all_vec = torch.cat([z_tree_vecs, z_mol_vecs], dim=1)
+         # prop_label = create_var(torch.Tensor(prop_batch))
+         prop_label = create_var(torch.Tensor(prop_batch))
+         prop_loss = self.prop_loss(self.propNN(all_vec).squeeze(), prop_label)
+
+         return word_loss + topo_loss + assm_loss + beta * kl_div + prop_loss, kl_div.item(), word_acc, topo_acc, assm_acc, prop_loss.item()
+
+     def assm(self, mol_batch, jtmpn_holder, x_mol_vecs, x_tree_mess):
+         jtmpn_holder, batch_idx = jtmpn_holder
+         fatoms, fbonds, agraph, bgraph, scope = jtmpn_holder
+         batch_idx = create_var(batch_idx)
+
+         cand_vecs = self.jtmpn(fatoms, fbonds, agraph, bgraph, scope, x_tree_mess)
+
+         x_mol_vecs = x_mol_vecs.index_select(0, batch_idx)
+         x_mol_vecs = self.A_assm(x_mol_vecs)  #bilinear
+         scores = torch.bmm(
+             x_mol_vecs.unsqueeze(1),
+             cand_vecs.unsqueeze(-1)
+         ).squeeze()
+
+         cnt, tot, acc = 0, 0, 0
+         all_loss = []
+         for i, mol_tree in enumerate(mol_batch):
+             comp_nodes = [node for node in mol_tree.nodes if len(node.cands) > 1 and not node.is_leaf]
+             cnt += len(comp_nodes)
+             for node in comp_nodes:
+                 label = node.cands.index(node.label)
+                 ncand = len(node.cands)
+                 cur_score = scores.narrow(0, tot, ncand)
+                 tot += ncand
+
+                 if cur_score.data[label] >= cur_score.max().item():
+                     acc += 1
+
+                 label = create_var(torch.LongTensor([label]))
+                 all_loss.append(self.assm_loss(cur_score.view(1, -1), label))
+
+         all_loss = sum(all_loss) / len(mol_batch)
+         return all_loss, acc * 1.0 / cnt
+
+     def decode(self, x_tree_vecs, x_mol_vecs, prob_decode):
+         #batch decoding is currently not supported
+         assert x_tree_vecs.size(0) == 1 and x_mol_vecs.size(0) == 1
+
+         pred_root, pred_nodes = self.decoder.decode(x_tree_vecs, prob_decode)
+         if len(pred_nodes) == 0: return None
+         elif len(pred_nodes) == 1: return pred_root.smiles
+
+         #Mark nid & is_leaf & atommap
+         for i, node in enumerate(pred_nodes):
+             node.nid = i + 1
+             node.is_leaf = (len(node.neighbors) == 1)
+             if len(node.neighbors) > 1:
+                 set_atommap(node.mol, node.nid)
+
+         scope = [(0, len(pred_nodes))]
+         jtenc_holder, mess_dict = JTNNEncoder.tensorize_nodes(pred_nodes, scope)
+         _, tree_mess = self.jtnn(*jtenc_holder)
+         tree_mess = (tree_mess, mess_dict)  #Important: tree_mess is a matrix, mess_dict is a python dict
+
+         x_mol_vecs = self.A_assm(x_mol_vecs).squeeze()  #bilinear
+
+         cur_mol = copy_edit_mol(pred_root.mol)
+         global_amap = [{}] + [{} for node in pred_nodes]
+         global_amap[1] = {atom.GetIdx(): atom.GetIdx() for atom in cur_mol.GetAtoms()}
+
+         cur_mol, _ = self.dfs_assemble(tree_mess, x_mol_vecs, pred_nodes, cur_mol, global_amap, [], pred_root, None, prob_decode, check_aroma=True)
+         if cur_mol is None:
+             cur_mol = copy_edit_mol(pred_root.mol)
+             global_amap = [{}] + [{} for node in pred_nodes]
+             global_amap[1] = {atom.GetIdx(): atom.GetIdx() for atom in cur_mol.GetAtoms()}
+             cur_mol, pre_mol = self.dfs_assemble(tree_mess, x_mol_vecs, pred_nodes, cur_mol, global_amap, [], pred_root, None, prob_decode, check_aroma=False)
+             if cur_mol is None: cur_mol = pre_mol
+
+         if cur_mol is None:
+             return None
+
+         cur_mol = cur_mol.GetMol()
+         set_atommap(cur_mol)
+         cur_mol = Chem.MolFromSmiles(Chem.MolToSmiles(cur_mol))
+         return Chem.MolToSmiles(cur_mol) if cur_mol is not None else None
+
+     def dfs_assemble(self, y_tree_mess, x_mol_vecs, all_nodes, cur_mol, global_amap, fa_amap, cur_node, fa_node, prob_decode, check_aroma):
+         fa_nid = fa_node.nid if fa_node is not None else -1
+         prev_nodes = [fa_node] if fa_node is not None else []
+
+         children = [nei for nei in cur_node.neighbors if nei.nid != fa_nid]
+         neighbors = [nei for nei in children if nei.mol.GetNumAtoms() > 1]
+         neighbors = sorted(neighbors, key=lambda x: x.mol.GetNumAtoms(), reverse=True)
+         singletons = [nei for nei in children if nei.mol.GetNumAtoms() == 1]
+         neighbors = singletons + neighbors
+
+         cur_amap = [(fa_nid, a2, a1) for nid, a1, a2 in fa_amap if nid == cur_node.nid]
+         cands, aroma_score = enum_assemble(cur_node, neighbors, prev_nodes, cur_amap)
+         if len(cands) == 0 or (sum(aroma_score) < 0 and check_aroma):
+             return None, cur_mol
+
+         cand_smiles, cand_amap = zip(*cands)
+         if torch.cuda.is_available():
+             aroma_score = torch.Tensor(aroma_score).cuda()
+         else:
+             aroma_score = torch.Tensor(aroma_score)
+         cands = [(smiles, all_nodes, cur_node) for smiles in cand_smiles]
+
+         if len(cands) > 1:
+             jtmpn_holder = JTMPN.tensorize(cands, y_tree_mess[1])
+             fatoms, fbonds, agraph, bgraph, scope = jtmpn_holder
+             cand_vecs = self.jtmpn(fatoms, fbonds, agraph, bgraph, scope, y_tree_mess[0])
+             scores = torch.mv(cand_vecs, x_mol_vecs) + aroma_score
+         else:
+             scores = torch.Tensor([1.0])
+
+         if prob_decode:
+             probs = F.softmax(scores.view(1, -1), dim=1).squeeze() + 1e-7  #prevent prob = 0
+             cand_idx = torch.multinomial(probs, probs.numel())
+         else:
+             _, cand_idx = torch.sort(scores, descending=True)
+
+         backup_mol = Chem.RWMol(cur_mol)
+         pre_mol = cur_mol
+         for i in range(cand_idx.numel()):
+             cur_mol = Chem.RWMol(backup_mol)
+             pred_amap = cand_amap[cand_idx[i].item()]
+             new_global_amap = copy.deepcopy(global_amap)
+
+             for nei_id, ctr_atom, nei_atom in pred_amap:
+                 if nei_id == fa_nid:
+                     continue
+                 new_global_amap[nei_id][nei_atom] = new_global_amap[cur_node.nid][ctr_atom]
+
+             cur_mol = attach_mols(cur_mol, children, [], new_global_amap)  #father is already attached
+             new_mol = cur_mol.GetMol()
+             new_mol = Chem.MolFromSmiles(Chem.MolToSmiles(new_mol))
+
+             if new_mol is None: continue
+
+             has_error = False
+             for nei_node in children:
+                 if nei_node.is_leaf: continue
+                 tmp_mol, tmp_mol2 = self.dfs_assemble(y_tree_mess, x_mol_vecs, all_nodes, cur_mol, new_global_amap, pred_amap, nei_node, cur_node, prob_decode, check_aroma)
+                 if tmp_mol is None:
+                     has_error = True
+                     if i == 0: pre_mol = tmp_mol2
+                     break
+                 cur_mol = tmp_mol
+
+             if not has_error: return cur_mol, cur_mol
+
+         return None, pre_mol
+
+     def optimize(self, smiles, sim_cutoff, lr=2.0, num_iter=20):
+         # mol_tree = MolTree(smiles)
+         # mol_tree.recover()
+         tree_batch = [MolTree(smiles)]
+         _, jtenc_holder, mpn_holder = tensorize(tree_batch, self.vocab, assm=False)
+         tree_vec, _, mol_vec = self.encode(jtenc_holder, mpn_holder)
+
+         mol = Chem.MolFromSmiles(smiles)
+         fp1 = AllChem.GetMorganFingerprint(mol, 2)
+
+         tree_mean = self.T_mean(tree_vec)
+         tree_log_var = -torch.abs(self.T_var(tree_vec))  #Following Mueller et al.
+         mol_mean = self.G_mean(mol_vec)
+         mol_log_var = -torch.abs(self.G_var(mol_vec))  #Following Mueller et al.
+         mean = torch.cat([tree_mean, mol_mean], dim=1)
+         log_var = torch.cat([tree_log_var, mol_log_var], dim=1)
+         cur_vec = create_var(mean.data, True)
+
+         visited = []
+         for step in range(num_iter):
+             prop_val = self.propNN(cur_vec).squeeze()
+             grad = torch.autograd.grad(prop_val, cur_vec)[0]
+             cur_vec = cur_vec.data + lr * grad.data
+             cur_vec = create_var(cur_vec, True)
+             visited.append(cur_vec)
+
+         l, r = 0, num_iter - 1
+         while l < r - 1:
+             mid = (l + r) // 2
+             new_vec = visited[mid]
+             tree_vec, mol_vec = torch.chunk(new_vec, 2, dim=1)
+             new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
+             if new_smiles is None:
+                 r = mid - 1
+                 continue
+
+             new_mol = Chem.MolFromSmiles(new_smiles)
+             fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
+             sim = DataStructs.TanimotoSimilarity(fp1, fp2)
+             if sim < sim_cutoff:
+                 r = mid - 1
+             else:
+                 l = mid
+         """
+         best_vec = visited[0]
+         for new_vec in visited:
+             tree_vec,mol_vec = torch.chunk(new_vec, 2, dim=1)
+             new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
+             if new_smiles is None: continue
+             new_mol = Chem.MolFromSmiles(new_smiles)
+             fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
+             sim = DataStructs.TanimotoSimilarity(fp1, fp2)
+             if sim >= sim_cutoff:
+                 best_vec = new_vec
+         """
+         tree_vec, mol_vec = torch.chunk(visited[l], 2, dim=1)
+         #tree_vec,mol_vec = torch.chunk(best_vec, 2, dim=1)
+         new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
+         if new_smiles is None:
+             return None, None
+         new_mol = Chem.MolFromSmiles(new_smiles)
+         fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
+         sim = DataStructs.TanimotoSimilarity(fp1, fp2)
+         if sim >= sim_cutoff:
+             return new_smiles, sim
+         else:
+             return None, None
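optimize performs the property-targeted modification: gradient ascent on propNN in latent space, followed by a binary search over the visited vectors for the furthest step that still decodes above the Tanimoto similarity cutoff. A minimal direct call, assuming a CPU-only environment (create_var auto-selects CUDA when available, so the model and inputs should end up on the same device); the checkpoint path and the 450/56/20/3 hyperparameters are placeholders, not values pinned by this diff:

import torch
from mol_tree import Vocab
from jtprop_vae import JTPropVAE

vocab = Vocab([line.strip() for line in open('vocab.txt')])
model = JTPropVAE(vocab, hidden_size=450, latent_size=56, depthT=20, depthG=3)
model.load_state_dict(torch.load('jtprop_vae.ckpt', map_location='cpu'))  # hypothetical path
model.eval()

new_smiles, sim = model.optimize('CCOC(=O)c1ccccc1', sim_cutoff=0.4, lr=2.0, num_iter=20)
print(new_smiles, sim)  # (None, None) when no visited vector decodes above the cutoff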
fast_jtnn/mol_tree.py ADDED
@@ -0,0 +1,168 @@
+ import rdkit
+ import rdkit.Chem as Chem
+ from chemutils import get_clique_mol, tree_decomp, get_mol, get_smiles, set_atommap, enum_assemble, decode_stereo
+ from vocab import *
+ import argparse
+
+ class MolTreeNode(object):
+
+     def __init__(self, smiles, clique=[]):
+         self.smiles = smiles
+         self.mol = get_mol(self.smiles)
+
+         self.clique = [x for x in clique]  #copy
+         self.neighbors = []
+
+     def add_neighbor(self, nei_node):
+         self.neighbors.append(nei_node)
+
+     def recover(self, original_mol):
+         clique = []
+         clique.extend(self.clique)
+         if not self.is_leaf:
+             for cidx in self.clique:
+                 original_mol.GetAtomWithIdx(cidx).SetAtomMapNum(self.nid)
+
+         for nei_node in self.neighbors:
+             clique.extend(nei_node.clique)
+             if nei_node.is_leaf:  #leaf node, no need to mark
+                 continue
+             for cidx in nei_node.clique:
+                 #allow singleton nodes to override the atom mapping
+                 if cidx not in self.clique or len(nei_node.clique) == 1:
+                     atom = original_mol.GetAtomWithIdx(cidx)
+                     atom.SetAtomMapNum(nei_node.nid)
+
+         clique = list(set(clique))
+         label_mol = get_clique_mol(original_mol, clique)
+         self.label = Chem.MolToSmiles(Chem.MolFromSmiles(get_smiles(label_mol)))
+
+         for cidx in clique:
+             original_mol.GetAtomWithIdx(cidx).SetAtomMapNum(0)
+
+         return self.label
+
+     def assemble(self):
+         neighbors = [nei for nei in self.neighbors if nei.mol.GetNumAtoms() > 1]
+         neighbors = sorted(neighbors, key=lambda x: x.mol.GetNumAtoms(), reverse=True)
+         singletons = [nei for nei in self.neighbors if nei.mol.GetNumAtoms() == 1]
+         neighbors = singletons + neighbors
+
+         cands, aroma = enum_assemble(self, neighbors)
+         new_cands = [cand for i, cand in enumerate(cands) if aroma[i] >= 0]
+         if len(new_cands) > 0: cands = new_cands
+
+         if len(cands) > 0:
+             self.cands, _ = zip(*cands)
+             self.cands = list(self.cands)
+         else:
+             self.cands = []
+
+ class MolTree(object):
+
+     def __init__(self, smiles):
+         self.smiles = smiles
+         self.mol = get_mol(smiles)
+
+         #Stereo Generation (currently disabled)
+         #mol = Chem.MolFromSmiles(smiles)
+         #self.smiles3D = Chem.MolToSmiles(mol, isomericSmiles=True)
+         #self.smiles2D = Chem.MolToSmiles(mol)
+         #self.stereo_cands = decode_stereo(self.smiles2D)
+
+         cliques, edges = tree_decomp(self.mol)
+         self.nodes = []
+         root = 0
+         for i, c in enumerate(cliques):
+             cmol = get_clique_mol(self.mol, c)
+             node = MolTreeNode(get_smiles(cmol), c)
+             self.nodes.append(node)
+             if min(c) == 0: root = i
+
+         for x, y in edges:
+             self.nodes[x].add_neighbor(self.nodes[y])
+             self.nodes[y].add_neighbor(self.nodes[x])
+
+         if root > 0:
+             self.nodes[0], self.nodes[root] = self.nodes[root], self.nodes[0]
+
+         for i, node in enumerate(self.nodes):
+             node.nid = i + 1
+             if len(node.neighbors) > 1:  #leaf node mol is not marked
+                 set_atommap(node.mol, node.nid)
+             node.is_leaf = (len(node.neighbors) == 1)
+
+     def size(self):
+         return len(self.nodes)
+
+     def recover(self):
+         for node in self.nodes:
+             node.recover(self.mol)
+
+     def assemble(self):
+         for node in self.nodes:
+             node.assemble()
+
+ def dfs(node, fa_idx):
+     max_depth = 0
+     for child in node.neighbors:
+         if child.idx == fa_idx: continue
+         max_depth = max(max_depth, dfs(child, node.idx))
+     return max_depth + 1
+
+ def data_process_chunk(smiles_list):
+     cset = set()
+     for line in smiles_list:
+         smiles = line.split()[0]
+         # print(smiles)
+         mol = MolTree(smiles)
+         for c in mol.nodes:
+             cset.add(c.smiles)
+         # i+=1
+         # if i%10000 == 0:
+         #     # print(i,end='\x1b[1K\r')
+         #     print(i, ' / 1584663')
+     return list(cset)
+
+ if __name__ == "__main__":
+     import sys
+     lg = rdkit.RDLogger.logger()
+     lg.setLevel(rdkit.RDLogger.CRITICAL)
+
+     i = 0
+
+     import os
+     from joblib import Parallel, delayed
+     from tqdm import tqdm
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--smiles_path', type=str, required=True)
+     parser.add_argument('--vocab_path', type=str, required=True)
+     parser.add_argument('--prop', type=bool, default=False)
+     parser.add_argument('--ncpu', default=8, type=int)
+     args = parser.parse_args()
+
+     if args.prop:
+         import pandas as pd
+         smiles_list = pd.read_csv(args.smiles_path, usecols=['SMILES'])
+         smiles_list = list(smiles_list.SMILES)
+     else:
+         with open(args.smiles_path, 'r') as f:
+             smiles_list = [line.split()[0] for line in f]
+     print('Total smiles = ', len(smiles_list))
+
+     # moses: 1584663
+
+     chunk_size = 10000
+     vocab_set_list = Parallel(n_jobs=args.ncpu)(
+         delayed(data_process_chunk)(smiles_list[start: start + chunk_size])
+         for start in tqdm(range(0, len(smiles_list), chunk_size))
+     )
+     vocab_list = []
+     for set_list in vocab_set_list:
+         vocab_list.extend(set_list)
+
+     cset = set(vocab_list)
+     with open(args.vocab_path, 'w') as f:
+         for x in cset:
+             f.write(''.join([x, '\n']))
+
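To see what the tree decomposition produces, one can build a MolTree directly; the SMILES below is an arbitrary example, and the printed cliques depend on the molecule:

from mol_tree import MolTree

tree = MolTree('CC(C)c1ccc(O)cc1')  # arbitrary test molecule
print(tree.size())                  # number of clusters (junction-tree nodes)
for node in tree.nodes:
    print(node.nid, node.smiles, [nei.nid for nei in node.neighbors])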
fast_jtnn/mpn.py ADDED
@@ -0,0 +1,125 @@
+ import torch
+ import torch.nn as nn
+ import rdkit.Chem as Chem
+ import torch.nn.functional as F
+ from nnutils import *
+ from chemutils import get_mol
+
+ ELEM_LIST = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'Al', 'I', 'B', 'K', 'Se', 'Zn', 'H', 'Cu', 'Mn', 'unknown']
+
+ ATOM_FDIM = len(ELEM_LIST) + 6 + 5 + 4 + 1
+ BOND_FDIM = 5 + 6
+ MAX_NB = 6
+
+ def onek_encoding_unk(x, allowable_set):
+     if x not in allowable_set:
+         x = allowable_set[-1]
+     return list(map(lambda s: x == s, allowable_set))
+
+ def atom_features(atom):
+     return torch.Tensor(onek_encoding_unk(atom.GetSymbol(), ELEM_LIST)
+             + onek_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4, 5])
+             + onek_encoding_unk(atom.GetFormalCharge(), [-1, -2, 1, 2, 0])
+             + onek_encoding_unk(int(atom.GetChiralTag()), [0, 1, 2, 3])
+             + [atom.GetIsAromatic()])
+
+ def bond_features(bond):
+     bt = bond.GetBondType()
+     stereo = int(bond.GetStereo())
+     fbond = [bt == Chem.rdchem.BondType.SINGLE, bt == Chem.rdchem.BondType.DOUBLE, bt == Chem.rdchem.BondType.TRIPLE, bt == Chem.rdchem.BondType.AROMATIC, bond.IsInRing()]
+     fstereo = onek_encoding_unk(stereo, [0, 1, 2, 3, 4, 5])
+     return torch.Tensor(fbond + fstereo)
+
+ class MPN(nn.Module):
+
+     def __init__(self, hidden_size, depth):
+         super(MPN, self).__init__()
+         self.hidden_size = hidden_size
+         self.depth = depth
+
+         self.W_i = nn.Linear(ATOM_FDIM + BOND_FDIM, hidden_size, bias=False)
+         self.W_h = nn.Linear(hidden_size, hidden_size, bias=False)
+         self.W_o = nn.Linear(ATOM_FDIM + hidden_size, hidden_size)
+
+     def forward(self, fatoms, fbonds, agraph, bgraph, scope):
+         fatoms = create_var(fatoms)
+         fbonds = create_var(fbonds)
+         agraph = create_var(agraph)
+         bgraph = create_var(bgraph)
+
+         binput = self.W_i(fbonds)
+         message = F.relu(binput)
+
+         for i in range(self.depth - 1):
+             nei_message = index_select_ND(message, 0, bgraph)
+             nei_message = nei_message.sum(dim=1)
+             nei_message = self.W_h(nei_message)
+             message = F.relu(binput + nei_message)
+
+         nei_message = index_select_ND(message, 0, agraph)
+         nei_message = nei_message.sum(dim=1)
+         ainput = torch.cat([fatoms, nei_message], dim=1)
+         atom_hiddens = F.relu(self.W_o(ainput))
+
+         max_len = max([x for _, x in scope])
+         batch_vecs = []
+         for st, le in scope:
+             cur_vecs = atom_hiddens[st: st + le].mean(dim=0)
+             batch_vecs.append(cur_vecs)
+
+         mol_vecs = torch.stack(batch_vecs, dim=0)
+         return mol_vecs
+
+     @staticmethod
+     def tensorize(mol_batch):
+         padding = torch.zeros(ATOM_FDIM + BOND_FDIM)
+         fatoms, fbonds = [], [padding]  #Ensure bond is 1-indexed
+         in_bonds, all_bonds = [], [(-1, -1)]  #Ensure bond is 1-indexed
+         scope = []
+         total_atoms = 0
+
+         for smiles in mol_batch:
+             mol = get_mol(smiles)
+             #mol = Chem.MolFromSmiles(smiles)
+             n_atoms = mol.GetNumAtoms()
+             for atom in mol.GetAtoms():
+                 fatoms.append(atom_features(atom))
+                 in_bonds.append([])
+
+             for bond in mol.GetBonds():
+                 a1 = bond.GetBeginAtom()
+                 a2 = bond.GetEndAtom()
+                 x = a1.GetIdx() + total_atoms
+                 y = a2.GetIdx() + total_atoms
+
+                 b = len(all_bonds)
+                 all_bonds.append((x, y))
+                 fbonds.append(torch.cat([fatoms[x], bond_features(bond)], 0))
+                 in_bonds[y].append(b)
+
+                 b = len(all_bonds)
+                 all_bonds.append((y, x))
+                 fbonds.append(torch.cat([fatoms[y], bond_features(bond)], 0))
+                 in_bonds[x].append(b)
+
+             scope.append((total_atoms, n_atoms))
+             total_atoms += n_atoms
+
+         total_bonds = len(all_bonds)
+         fatoms = torch.stack(fatoms, 0)
+         fbonds = torch.stack(fbonds, 0)
+         agraph = torch.zeros(total_atoms, MAX_NB).long()
+         bgraph = torch.zeros(total_bonds, MAX_NB).long()
+
+         for a in range(total_atoms):
+             for i, b in enumerate(in_bonds[a]):
+                 agraph[a, i] = b
+
+         for b1 in range(1, total_bonds):
+             x, y = all_bonds[b1]
+             for i, b2 in enumerate(in_bonds[x]):
+                 if all_bonds[b2][0] != y:
+                     bgraph[b1, i] = b2
+
+         return (fatoms, fbonds, agraph, bgraph, scope)
+
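The tensorize/forward pair above is self-contained enough to smoke-test on its own (it assumes chemutils from this package is importable). A minimal sketch; molecule choice and hidden size are arbitrary:

from mpn import MPN

smiles_batch = ['CCO', 'c1ccccc1']        # arbitrary molecules
tensors = MPN.tensorize(smiles_batch)     # (fatoms, fbonds, agraph, bgraph, scope)
mpn = MPN(hidden_size=8, depth=3)
mol_vecs = mpn(*tensors)
print(mol_vecs.shape)                     # torch.Size([2, 8]), one vector per molecule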
fast_jtnn/nnutils.py ADDED
@@ -0,0 +1,72 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.autograd import Variable
+
+ def create_var(tensor, requires_grad=None):
+     if requires_grad is None:
+         if torch.cuda.is_available():
+             return Variable(data=tensor).cuda()
+         else:
+             return Variable(data=tensor)
+     else:
+         if torch.cuda.is_available():
+             return Variable(data=tensor, requires_grad=requires_grad).cuda()
+         else:
+             return Variable(data=tensor, requires_grad=requires_grad)
+
+ def index_select_ND(source, dim, index):
+     index_size = index.size()
+     suffix_dim = source.size()[1:]
+     final_size = index_size + suffix_dim
+     target = source.index_select(dim, index.view(-1))
+     return target.view(final_size)
+
+ def avg_pool(all_vecs, scope, dim):
+     size = create_var(torch.Tensor([le for _, le in scope]))
+     return all_vecs.sum(dim=dim) / size.unsqueeze(-1)
+
+ def stack_pad_tensor(tensor_list):
+     max_len = max([t.size(0) for t in tensor_list])
+     for i, tensor in enumerate(tensor_list):
+         pad_len = max_len - tensor.size(0)
+         tensor_list[i] = F.pad(tensor, (0, 0, 0, pad_len))
+     return torch.stack(tensor_list, dim=0)
+
+ #3D padded tensor to 2D matrix, with padded zeros removed
+ def flatten_tensor(tensor, scope):
+     assert tensor.size(0) == len(scope)
+     tlist = []
+     for i, tup in enumerate(scope):
+         le = tup[1]
+         tlist.append(tensor[i, 0:le])
+     return torch.cat(tlist, dim=0)
+
+ #2D matrix to 3D padded tensor
+ def inflate_tensor(tensor, scope):
+     max_len = max([le for _, le in scope])
+     batch_vecs = []
+     for st, le in scope:
+         cur_vecs = tensor[st: st + le]
+         cur_vecs = F.pad(cur_vecs, (0, 0, 0, max_len - le))
+         batch_vecs.append(cur_vecs)
+
+     return torch.stack(batch_vecs, dim=0)
+
+ def GRU(x, h_nei, W_z, W_r, U_r, W_h):
+     hidden_size = x.size()[-1]
+     sum_h = h_nei.sum(dim=1)
+     z_input = torch.cat([x, sum_h], dim=1)
+     z = F.sigmoid(W_z(z_input))
+
+     r_1 = W_r(x).view(-1, 1, hidden_size)
+     r_2 = U_r(h_nei)
+     r = F.sigmoid(r_1 + r_2)
+
+     gated_h = r * h_nei
+     sum_gated_h = gated_h.sum(dim=1)
+     h_input = torch.cat([x, sum_gated_h], dim=1)
+     pre_h = F.tanh(W_h(h_input))
+     new_h = (1.0 - z) * sum_h + z * pre_h
+     return new_h
+
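index_select_ND is the gather primitive the whole message passing stack relies on: index 0 is reserved for a zero row, so padded neighbor slots contribute nothing to the sums. A toy demonstration:

import torch
from nnutils import index_select_ND

source = torch.tensor([[0., 0., 0.],   # index 0 reserved as the zero pad
                       [1., 1., 1.],
                       [2., 2., 2.],
                       [3., 3., 3.]])
index = torch.tensor([[1, 2, 0],       # neighbors of item A (0 = padding)
                      [3, 0, 0]])      # neighbors of item B
out = index_select_ND(source, 0, index)
print(out.shape)       # torch.Size([2, 3, 3]) -> (items, max_neighbors, hidden)
print(out.sum(dim=1))  # tensor([[3., 3., 3.], [3., 3., 3.]]); pad rows add zero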
fast_jtnn/vocab.py ADDED
@@ -0,0 +1,31 @@
+ import rdkit
+ import rdkit.Chem as Chem
+ import copy
+
+ def get_slots(smiles):
+     mol = Chem.MolFromSmiles(smiles)
+     return [(atom.GetSymbol(), atom.GetFormalCharge(), atom.GetTotalNumHs()) for atom in mol.GetAtoms()]
+
+ class Vocab(object):
+     benzynes = ['C1=CC=CC=C1', 'C1=CC=NC=C1', 'C1=CC=NN=C1', 'C1=CN=CC=N1', 'C1=CN=CN=C1', 'C1=CN=NC=N1', 'C1=CN=NN=C1', 'C1=NC=NC=N1', 'C1=NN=CN=N1']
+     penzynes = ['C1=C[NH]C=C1', 'C1=C[NH]C=N1', 'C1=C[NH]N=C1', 'C1=C[NH]N=N1', 'C1=COC=C1', 'C1=COC=N1', 'C1=CON=C1', 'C1=CSC=C1', 'C1=CSC=N1', 'C1=CSN=C1', 'C1=CSN=N1', 'C1=NN=C[NH]1', 'C1=NN=CO1', 'C1=NN=CS1', 'C1=N[NH]C=N1', 'C1=N[NH]N=C1', 'C1=N[NH]N=N1', 'C1=NN=N[NH]1', 'C1=NN=NS1', 'C1=NOC=N1', 'C1=NON=C1', 'C1=NSC=N1', 'C1=NSN=C1']
+
+     def __init__(self, smiles_list):
+         self.vocab = smiles_list
+         self.vmap = {x: i for i, x in enumerate(self.vocab)}
+         self.slots = [get_slots(smiles) for smiles in self.vocab]
+         Vocab.benzynes = [s for s in smiles_list if s.count('=') >= 2 and Chem.MolFromSmiles(s).GetNumAtoms() == 6] + ['C1=CCNCC1']
+         Vocab.penzynes = [s for s in smiles_list if s.count('=') >= 2 and Chem.MolFromSmiles(s).GetNumAtoms() == 5] + ['C1=NCCN1', 'C1=NNCC1']
+
+     def get_index(self, smiles):
+         return self.vmap[smiles]
+
+     def get_smiles(self, idx):
+         return self.vocab[idx]
+
+     def get_slots(self, idx):
+         return copy.deepcopy(self.slots[idx])
+
+     def size(self):
+         return len(self.vocab)
+
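A small sketch of the Vocab API with a toy three-entry vocabulary (real runs load the vocab.txt added below):

from vocab import Vocab

vocab = Vocab(['C1=CC=CC=C1', 'CC', 'C1CCNCC1'])   # toy cluster vocabulary
idx = vocab.get_index('CC')
print(idx, vocab.get_smiles(idx), vocab.size())     # 1 CC 3
print(vocab.get_slots(0)[:2])                       # per-atom (symbol, charge, numHs)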
fpscores.pkl.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:10dcef9340c873e7b987924461b0af5365eb8dd96be607203debe8ddf80c1e73
+ size 3848394
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ rdkit
+ numpy
+ torch
+ tqdm
+ networkx
+ scipy
+ pandas
+ joblib
+ molbloom
sascorer.py ADDED
@@ -0,0 +1,173 @@
+ #
+ # calculation of synthetic accessibility score as described in:
+ #
+ # Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
+ # Peter Ertl and Ansgar Schuffenhauer
+ # Journal of Cheminformatics 1:8 (2009)
+ # http://www.jcheminf.com/content/1/1/8
+ #
+ # several small modifications to the original paper are included
+ # particularly a slightly different formula for the macrocyclic penalty
+ # and taking into account also molecule symmetry (fingerprint density)
+ #
+ # for a set of 10k diverse molecules the agreement between the original method
+ # as implemented in PipelinePilot and this implementation is r2 = 0.97
+ #
+ # peter ertl & greg landrum, september 2013
+ #
+
+
+ from rdkit import Chem
+ from rdkit.Chem import rdMolDescriptors
+ import pickle
+
+ import math
+ from collections import defaultdict
+
+ import os.path as op
+
+ _fscores = None
+
+
+ def readFragmentScores(name='fpscores'):
+     import gzip
+     global _fscores
+     # generate the full path filename:
+     if name == "fpscores":
+         name = op.join(op.dirname(__file__), name)
+     _fscores = pickle.load(gzip.open('%s.pkl.gz' % name))
+     outDict = {}
+     for i in _fscores:
+         for j in range(1, len(i)):
+             outDict[i[j]] = float(i[0])
+     _fscores = outDict
+
+
+ def numBridgeheadsAndSpiro(mol, ri=None):
+     nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
+     nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
+     return nBridgehead, nSpiro
+
+
+ def calculateScore(m):
+     if _fscores is None:
+         readFragmentScores()
+
+     # fragment score
+     fp = rdMolDescriptors.GetMorganFingerprint(m,
+                                                2)  # <- 2 is the *radius* of the circular fingerprint
+     fps = fp.GetNonzeroElements()
+     score1 = 0.
+     nf = 0
+     for bitId, v in fps.items():
+         nf += v
+         sfp = bitId
+         score1 += _fscores.get(sfp, -4) * v
+     score1 /= nf
+
+     # features score
+     nAtoms = m.GetNumAtoms()
+     nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
+     ri = m.GetRingInfo()
+     nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
+     nMacrocycles = 0
+     for x in ri.AtomRings():
+         if len(x) > 8:
+             nMacrocycles += 1
+
+     sizePenalty = nAtoms**1.005 - nAtoms
+     stereoPenalty = math.log10(nChiralCenters + 1)
+     spiroPenalty = math.log10(nSpiro + 1)
+     bridgePenalty = math.log10(nBridgeheads + 1)
+     macrocyclePenalty = 0.
+     # ---------------------------------------
+     # This differs from the paper, which defines:
+     # macrocyclePenalty = math.log10(nMacrocycles+1)
+     # This form generates better results when 2 or more macrocycles are present
+     if nMacrocycles > 0:
+         macrocyclePenalty = math.log10(2)
+
+     score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
+
+     # correction for the fingerprint density
+     # not in the original publication, added in version 1.1
+     # to make highly symmetrical molecules easier to synthesise
+     score3 = 0.
+     if nAtoms > len(fps):
+         score3 = math.log(float(nAtoms) / len(fps)) * .5
+
+     sascore = score1 + score2 + score3
+
+     # need to transform "raw" value into scale between 1 and 10
+     min = -4.0
+     max = 2.5
+     sascore = 11. - (sascore - min + 1) / (max - min) * 9.
+     # smooth the 10-end
+     if sascore > 8.:
+         sascore = 8. + math.log(sascore + 1. - 9.)
+     if sascore > 10.:
+         sascore = 10.0
+     elif sascore < 1.:
+         sascore = 1.0
+
+     return sascore
+
+
+ def processMols(mols):
+     print('smiles\tName\tsa_score')
+     for i, m in enumerate(mols):
+         if m is None:
+             continue
+
+         s = calculateScore(m)
+
+         smiles = Chem.MolToSmiles(m)
+         print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
+
+
+ if __name__ == '__main__':
+     import sys
+     import time
+
+     t1 = time.time()
+     readFragmentScores("fpscores")
+     t2 = time.time()
+
+     suppl = Chem.SmilesMolSupplier(sys.argv[1])
+     t3 = time.time()
+     processMols(suppl)
+     t4 = time.time()
+
+     print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
+           file=sys.stderr)
+
+ #
+ # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are
+ # met:
+ #
+ #     * Redistributions of source code must retain the above copyright
+ #       notice, this list of conditions and the following disclaimer.
+ #     * Redistributions in binary form must reproduce the above
+ #       copyright notice, this list of conditions and the following
+ #       disclaimer in the documentation and/or other materials provided
+ #       with the distribution.
+ #     * Neither the name of Novartis Institutes for BioMedical Research Inc.
+ #       nor the names of its contributors may be used to endorse or promote
+ #       products derived from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
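A quick way to exercise the scorer from Python (aspirin is an arbitrary example; scores run from 1, easy to synthesize, to 10, hard):

from rdkit import Chem
import sascorer

mol = Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O')  # aspirin
print(sascorer.calculateScore(mol))  # fragment scores auto-load from fpscores.pkl.gz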
vocab.txt ADDED
@@ -0,0 +1,533 @@
+ C=N
+ C1CSCNN1
+ C1CCSCC1
+ C1=NSN=C1
+ C1=CSCS1
+ C1NCS1
+ C1=CSCC1
+ C1=CC2C=CC1O2
+ C1CC2COCC(CN1)N2
+ C1=COCCCN1
+ C1=CNSNC1
+ C1=NNCS1
+ C1=NNCSC1
+ C1=CSNCS1
+ C1=NC=NC=N1
+ C1=CCCCC=C1
+ C1CNCN=N1
+ C1=NNN=N1
+ C1NCC2OCC1O2
+ C1CC2CCOC(C2)N1
+ C1=CC2CCC1C2
+ C1=NN=NCC1
+ C1=CC2CC1C2
+ C1=CNCN=C1
+ C1=CC2C=CC1NN2
+ C1CN=NNC1
+ C1=CSCCN1
+ C1=CC=COC=C1
+ C1=NCSC1
+ C1CC2CNCC1N2
+ C1N=NNCN1
+ C=S
+ C1CC2OCC3CCC(O1)C2C3
+ C1COCCOC1
+ C1NCSN1
+ C1=CNN=C1
+ C1CNCCOC1
+ C1CSNCN1
+ C1CNSN1
+ C1CC2CCC(O1)O2
+ C1=CNSC=C1
+ C1=NCCO1
+ C1=CCSNC1
+ C1=NCOCC1
+ C1=CONC1
+ C1CC2CCC1O2
+ C1=NN=NS1
+ C1=CC2CCCC1NN2
+ C1CSCN1
+ C1NC2CC3CC1CC(C3)C2
+ C1=NC=N[SH]=N1
+ C1OC2CNC1C2
+ C1=CCSC1
+ C1=CC2C=CC1N2
+ C1=CCNC=NC1
+ O=S
+ C1NN=NN1
+ C1=CCNC1
+ C1=CCCN=C1
+ C1=COCCOC1
+ C1CC2CNCC(C1)NC2
+ C1=CCOC1
+ C1=CCNNCC1
+ C1=CNNCC1
+ C1=CSC1
+ C1CC2CCNC(C1)C2
+ C1CC2NC(CCS2)S1
+ C1CNN1
+ C1=CNCCNC1
+ C1=CNCCCN1
+ C1NCNCN1
+ C1=CC2C=CC1C2
+ OCl
+ C1=CN=CSC1
+ C1=CCNC=CC1
+ C1=CC2CC1CN2
+ C1=CC2CCCN(C1)C2
+ C1=CC2CC(C1)CN2
+ NO
+ C1=COC=NC1
+ C1=NSN=CC1
+ C1=CN=COC1
+ C1=CSCCN=C1
+ C1=CC=CNC=C1
+ C1=NN=CO1
+ C1=CNSC1
+ C1CNC1
+ C1=COCN1
+ C1CCSCNC1
+ C1CC2CNCC1C2
+ N
+ C1=NN=CN1
+ C1=NCCNCC1
+ C1CCCCCC1
+ C1CC2CC(N1)C1COC2O1
+ C1=NCNNC1
+ C1=CN=NC=N1
+ C1COCN1
+ C1CNNCN1
+ C1=NCC=NCC1
+ C1=NCNS1
+ C1OC2OCC1CO2
+ C1CCOCNC1
+ CF
+ C1=NNCO1
+ C1=NC=NCCN1
+ C1CC2CCCC(C1)O2
+ C1=NNCN1
+ C1=CCN=CC1
+ C1CC2CNCC1CN2
+ C1CC2CNCC(C1)N2
+ C1=CCSCC1
+ C1CSNS1
+ C1=NCNC1
+ C1CC2C3NNC2C1O3
+ C1CC2CNC(C2)N1
+ C1=CCOCCC1
+ C1CC2CCC(C1)O2
+ C1=CC2CC1NN2
+ C1=CC2CCCC(C1)C2
+ C1=CN=CCC1
+ C1CC2NCC1CN2
+ C1=CSCN1
+ C1=NCCC1
+ C1CC2C3CC1CC23
+ C1=CN=CN=C1
+ C1=NSNC1
+ C1CSN=N1
+ C1COCCSC1
+ C1CNSC1
+ C1=CCCCCC1
+ C1=NCNCN1
+ C1CC2CCC(C1)C2
+ C1CC2CNCC(C1)C2
+ C1=CC2CCCC1C2
+ C1=CC2CCC1O2
+ C1=CC2CCC1CC2
+ C1=CC2C=CC(C1)CC2
+ C1CSNCS1
+ NBr
+ C1COCSN1
+ C1=CSCNC1
+ C1CC2NCC1NN2
+ C1=CN=CCNC1
+ C1CC2CNCC1O2
+ C1=NNCCC1
+ C1CC2CC(C1)N2
+ C1=NNSC1
+ C1=COCCN1
+ C1=CCC2CC(C1)C2
+ C1=CC2CCCC1CC2
+ C1=CCNCC1
+ C1CNNC1
+ C1=NC=NC1
+ C1=COC=N1
+ C1CC2CNC(C2)O1
+ C1=CCC2CC=CC(C1)C2
+ C1=CC=CC=C1
+ C1=CCSOC1
+ C1CN2CCC1CC2
+ C1=CSCCCN1
+ C1CNCCSC1
+ C1=NCCSN1
+ C1=NCCNN1
+ C1NCON1
+ C1NCC2CC1CN2
+ C1=CNC=N1
+ C1=CC2CCC1NN2
+ C1CNCOC1
+ C1C2CC3CC1OC(O2)O3
+ C1=NNCSCC1
+ C1=CCOCC1
+ C1=NSCC1
+ C1=CNSCC1
+ C1=CCC1
+ C1CCCNCC1
+ C1CC2CCCN(C1)C2
+ CN
+ C1CC2CC(O1)C1OCC2O1
+ C1CC2CNC(C1)N2
+ CO
+ C1=C2CCCCC1C2
+ C1=COCCNC1
+ C1=CN=CNC1
+ C1=CNCCCC1
+ C1NN=NS1
+ C1=NC=NCS1
+ C1=NN=CCC1
+ C1=NCCCC1
+ C1CC2CCC3CC1NC23
+ C1=NCCOCC1
+ C1=CSNC=N1
+ C1=CSCNNC1
+ C1=COCCC1
+ C1=COCCC=N1
+ C1=NNCCN1
+ C1C2CC1C2
+ N1C2NC3NC4NC3NC2NC14
+ C1=CC2C3NSC2C13
+ C1CC2COCC1N2
+ C1=CC2CCN3CC1CC23
+ C1CCONC1
+ C1=CC1
+ C1CCNNC1
+ C1=CSC=N1
+ C1CC1
+ C1=NCNCC1
+ C1=CC2CC(NCN2)O1
+ C1=CNSN=C1
+ C1=CCCNCC1
+ C1=CSN=CN1
+ NS
+ C1=CSN=N1
+ C1=CN=CC1
+ C1=CC2CCC(O1)O2
+ C1C2CC3C1OC1NCC2C13
+ C1COCSC1
+ C1=CCCOCC1
+ OBr
+ C1=COCSN1
+ C1=CN=N[SH]=C1
+ C1=CSCC=NC1
+ C1=CC=C2CCCCC(=C1)C2
+ C1=CSC=CC1
+ C=C
+ C1=CNCOC1
+ C1=NON=C1
+ C1=C[SH]=NC=N1
+ C1=CNCNC1
+ C1CN2C3CC4CC(C1C3)C2C4
+ C1=CONCNN1
+ CBr
+ C1CCNCNC1
+ C1=CCNN=C1
+ C1=CCNCCC1
+ C1=CSCCCC1
+ C1=CC2COCC(C1)C2
+ C1=CCCC=CC1
+ C1COCCN1
+ N1NN1
+ C1=CC2CCCC(C1)N2
+ C1=CC=NC=C1
+ C1=NC=NNC1
+ C1CC2CCC1CNC2
+ C1=CNCC1
+ N=S
+ CC
+ C1=NNCNCC1
+ C1=NN=NN1
+ C1=NSNCC1
+ C1=CNCCSC1
+ C1=NC=NCCC1
+ C1CCNSCC1
+ C1=NC=NN=C1
+ C1=CCOCOC1
+ C1=NN=CNC1
+ C1=COCCCC1
+ C1=NCCCO1
+ C1CC2CNC(C1)C2
+ C1=CSN=CS1
+ C1CC2CCC(C1)CNC2
+ C1C[SH]=NS1
+ C1CC2CCCC(C1)C2
+ C1CC2CCC(C1)N2
+ C1=NOCC1
+ C1=NCCOC1
+ C1CC2C3CC1C2CO3
+ C1NCN1
+ C1COC2CCC(C2)N1
+ C1=CNC=CC1
+ C1=CNCCC1
+ C1=NN=CN=N1
+ C1=CNCCC=N1
+ C1NCC2CC1C2
+ C1C2CC3CC1CC(C2)C3
+ C1=NC2CC(C1)CN2
+ C1=CNCN=CC1
+ C1NCSCN1
+ S
+ C1CC2CCC1C2
+ C1=CCC=NC=C1
+ C1CCOCC1
+ C1CCN2CCCC(C1)C2
+ C1=NSCCS1
+ C1NC2CCOC(C2)N1
+ C1=NCNN1
+ C1=CSC=CNC1
+ C1=CCSC=C1
+ C1=COC=CO1
+ C1=CCCNC=C1
+ C1CSCCN1
+ C1CCCC1
+ C1CCSOC1
+ C1=NSCO1
+ C1=NCN=CO1
+ C1CN2CCN1CC2
+ C1=NCN=CN1
+ C1CC2CCC(N1)O2
+ C1COCO1
+ C1=COC=C1
+ C1=CCC2CCC(C1)N2
+ C1CCC2CCCC(C1)C2
+ C1=COCC1
+ C1=NSN=CO1
+ C1=CCCOC=C1
+ C1NNCO1
+ CS
+ C1CC2CC1CO2
+ C1NC2CNC1C2
+ C1=CC=C1
+ C1=COC=CC1
+ C1=CCNNC1
+ NN
+ C1=COCCO1
+ C1=NCC=NN1
+ C1CNCNCN1
+ C1=CN=NC=C1
+ C1=CSCO1
+ C1=NC=NSO1
+ C1=NCCS1
+ C1CC2CNC1CN2
+ C1=NNCC1
+ C1=CCCC=C1
+ C1=NCCN1
+ C1=NCCSCC1
+ C1=NNNC1
+ C1=CC2CCNC(C1)C2
+ C1=NCCN=N1
+ C1=CNSN1
+ C1=CNOC1
+ C1=CSN=C1
+ C1CCC2CCC(C1)C2
+ C1NCC2COCC1C2
+ C1COC2CC(N1)C1COC2O1
+ C1=NC=NN1
+ C1CSNSC1
+ C1CCN=NC1
+ C1=CO1
+ C1=CNCCOC1
+ C1=NNN=C1
+ C1=CC=NN=C1
+ C1CC2CCC1CC2
+ C1NC2CC(CO2)O1
+ C1NNCS1
+ C1=CC2CCCCC(CC1)C2
+ C1=CN=NCC1
+ C1=CON=C1
+ C1CSN1
+ C1=CSNC1
+ C1CC2CC(C1)C2
+ C1CNCN1
+ C1=CSCNN1
+ C1=CSCC=N1
+ C1=CC2CCC1CN2
+ C1=CSCCC1
+ C1=CC2CCC(C2)N1
+ C1CC2CC3CC1CC(C2)C3
+ C1=CSCCS1
+ C1=NC=NCC1
+ C1=CCC2CC(C=CN2)C1
+ C1OC2C3OC4C1C1OC2C3OCC41
+ C1=NCCCN1
+ C1COCOC1
+ C1=CNCC=N1
+ C1=NCCCS1
+ C1=CC2C=CC(C=C1)C2
+ C1CC2CC1CN2
+ C1=CNN1
+ C1=CC2CCCC1O2
+ C1=CCCCC1
+ C1=CNC=NC1
+ C1=CSNCN1
+ C1=CSCNN=C1
+ C1=CNC=CNC1
+ C1OC2CC3CC1C2C3
+ C1=NCN=C1
+ C1=CSC=CS1
+ C
+ C1=NCCN=CC1
+ C1=CSNCCN1
+ C1=CCN=CCC1
+ C1=CC2CCNC(C2)O1
+ C1C2CN3CCN(C2)CC1C3
+ C1C2CN3CC1CN(C2)C3
+ C1=NCCCCN1
+ C1C2CC3CC(CC1O3)N2
+ C1=NN=COC1
+ C1=CSCCNC1
+ C1=NOCN1
+ C1NCC2CNCC1C2
+ C1CCSC1
+ C1=CN=NNC1
+ C1CSCS1
+ C1CNN=N1
+ C1NCNSN1
+ C1=NCC1
+ N1NO1
+ C1COSCSO1
+ C1=CNNCN=C1
+ C1CSCCS1
+ C1=CSC=NC1
+ C1=CC2CCC(C1)NN2
+ C1CNCCNC1
+ C1C2CC3OC1CC3O2
+ C1CNCCN1
+ C1=NCCCNC1
+ C=O
+ C1CNSCCN1
+ C1CNCSC1
+ C1CCC2C3CCC2C(C1)C3
+ C1=CCC=C1
+ C1COC1
+ C1CC2CC1NN2
+ C1=NN=CS1
+ C1COCCSN1
+ C1CCNC1
+ C1=NOCCC1
+ C1CCC1
+ C1CC2CCC1CO2
+ C1=CCNC=C1
+ C1=CN=CN=CC1
+ C1=NC=NO1
+ C1=CC2CCOC(C2)N1
+ C1=CNCN=N1
+ C1NCN2CNCC1C2
+ C1CN2CCC3CC1CC2C3
+ C1=CSC=CO1
+ C1=CNCCN=C1
+ C1CCC2CC(C1)C2
+ C1=COCO1
+ C1=NCOC1
+ C1CSCO1
+ C1=NCCSC1
+ C1CCC2CCCC(C1)N2
+ C1=NCC=NC1
+ C1CSCCO1
+ C1=CNC=C1
+ C1COSN1
+ C1=CC2CC(O1)C1OCC2O1
+ C1CC2CCC(CN1)N2
+ C1=COCCN=C1
+ C1=CCOC=CC1
+ C1=NCN=CC1
+ C1OC2COC1C2
+ C1=NC2CC(C1)CCO2
+ C1=NC=NS1
+ C1=CSN=CO1
+ C1=CNN=CC1
+ C1=CC2C=CC(C1)CNC2
+ C1CC2CCNC(C2)N1
+ C1=NCCNC1
+ C1CC2COCC(C1)C2
+ C1=CN=CCN=C1
+ C1=CNN=NC1
+ C1CC2CC3CCN2C(C1)C3
+ C1=CC2COCC(C1)N2
+ C1CNOC1
+ C1=CCCC1
+ C1CCCSCC1
+ C1CCOCOC1
+ C1=NC=NCN1
+ C1=NCCNS1
+ C1NCNN1
+ C1CN2C3CC4CC1CC2C4O3
+ C1=COCNC1
+ C1CN2CC3OCC2CC13
+ C1NC2CC1C2
+ C1=NSCCN1
+ C1=CCOC=C1
+ C1CCSNC1
+ C1=CC2C=CC1CC2
+ C1CNSNC1
+ C1=CN=CNCC1
+ C1=NNCNC1
+ C1=CNCNN=C1
+ C1=NNCCCC1
+ C1=NCC2CCCC1C2
+ C1CCNCC1
+ C1NCC2CNCC1O2
+ C1=COCOC1
+ C1CC2COC(C1)O2
+ C#N
+ C1=NCNN=C1
+ C1CC2COC(C1)C2
+ C1CC2CCC1NN2
+ C1=CN=CC=N1
+ C1=NCCN=C1
+ C1CSCNCN1
+ C1NCC2COCC1N2
+ C1=CCN=C1
+ C1COCCO1
+ C1=COCCCO1
+ C1CC2COCC(C1)N2
+ C1=COCC=N1
+ C1CC2OC(CCS2)S1
+ C1=CCC=CC1
+ C1=NC1
+ C1CCCOCC1
+ C1=CC2CCC(C1)N2
+ C1=CNC=CN1
+ C1CCOC1
+ C1=CC=CCC=C1
+ C1=CC2CCNC(CC1)C2
+ C1=CNCSC1
+ C1CC2CCC1N2
+ C1=NSNCN1
+ C1=CSNCC1
+ C1CC2CCCC(C1)N2
+ C1=CN=CCC=N1
+ C1NCC2COCC1CN2
+ C1=CSC=C1
+ C1C2CN3CN1CN(C2)C3
+ C1=CCON=C1
+ C1=NCCCCC1
+ C1=CSCCCS1
+ C1=CNN=N1
+ C1CC2C3CC1C2CN3
+ C1CC2CCC(C2)O1
+ OS
+ C1CC2OC3CC1CC2O3
+ C1CC2CCC1CN2
+ C1=CNCN1
+ C1=CC2CC(N1)C1C=CC2C1
+ C1=CC2CC(C1)C2
+ C1=NCCCSC1
+ C1NC2CC(N1)C1OCC2O1
+ C#C
+ CCl
+ C1=CN=NN=C1
+ C1=CNCCN1
+ C1CCCCC1
+ C1CNCNC1
+ C1=CNNC1