"""Metric and property utilities for evaluating generated molecules:
validity/uniqueness/novelty of SMILES lists, SA score, penalized logP,
QED, TPSA, DRD2 activity, Tanimoto similarity, and Murcko scaffolds.
Built on RDKit, scikit-learn, and networkx."""

from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from rdkit.Chem import QED, Crippen, MolFromSmiles, rdmolops, rdMolDescriptors, AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
import networkx as nx
import os.path as op
import math
#from rdkit.six.moves import cPickle
import _pickle as cPickle
#from rdkit.six import iteritems
from rdkit import Chem
import pickle
import numpy as np
import sys
import os
from rdkit.Chem import RDConfig

# make the RDKit contrib SA_Score module importable
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer

from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from rdkit.Chem.Fingerprints import FingerprintMols


def compute_rmse(gt, pred):
    # root mean squared error
    return mean_squared_error(gt, pred, squared=False)


def compute_r2score(gt, pred):
    return r2_score(gt, pred)


def compute_roc_auc(gt, pred):
    return roc_auc_score(gt, pred)


def check_valid(smiles_list):
    # fraction of non-empty (i.e. successfully generated) SMILES
    total_num = len(smiles_list)
    empty_num = smiles_list.count("")
    return 1 - empty_num / float(total_num)


def check_unique(smiles_list):
    # fraction of distinct non-empty SMILES relative to the full list
    total_num = len(smiles_list)
    smiles_set = set(smiles_list)
    if "" in smiles_set:
        smiles_set.remove("")
    return len(smiles_set) / float(total_num)


def check_nolvelty(gen_smiles, train_smiles):
    # percentage of generated SMILES that do not appear in the training set
    if len(gen_smiles) == 0:
        novel_ratio = 0.
    else:
        duplicates = [1 for mol in gen_smiles if mol in train_smiles]
        novel = len(gen_smiles) - sum(duplicates)
        novel_ratio = novel * 100. / len(gen_smiles)
    return novel_ratio


_fscores = None


def readFragmentScores(name='fpscores'):
    import gzip
    global _fscores
    # generate the full path filename:
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    _fscores = cPickle.load(gzip.open('%s.pkl.gz' % name))
    outDict = {}
    for i in _fscores:
        for j in range(1, len(i)):
            outDict[i[j]] = float(i[0])
    _fscores = outDict


def numBridgeheadsAndSpiro(mol, ri=None):
    nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    return nBridgehead, nSpiro


def calculateScore(m):
    if _fscores is None:
        readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # <- 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():  # fps is a plain {bitId: count} dict
        nf += v
        sfp = bitId
        score1 += _fscores.get(sfp, -4) * v
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for x in ri.AtomRings():
        if len(x) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #   macrocyclePenalty = math.log10(nMacrocycles + 1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesize
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    min = -4.0
    max = 2.5
    sascore = 11. - (sascore - min + 1) / (max - min) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore


def compute_plogp(mol):
    # penalized logP = logP - SA score - penalty for rings larger than 6 atoms
    #mol = MolFromSmiles(smiles_string)
    #logp = (Crippen.MolLogP(mol) - np.mean(logP_values)) / np.std(logP_values)
    logp = Crippen.MolLogP(mol)
    #SA_score = (-sascorer.calculateScore(mol) - np.mean(SA_scores)) / np.std(SA_scores)
    SA_score = -calculateScore(mol)
    cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    if len(cycle_list) == 0:
        cycle_length = 0
    else:
        cycle_length = max([len(j) for j in cycle_list])
    if cycle_length <= 6:
        cycle_length = 0
    else:
        cycle_length = cycle_length - 6
    #cycle_score = (-cycle_length - np.mean(cycle_scores)) / np.std(cycle_scores)
    cycle_score = -cycle_length
    #plogp = -(logp + SA_score + cycle_score)
    plogp = (logp + SA_score + cycle_score)
    return plogp


clf_model = None


def load_model():
    # load the pre-trained DRD2 activity classifier from disk
    global clf_model
    #name = op.join(op.dirname(__file__), 'clf_py36.pkl')
    name = op.join(op.dirname(__file__), 'drd2_current.pkl')
    with open(name, "rb") as f:
        clf_model = pickle.load(f)


def fingerprints_from_mol(mol):
    # fold a count-based feature-Morgan fingerprint (radius 3) into a 2048-wide array
    fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    size = 2048
    nfp = np.zeros((1, size), np.int32)
    for idx, v in fp.GetNonzeroElements().items():
        nidx = idx % size
        nfp[0, nidx] += int(v)
    return nfp


def compute_drd2(mol):
    # predicted probability of DRD2 activity from the pre-trained classifier
    if clf_model is None:
        load_model()
    #print(smile)
    #mol = Chem.MolFromSmiles(smile)
    if mol:
        fp = fingerprints_from_mol(mol)
        score = clf_model.predict_proba(fp)[:, 1]
        return float(score)
    return 0.0


def compute_qed(mol):
    return QED.qed(mol)


def compute_logp(mol):
    return Crippen.MolLogP(mol)


def compute_tpsa(mol):
    return rdMolDescriptors.CalcTPSA(mol)


def compute_sas(mol):
    return sascorer.calculateScore(mol)


def check_valid_unique(smiles_list):
    # returns (validity, uniqueness-among-valid) in one pass
    total_num = len(smiles_list)
    empty_num = smiles_list.count("")
    smiles_set = set(smiles_list)
    if "" in smiles_set:
        smiles_set.remove("")
    return 1 - empty_num / float(total_num), \
        len(smiles_set) / float(total_num - empty_num)


def get_similarity(smiles1, smiles2):
    # Tanimoto similarity on RDKit topological fingerprints
    if smiles1 == "" or smiles2 == "":
        return np.nan
    sim = TanimotoSimilarity(FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles1)),
                             FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles2)))
    return sim


def get_scaffold(smiles):
    scaffold = MurckoScaffoldSmiles(smiles)
    return scaffold
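

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how the
# helpers above might be called. It assumes RDKit is installed, that the
# SA_Score data file 'fpscores.pkl.gz' sits next to this module, and that
# 'drd2_current.pkl' is present if compute_drd2 is used. The example SMILES
# strings below are arbitrary, chosen purely for illustration.
if __name__ == "__main__":
    example_smiles = ["CCO", "c1ccccc1", ""]  # ethanol, benzene, one failed/empty generation

    # SMILES-level generation metrics
    print("validity:", check_valid(example_smiles))
    print("uniqueness:", check_unique(example_smiles))
    print("novelty (%):", check_nolvelty(example_smiles, ["CCO"]))

    # molecule-level property scores
    mol = Chem.MolFromSmiles("c1ccccc1")
    print("QED:", compute_qed(mol))
    print("logP:", compute_logp(mol))
    print("TPSA:", compute_tpsa(mol))
    print("SA score:", compute_sas(mol))
    print("penalized logP:", compute_plogp(mol))

    # pairwise Tanimoto similarity and Murcko scaffold
    print("similarity:", get_similarity("CCO", "CCN"))
    print("scaffold:", get_scaffold("c1ccccc1CCO"))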