Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,591 Bytes
1d1d4f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from rdkit.Chem import QED, Crippen, MolFromSmiles, rdmolops, rdMolDescriptors, AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
import networkx as nx
import os.path as op
import math
#from rdkit.six.moves import cPickle
import _pickle as cPickle
#from rdkit.six import iteritems
from rdkit import Chem
import pickle
import numpy as np
import sys
import os
from rdkit.Chem import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from rdkit.Chem.Fingerprints import FingerprintMols
def compute_rmse(gt, pred):
    """Root-mean-squared error between ground-truth and predicted values.

    Computed as sqrt(MSE) rather than ``mean_squared_error(..., squared=False)``:
    the ``squared`` keyword was deprecated in scikit-learn 1.4 and removed in
    1.6, so this form works across versions (identical result for 1-D targets).
    """
    return float(mean_squared_error(gt, pred)) ** 0.5
def compute_r2score(gt, pred):
    """Coefficient of determination (R^2) of *pred* against ground truth *gt*."""
    return r2_score(y_true=gt, y_pred=pred)
def compute_roc_auc(gt, pred):
    """Area under the ROC curve; *gt* holds binary labels, *pred* holds scores."""
    return roc_auc_score(y_true=gt, y_score=pred)
def check_valid(smiles_list):
    """Fraction of valid molecules in *smiles_list*.

    Invalid molecules are represented by the empty string "".
    Returns 0.0 for an empty input list instead of raising
    ZeroDivisionError.
    """
    if not smiles_list:
        return 0.0
    empty_num = smiles_list.count("")
    return 1 - empty_num / float(len(smiles_list))
def check_unique(smiles_list):
    """Fraction of unique, valid (non-empty) SMILES relative to the total count.

    Returns 0.0 for an empty input list instead of raising
    ZeroDivisionError.
    """
    if not smiles_list:
        return 0.0
    unique = set(smiles_list)
    unique.discard("")  # invalid molecules don't count as unique entries
    return len(unique) / float(len(smiles_list))
def check_nolvelty(gen_smiles, train_smiles):
    """Percentage of generated SMILES that do not appear in the training set.

    Name (including the "nolvelty" typo) is kept for caller compatibility.
    Membership is tested against a set — O(1) per molecule — instead of
    scanning the training list once per generated molecule.
    """
    if not gen_smiles:
        return 0.
    train_set = set(train_smiles)
    novel = sum(1 for smi in gen_smiles if smi not in train_set)
    return novel * 100. / len(gen_smiles)
_fscores = None  # fragment-score lookup table, populated lazily by readFragmentScores
def readFragmentScores(name='fpscores'):
    """Load the gzipped, pickled fragment-score table into module-level
    ``_fscores`` as a {fragment_id: score} dict.

    The default table 'fpscores.pkl.gz' is resolved relative to this file;
    any other *name* is used as given. Each pickled entry is a sequence
    whose first element is the score and remaining elements are the
    fragment ids sharing that score.
    """
    import gzip
    global _fscores
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    raw = cPickle.load(gzip.open('%s.pkl.gz' % name))
    _fscores = {entry[j]: float(entry[0])
                for entry in raw
                for j in range(1, len(entry))}
def numBridgeheadsAndSpiro(mol, ri=None):
    """Return (n_bridgehead_atoms, n_spiro_atoms) for *mol*.

    The *ri* (ring-info) parameter is unused; it is kept for call-site
    compatibility with the original sascorer interface.
    """
    return (rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
            rdMolDescriptors.CalcNumSpiroAtoms(mol))
def calculateScore(m):
    """Synthetic-accessibility (SA) score of an RDKit Mol on a 1 (easy) to
    10 (hard) scale, following Ertl & Schuffenhauer, J. Cheminformatics
    1:8 (2009); adapted from RDKit's contrib/SA_Score/sascorer.py.

    Fix: the fingerprint loop called ``iteritems(fps)``, but the
    ``rdkit.six`` import providing it is commented out above (and gone
    from modern RDKit), so this raised NameError at runtime. Replaced
    with the Python 3 equivalent ``fps.items()``. Also renamed the
    rescaling bounds so the ``min``/``max`` builtins are not shadowed.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score: count-weighted mean of precomputed fragment scores
    # over the Morgan fingerprint; unknown fragments are penalized with -4
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # 2 is the *radius*
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():  # was iteritems(fps): NameError
        nf += v
        score1 += _fscores.get(bitId, -4) * v
    score1 /= nf

    # features score: penalties for size, stereo centers, spiro atoms,
    # bridgeheads and macrocycles
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for ring in ri.AtomRings():
        if len(ring) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #   macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = (0. - sizePenalty - stereoPenalty - spiroPenalty
              - bridgePenalty - macrocyclePenalty)

    # correction for the fingerprint density; not in the original
    # publication, added in version 1.1 to make highly symmetrical
    # molecules easier to synthetise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # transform the "raw" value onto a scale between 1 and 10
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # smooth the 10-end of the scale
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0
    return sascore
def compute_plogp(mol):
    """Penalized logP of an RDKit Mol (unnormalized variant):
    logP minus the SA score minus a penalty for rings larger than six atoms.
    """
    logp = Crippen.MolLogP(mol)
    sa_penalty = calculateScore(mol)
    # cycle penalty: number of atoms by which the largest ring in a cycle
    # basis of the molecular graph exceeds six (0 if no such ring)
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    largest_ring = max((len(ring) for ring in rings), default=0)
    ring_penalty = max(largest_ring - 6, 0)
    return logp - sa_penalty - ring_penalty
clf_model = None  # lazily-loaded DRD2 activity classifier (see load_model)
def load_model():
    """Load the pickled DRD2 classifier from this file's directory into the
    module-level ``clf_model``.

    NOTE(review): pickle.load executes arbitrary code from the file — only
    use with a trusted 'drd2_current.pkl'.
    """
    global clf_model
    model_path = op.join(op.dirname(__file__), 'drd2_current.pkl')
    with open(model_path, "rb") as fh:
        clf_model = pickle.load(fh)
def fingerprints_from_mol(mol):
    """Count-based Morgan feature fingerprint (radius 3) of *mol*, folded
    modulo 2048 bits, returned as a (1, 2048) int32 numpy array."""
    counts = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    n_bits = 2048
    folded = np.zeros((1, n_bits), np.int32)
    for bit_id, count in counts.GetNonzeroElements().items():
        folded[0, bit_id % n_bits] += int(count)
    return folded
def compute_drd2(mol):
    """Predicted probability that *mol* is DRD2-active, from the pickled
    classifier; returns 0.0 for a falsy (e.g. None) mol.

    The classifier is loaded lazily on first call via load_model().
    """
    if clf_model is None:
        load_model()
    if not mol:
        return 0.0
    fp = fingerprints_from_mol(mol)
    return float(clf_model.predict_proba(fp)[:, 1])
def compute_qed(mol):
    """Quantitative Estimate of Drug-likeness (QED) of an RDKit Mol."""
    return QED.qed(mol)
def compute_logp(mol):
    """Crippen logP (octanol-water partition coefficient estimate) of *mol*."""
    return Crippen.MolLogP(mol)
def compute_tpsa(mol):
    """Topological polar surface area (TPSA) of *mol*."""
    return rdMolDescriptors.CalcTPSA(mol)
def compute_sas(mol):
    """Synthetic-accessibility score of *mol* via RDKit's contrib sascorer.

    NOTE(review): compute_plogp uses the local calculateScore copy instead;
    the two implementations should agree but are maintained separately.
    """
    return sascorer.calculateScore(mol)
def check_valid_unique(smiles_list):
    """Return (valid_ratio, unique_ratio) for *smiles_list*.

    valid_ratio is the fraction of non-empty SMILES over the whole list;
    unique_ratio is the fraction of distinct SMILES over the *valid*
    entries only. Guards the empty-list and all-invalid cases (each
    undefined ratio is returned as 0.0) instead of raising
    ZeroDivisionError.
    """
    total = len(smiles_list)
    if total == 0:
        return 0.0, 0.0
    empty = smiles_list.count("")
    valid_ratio = 1 - empty / float(total)
    n_valid = total - empty
    if n_valid == 0:
        return valid_ratio, 0.0
    unique = set(smiles_list)
    unique.discard("")
    return valid_ratio, len(unique) / float(n_valid)
def get_similarity(smiles1, smiles2):
    """Tanimoto similarity of the RDKit topological fingerprints of two
    SMILES strings; NaN when either string is empty."""
    if "" in (smiles1, smiles2):
        return np.nan
    fp1 = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles1))
    fp2 = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles2))
    return TanimotoSimilarity(fp1, fp2)
def get_scaffold(smiles):
    """Bemis-Murcko scaffold of *smiles*, returned as a SMILES string."""
    return MurckoScaffoldSmiles(smiles)