from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from rdkit.Chem import QED, Crippen, MolFromSmiles, rdmolops, rdMolDescriptors, AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
import networkx as nx
import os.path as op
import math
import _pickle as cPickle
from rdkit import Chem
import pickle
import numpy as np

import sys
import os
from rdkit.Chem import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer

from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from rdkit.Chem.Fingerprints import FingerprintMols

def compute_rmse(gt, pred):
    # RMSE via sqrt(MSE); avoids the deprecated (and later removed) `squared=False`
    # argument of mean_squared_error in newer scikit-learn versions.
    return float(np.sqrt(mean_squared_error(gt, pred)))

def compute_r2score(gt, pred):
    return r2_score(gt, pred)

def compute_roc_auc(gt, pred):
    return roc_auc_score(gt, pred)

def check_valid(smiles_list):
    total_num = len(smiles_list)
    empty_num = smiles_list.count("")
    return 1 - empty_num / float(total_num)

def check_unique(smiles_list):
    total_num = len(smiles_list)
    smiles_set = set(smiles_list)
    if "" in smiles_set:
        smiles_set.remove("")
    return len(smiles_set) / float(total_num)

def check_nolvelty(gen_smiles, train_smiles):
    # Percentage (0-100) of generated SMILES that do not appear in the training set.
    if len(gen_smiles) == 0:
        novel_ratio = 0.
    else:
        duplicates = [1 for mol in gen_smiles if mol in train_smiles]
        novel = len(gen_smiles) - sum(duplicates)
        novel_ratio = novel*100./len(gen_smiles)
    return novel_ratio

_fscores = None
def readFragmentScores(name='fpscores'):
    import gzip
    global _fscores
    # generate the full path filename:
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    _fscores = cPickle.load(gzip.open('%s.pkl.gz'%name))
    outDict = {}
    for i in _fscores:
        for j in range(1,len(i)):
            outDict[i[j]] = float(i[0])
    _fscores = outDict

def numBridgeheadsAndSpiro(mol,ri=None):
    nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    return nBridgehead,nSpiro

def calculateScore(m):
    if _fscores is None: readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m,2)  #<- 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():
        nf += v
        sfp = bitId
        score1 += _fscores.get(sfp,-4)*v
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m,includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads,nSpiro=numBridgeheadsAndSpiro(m,ri)
    nMacrocycles=0
    for x in ri.AtomRings():
        if len(x)>8: nMacrocycles+=1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters+1)
    spiroPenalty = math.log10(nSpiro+1)
    bridgePenalty = math.log10(nBridgeheads+1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0: macrocyclePenalty = math.log10(2)

    score2 = 0. -sizePenalty -stereoPenalty -spiroPenalty -bridgePenalty -macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesize
    score3 = 0.
    if nAtoms > len(fps):
      score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    min = -4.0
    max = 2.5
    sascore = 11. - (sascore - min + 1) / (max - min) * 9.
    # smooth the 10-end
    if sascore > 8.: sascore = 8. + math.log(sascore+1.-9.)
    if sascore > 10.: sascore = 10.0
    elif sascore < 1.: sascore = 1.0 

    return sascore

def compute_plogp(mol):
    # Penalized logP: Crippen logP minus the synthetic-accessibility score and a
    # penalty for rings with more than six atoms. This is the unnormalized variant;
    # the original formulation standardizes each term by training-set mean/std.
    logp = Crippen.MolLogP(mol)
    SA_score = -calculateScore(mol)
    cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    if len(cycle_list) == 0:
        cycle_length = 0
    else:
        cycle_length = max([len(j) for j in cycle_list])
    if cycle_length <= 6:
        cycle_length = 0
    else:
        cycle_length = cycle_length - 6

    cycle_score = -cycle_length
    plogp = logp + SA_score + cycle_score
    return plogp

clf_model = None
def load_model():
    global clf_model
    # Load the pickled DRD2 activity classifier shipped alongside this module.
    name = op.join(op.dirname(__file__), 'drd2_current.pkl')
    with open(name, "rb") as f:
        clf_model = pickle.load(f)

def fingerprints_from_mol(mol):
    # Count-based Morgan/feature fingerprint (radius 3) folded into a length-2048
    # vector; this is the featurization consumed by compute_drd2 below.
    fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    size = 2048
    nfp = np.zeros((1, size), np.int32)
    for idx,v in fp.GetNonzeroElements().items():
        nidx = idx%size
        nfp[0, nidx] += int(v)
    return nfp

def compute_drd2(mol):
    if clf_model is None:
        load_model()

    # Predicted probability of DRD2 activity from the classifier; 0.0 for invalid molecules.
    if mol:
        fp = fingerprints_from_mol(mol)
        score = clf_model.predict_proba(fp)[:, 1]
        return float(score)
    return 0.0

def compute_qed(mol):
    return QED.qed(mol)

def compute_logp(mol):
    return Crippen.MolLogP(mol)

def compute_tpsa(mol):
    return rdMolDescriptors.CalcTPSA(mol)

def compute_sas(mol):
    return sascorer.calculateScore(mol)


def check_valid_unique(smiles_list):
    total_num = len(smiles_list)
    empty_num = smiles_list.count("")

    smiles_set = set(smiles_list)
    if "" in smiles_set:
        smiles_set.remove("")
    valid_ratio = 1 - empty_num / float(total_num)
    # Uniqueness is measured over valid (non-empty) SMILES only; guard against all-invalid input.
    unique_ratio = len(smiles_set) / float(total_num - empty_num) if total_num > empty_num else 0.0
    return valid_ratio, unique_ratio

def get_similarity(smiles1, smiles2):
    if smiles1 == "" or smiles2 == "":
        return np.nan
    sim = TanimotoSimilarity(FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles1)),
                             FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles2)))
    return sim

def get_scaffold(smiles):
    scaffold = MurckoScaffoldSmiles(smiles)
    return scaffold
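

# Minimal usage sketch (illustrative only, guarded so it does not run on import).
# Aspirin and phenol SMILES are used as hypothetical inputs; compute_plogp and
# compute_drd2 are skipped here because they additionally expect 'fpscores.pkl.gz'
# and 'drd2_current.pkl' to be present next to this module.
if __name__ == "__main__":
    smiles = "CC(=O)Oc1ccccc1C(=O)O"  # aspirin
    mol = MolFromSmiles(smiles)
    print("QED:     ", compute_qed(mol))
    print("logP:    ", compute_logp(mol))
    print("TPSA:    ", compute_tpsa(mol))
    print("SA score:", compute_sas(mol))
    print("scaffold:", get_scaffold(smiles))
    print("valid, unique:", check_valid_unique([smiles, smiles, ""]))
    print("similarity to phenol:", get_similarity(smiles, "Oc1ccccc1"))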