File size: 3,432 Bytes
ad85063
 
45a96cc
ad85063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df2422f
ad85063
 
 
 
 
 
 
 
 
 
 
 
 
 
3cfc3d1
 
 
 
 
 
 
 
 
c6bc8a1
3cfc3d1
c6bc8a1
3cfc3d1
 
 
1249729
 
42325a4
ad85063
 
 
b8588c3
 
c8874ba
ad85063
b8588c3
ad85063
 
 
0930514
ad85063
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#from src.utils import plogp, sf_decode, sim
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Descriptors
import selfies as sf
from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer

def get_largest_ring_size(mol):
    """Return the number of atoms in the largest ring of ``mol``.

    ``mol`` must expose ``GetRingInfo().AtomRings()``, which is iterated as
    one sequence of atoms per ring. Returns 0 when there are no rings.
    """
    ring_sizes = [len(ring) for ring in mol.GetRingInfo().AtomRings()]
    return max(ring_sizes) if ring_sizes else 0

def plogp(smile):
    """Return the penalized logP of a SMILES string.

    The score is ``MolLogP - SA score - max(largest ring size - 6, 0)``.

    Args:
        smile: SMILES string; may be empty or ``None``.

    Returns:
        The penalized logP, or the sentinel ``-100`` when ``smile`` is empty,
        cannot be parsed by RDKit, or one of the component scores is missing.
    """
    if not smile:
        return -100

    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return -100

    log_p = Descriptors.MolLogP(mol)
    sas_score = sascorer.calculateScore(mol)
    # Test for "missing" explicitly: a logP of exactly 0.0 is a legitimate
    # value and must not be mistaken for a failed computation.
    if log_p is None or sas_score is None:
        return -100

    # Only rings larger than six atoms are penalized; an acyclic molecule has
    # ring size 0 and simply contributes no penalty.
    cycle_score = max(get_largest_ring_size(mol) - 6, 0)
    return log_p - sas_score - cycle_score
    
def sf_decode(selfies):
    """Decode a SELFIES string with ``sf.decoder``.

    Returns the decoder's result, or an empty string when the decoder raises
    ``sf.DecoderError``.
    """
    try:
        smiles = sf.decoder(selfies)
    except sf.DecoderError:
        smiles = ''
    return smiles
    
def sim(input_smile, output_smile):
    """Return the Tanimoto similarity between two SMILES strings.

    Both molecules are fingerprinted with ``AllChem.GetMorganFingerprint``
    at radius 2. Returns ``None`` when either string is empty/``None`` or
    when RDKit fails to parse either of them.
    """
    if not (input_smile and output_smile):
        return None

    reference_mol = Chem.MolFromSmiles(input_smile)
    candidate_mol = Chem.MolFromSmiles(output_smile)
    if not (reference_mol and candidate_mol):
        return None

    return DataStructs.TanimotoSimilarity(
        AllChem.GetMorganFingerprint(reference_mol, 2),
        AllChem.GetMorganFingerprint(candidate_mol, 2),
    )


# Hugging Face checkpoint used for candidate generation.
_MOLGEN_CHECKPOINT = "zjunlp/MolGen-large-opt"
# Columns of the result table, in display order.
_RESULT_COLUMNS = ["candidates", "improvement", "sim"]
# Filled on first use so the checkpoint is loaded once per process rather
# than once per request.
_MOLGEN_CACHE = {}


def _load_molgen():
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
    if "model" not in _MOLGEN_CACHE:
        # Load both before touching the cache so a failed model load does not
        # leave a half-populated cache behind.
        tokenizer = AutoTokenizer.from_pretrained(_MOLGEN_CHECKPOINT)
        model = AutoModelForSeq2SeqLM.from_pretrained(_MOLGEN_CHECKPOINT)
        _MOLGEN_CACHE["tokenizer"] = tokenizer
        _MOLGEN_CACHE["model"] = model
    return _MOLGEN_CACHE["tokenizer"], _MOLGEN_CACHE["model"]


def greet(name):
    """Generate candidate molecules for an input SELFIES string.

    Samples 10 sequences from the MolGen model, de-duplicates them, and scores
    each candidate by its penalized-logP change relative to the input and its
    Tanimoto similarity to the input.

    Args:
        name: Input molecule as a SELFIES string.

    Returns:
        A ``pandas.DataFrame`` with columns ``candidates`` (SELFIES),
        ``improvement`` and ``sim``, restricted to rows with
        ``improvement > 0`` and ``sim > 0.4``. The frame is empty when the
        input is blank or cannot be decoded to SMILES.
    """
    # Reject blank or undecodable input before paying for model loading and
    # generation; no candidate could pass the similarity filter anyway.
    if not name or not name.strip():
        return pd.DataFrame(columns=_RESULT_COLUMNS)
    input_smiles = sf_decode(name)
    if not input_smiles:
        return pd.DataFrame(columns=_RESULT_COLUMNS)

    tokenizer, model = _load_molgen()

    encoded = tokenizer(name, return_tensors="pt")
    generated = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        do_sample=True,
        max_length=100,
        min_length=5,
        top_k=30,
        top_p=1,
        num_return_sequences=10,
    )
    decoded = [
        tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        ).replace(" ", "")
        for sequence in generated
    ]
    # De-duplicate while keeping generation order (a set would reorder rows
    # arbitrarily between runs).
    candidates = list(dict.fromkeys(decoded))
    candidate_smiles = [sf_decode(candidate) for candidate in candidates]

    baseline_plogp = plogp(input_smiles)
    improvements = [plogp(smiles) - baseline_plogp for smiles in candidate_smiles]

    # sim() returns None for unparsable molecules; map that to NaN so the
    # column stays numeric and the threshold comparison below cannot raise.
    nan = float("nan")
    raw_similarities = [sim(smiles, input_smiles) for smiles in candidate_smiles]
    similarities = [nan if value is None else value for value in raw_similarities]

    data = pd.DataFrame(
        {
            "candidates": candidates,
            "improvement": improvements,
            "sim": similarities,
        }
    )
    return data[(data["improvement"] > 0) & (data["sim"] > 0.4)]

  
    



# Example SELFIES inputs offered in the UI; each row holds the single text input.
examples = [
    ['[C][C][=Branch1][C][=O][N][C][C][O][C][C][O][C][C][O][C][C][Ring1][N]'],
    ['[C][C][S][C][C][S][C][C][C][S][C][C][S][C][Ring1][=C]'],
]

# Wrap `greet` in a text-in / table-out Gradio interface and launch it as soon
# as this module is executed.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="numpy",
    title="Molecular Language Model as Multi-task Generator",
    examples=examples,
)
iface.launch()