# NOTE(review): commented out extraction artifacts (file-size line, git-blame
# commit hashes, and a line-number gutter) that were not Python and broke parsing.
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#from src.utils import plogp, sf_decode, sim
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Descriptors
import selfies as sf
from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer
def get_largest_ring_size(mol):
    """Return the number of atoms in the largest ring of *mol* (0 if acyclic)."""
    rings = mol.GetRingInfo().AtomRings()
    # max(..., default=0) covers molecules with no rings in one expression.
    return max((len(ring) for ring in rings), default=0)
def plogp(smile):
    """Return the penalized logP score of a SMILES string.

    p_logP = logP - SA score - cycle penalty, where the cycle penalty is
    the number of atoms by which the largest ring exceeds six.

    Returns -100 (sentinel used throughout this app) when the SMILES is
    empty or cannot be parsed by RDKit.
    """
    if not smile:
        return -100
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return -100
    log_p = Descriptors.MolLogP(mol)
    sas_score = sascorer.calculateScore(mol)
    cycle_score = max(get_largest_ring_size(mol) - 6, 0)
    # BUG FIX: the original guarded the computation with
    # `if log_p and sas_score and largest_ring_size:`, which returned -100
    # for every acyclic molecule (largest ring size 0 is falsy) and for a
    # molecule whose logP is exactly 0.0. The score is well defined in both
    # cases, so compute it unconditionally.
    return log_p - sas_score - cycle_score
def sf_decode(selfies):
    """Decode a SELFIES string to SMILES; return '' when decoding fails."""
    try:
        return sf.decoder(selfies)
    except sf.DecoderError:
        # Invalid SELFIES are mapped to the empty string so downstream
        # scoring treats them as unparseable molecules.
        return ''
def sim(input_smile, output_smile):
    """Tanimoto similarity between two SMILES using Morgan (radius 2) fingerprints.

    Returns None when either SMILES is missing or fails to parse.
    """
    # Guard clauses instead of nested ifs: bail out early on bad input.
    if not input_smile or not output_smile:
        return None
    mol_a = Chem.MolFromSmiles(input_smile)
    mol_b = Chem.MolFromSmiles(output_smile)
    if mol_a is None or mol_b is None:
        return None
    fp_a = AllChem.GetMorganFingerprint(mol_a, 2)
    fp_b = AllChem.GetMorganFingerprint(mol_b, 2)
    return DataStructs.TanimotoSimilarity(fp_a, fp_b)
def greet(name):
    """Generate candidate molecules similar to *name* (a SELFIES string).

    Samples 10 sequences from the MolGen-large-opt seq2seq model, decodes
    them to SMILES, and returns a pandas DataFrame of candidates whose
    penalized-logP improvement is > 0 and Tanimoto similarity is > 0.4.
    """
    # PERF FIX: the original re-downloaded / re-instantiated the tokenizer
    # and model on every request. Cache them on the function object after
    # the first call (lazy, no module-level side effect at import time).
    if not hasattr(greet, "_hf"):
        greet._hf = (
            AutoTokenizer.from_pretrained("zjunlp/MolGen-large-opt"),
            AutoModelForSeq2SeqLM.from_pretrained("zjunlp/MolGen-large-opt"),
        )
    tokenizer, model = greet._hf

    # Renamed `input` -> `encoded`: the original shadowed the builtin.
    encoded = tokenizer(name, return_tensors="pt")
    molecules = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        do_sample=True,
        max_length=100,
        min_length=5,
        top_k=30,
        top_p=1,
        num_return_sequences=10,
    )
    # Deduplicate decoded SELFIES (sampling can repeat sequences).
    sf_output = list({
        tokenizer.decode(
            g, skip_special_tokens=True, clean_up_tokenization_spaces=True
        ).replace(" ", "")
        for g in molecules
    })

    input_sm = sf_decode(name)
    # Loop variable renamed `s` (was `sf`): the original shadowed the
    # imported `selfies as sf` module inside the comprehension.
    sm_output = [sf_decode(s) for s in sf_output]
    input_plogp = plogp(input_sm)
    plogp_improve = [plogp(s) - input_plogp for s in sm_output]
    simm = [sim(s, input_sm) for s in sm_output]

    data = pd.DataFrame(
        {"candidates": sf_output, "improvement": plogp_improve, "sim": simm}
    )
    # Keep only candidates that improve penalized logP and stay similar
    # enough to the input molecule.
    return data[(data["improvement"] > 0) & (data["sim"] > 0.4)]
# Example SELFIES inputs shown in the Gradio UI.
examples = [
    ['[C][C][=Branch1][C][=O][N][C][C][O][C][C][O][C][C][O][C][C][Ring1][N]'],
    ['[C][C][S][C][C][S][C][C][C][S][C][C][S][C][Ring1][=C]'],
]
# Text in, DataFrame-as-array out; blocks here serving the web UI.
# BUG FIX: removed a stray trailing `|` after iface.launch() (extraction
# artifact) that made the script a syntax error.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="numpy",
    title="Molecular Language Model as Multi-task Generator",
    examples=examples,
)
iface.launch()