Spaces: Running on Zero

Commit edaff0a · update the reaction
Parent: 5362c33

Files changed:
- app.py (+249, -0)
- requirements.txt (+9, -0)
- utils.py (+280, -0)
app.py
ADDED
@@ -0,0 +1,249 @@
import gradio as gr
from huggingface_hub import HfApi, get_collection, list_collections, list_models
#from utils import MolecularPropertyPredictionModel, dataset_task_types, dataset_descriptions, dataset_property_names, dataset_property_names_to_dataset
from utils import ReactionPredictionModel
import pandas as pd
import os
import spaces

def get_models():
    # we only support two models:
    # 1. ChemFM/uspto_mit_synthesis
    # 2. ChemFM/uspto_full_retro
    models = dict()
    models['mit_synthesis'] = 'ChemFM/uspto_mit_synthesis'
    models['full_retro'] = 'ChemFM/uspto_full_retro'

    #for item in collection.items:
    #    if item.item_type == "model":
    #        item_name = item.item_id.split("/")[-1]
    #        models[item_name] = item.item_id
    #        assert item_name in dataset_task_types, f"{item_name} is not in the task_types"
    #        assert item_name in dataset_descriptions, f"{item_name} is not in the dataset_descriptions"

    return models

candidate_models = get_models()
task_names = {
    'mit_synthesis': 'Reaction Synthesis',
    'full_retro': 'Reaction Retro Synthesis'
}
task_names_to_tasks = {v: k for k, v in task_names.items()}
tasks = list(candidate_models.keys())
task_descriptions = {
    'mit_synthesis': 'Predict the reaction products given the reactants and reagents (reactants and reagents are mixed; different compounds are separated by ".").' + \
                     ' E.g., C1CCOC1.N#Cc1ccsc1N.O=[N+]([O-])c1cc(F)c(F)cc1F.[H-].[Na+]',
    'full_retro': 'Predict the reaction precursors given the reaction products (different compounds are separated by ".").'
}

#property_names = list(candidate_models.keys())
model = ReactionPredictionModel(candidate_models)
#model = MolecularPropertyPredictionModel(candidate_models)

def get_description(task_name):
    task = task_names_to_tasks[task_name]
    return task_descriptions[task]

#@spaces.GPU(duration=10)
def predict_single_label(smiles, task_name):
    task = task_names_to_tasks[task_name]

    try:
        running_status = None

        #prediction = model.predict(smiles, property_name, adapter_id)
        prediction = model.predict_single_smiles(smiles, task)
        if prediction is None:
            return "NA", "Invalid SMILES string"

    except Exception as e:
        # no matter what the error is, we should return
        print(e)
        return "NA", "Prediction failed"

    # predict_single_smiles returns a {smiles: score} rank dict; list the
    # candidates from the highest aggregated score down
    ranked = sorted(prediction, key=prediction.get, reverse=True)
    prediction = "\n".join([f"{idx+1}. {item}" for idx, item in enumerate(ranked)])
    return prediction, "Prediction is done"

"""
@spaces.GPU(duration=30)
def predict_file(file, property_name):
    property_id = dataset_property_names_to_dataset[property_name]
    try:
        adapter_id = candidate_models[property_id]
        info = model.swith_adapter(property_id, adapter_id)

        running_status = None
        if info == "keep":
            running_status = "Adapter is the same as the current one"
            #print("Adapter is the same as the current one")
        elif info == "switched":
            running_status = "Adapter is switched successfully"
            #print("Adapter is switched successfully")
        elif info == "error":
            running_status = "Adapter is not found"
            #print("Adapter is not found")
            return None, None, file, running_status
        else:
            running_status = "Unknown error"
            return None, None, file, running_status

        df = pd.read_csv(file)
        # we have already checked the file contains the "smiles" column
        df = model.predict_file(df, dataset_task_types[property_id])
        # we should save this file to the disk to be downloaded
        # rename the file to have a "_prediction" suffix
        prediction_file = file.replace(".csv", "_prediction.csv") if file.endswith(".csv") else file.replace(".smi", "_prediction.csv")
        print(file, prediction_file)
        # save the file to the disk
        df.to_csv(prediction_file, index=False)
    except Exception as e:
        # no matter what the error is, we should return
        print(e)
        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), file, "Prediction failed"

    return gr.update(visible=False), gr.DownloadButton(label="Download", value=prediction_file, visible=True), gr.update(visible=False), prediction_file, "Prediction is done"

def validate_file(file):
    try:
        if file.endswith(".csv"):
            df = pd.read_csv(file)
            if "smiles" not in df.columns:
                # we should clear the file input
                return "Invalid file content. The csv file must contain a column named 'smiles'", \
                       None, gr.update(visible=False), gr.update(visible=False)

            # check the number of SMILES
            length = len(df["smiles"])

        elif file.endswith(".smi"):
            return "Invalid file extension", \
                   None, gr.update(visible=False), gr.update(visible=False)

        else:
            return "Invalid file extension", \
                   None, gr.update(visible=False), gr.update(visible=False)
    except Exception as e:
        return "Invalid file content.", \
               None, gr.update(visible=False), gr.update(visible=False)

    if length > 100:
        return "The space does not support files containing more than 100 SMILES", \
               None, gr.update(visible=False), gr.update(visible=False)

    return "Valid file", file, gr.update(visible=True), gr.update(visible=False)
"""


def raise_error(status):
    if status != "Valid file":
        raise gr.Error(status)
    return None


"""
def clear_file(download_button):
    # we might need to delete the prediction file and the uploaded file
    prediction_path = download_button
    print(prediction_path)
    if prediction_path and os.path.exists(prediction_path):
        os.remove(prediction_path)
        original_data_file_0 = prediction_path.replace("_prediction.csv", ".csv")
        original_data_file_1 = prediction_path.replace("_prediction.csv", ".smi")
        if os.path.exists(original_data_file_0):
            os.remove(original_data_file_0)
        if os.path.exists(original_data_file_1):
            os.remove(original_data_file_1)
    #if os.path.exists(file):
    #    os.remove(file)
    #prediction_file = file.replace(".csv", "_prediction.csv") if file.endswith(".csv") else file.replace(".smi", "_prediction.csv")
    #if os.path.exists(prediction_file):
    #    os.remove(prediction_file)

    return gr.update(visible=False), gr.update(visible=False), None
"""

def build_inference():

    with gr.Blocks() as demo:
        # first row - Dropdown input
        #with gr.Row():
        #    gr.Markdown(f"<span style='color: red;'>If you run out of your GPU quota, you can use the </span> <a href='https://huggingface.co/spaces/ChemFM/molecular_property_prediction'>CPU-powered space</a> but with much lower performance.")
        dropdown = gr.Dropdown([task_names[key] for key in tasks], label="Task", value=task_names[tasks[0]])
        description_box = gr.Textbox(label="Task description", lines=5,
                                     interactive=False,
                                     value=task_descriptions[tasks[0]])
        # third row - Textbox input and prediction label
        #with gr.Row(equal_height=True):
        #    with gr.Column():
        textbox = gr.Textbox(label="Reactants (Products) SMILES string", type="text", placeholder="Provide a SMILES string here",
                             lines=1)
        predict_single_smiles_button = gr.Button("Predict", size='sm')
        #prediction = gr.Label("Prediction will appear here")
        prediction = gr.Textbox(label="Predictions", type="text", placeholder=None, lines=10, interactive=False)

        running_terminal_label = gr.Textbox(label="Running status", type="text", placeholder=None, lines=10, interactive=False)

        #input_file = gr.File(label="Molecule file",
        #                     file_count='single',
        #                     file_types=[".smi", ".csv"], height=300)
        #predict_file_button = gr.Button("Predict", size='sm', visible=False)
        #download_button = gr.DownloadButton("Download", size='sm', visible=False)
        #stop_button = gr.Button("Stop", size='sm', visible=False)

        # dropdown change event
        dropdown.change(get_description, inputs=dropdown, outputs=description_box)
        # predict single button click event: lock the inputs, run the
        # prediction, then unlock the inputs again
        predict_single_smiles_button.click(lambda: (gr.update(interactive=False),
                                                    gr.update(interactive=False),
                                                    gr.update(interactive=False),
                                                    gr.update(interactive=False),
                                                    ), outputs=[dropdown, textbox, predict_single_smiles_button, running_terminal_label])\
            .then(predict_single_label, inputs=[textbox, dropdown], outputs=[prediction, running_terminal_label])\
            .then(lambda: (gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           ), outputs=[dropdown, textbox, predict_single_smiles_button, running_terminal_label])
        """
        # input file upload event
        file_status = gr.State()
        input_file.upload(fn=validate_file, inputs=input_file, outputs=[file_status, input_file, predict_file_button, download_button]).success(raise_error, inputs=file_status, outputs=file_status)
        # input file clear event
        input_file.clear(fn=clear_file, inputs=[download_button], outputs=[predict_file_button, download_button, input_file])
        # predict file button click event
        predict_file_event = predict_file_button.click(lambda: (gr.update(interactive=False),
                                                                gr.update(interactive=False),
                                                                gr.update(interactive=False),
                                                                gr.update(interactive=False, visible=True),
                                                                gr.update(interactive=False),
                                                                gr.update(interactive=True, visible=False),
                                                                gr.update(interactive=False),
                                                                gr.update(interactive=False),
                                                                ), outputs=[dropdown, textbox, predict_single_smiles_button, predict_file_button, download_button, stop_button, input_file, running_terminal_label])\
            .then(predict_file, inputs=[input_file, dropdown], outputs=[predict_file_button, download_button, stop_button, input_file, running_terminal_label])\
            .then(lambda: (gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           gr.update(interactive=True),
                           ), outputs=[dropdown, textbox, predict_single_smiles_button, predict_file_button, download_button, stop_button, input_file, running_terminal_label])

        # stop button click event
        #stop_button.click(fn=None, inputs=None, outputs=None, cancels=[predict_file_event])
        """

    return demo


demo = build_inference()

if __name__ == '__main__':
    demo.launch()
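
A minimal smoke-test sketch for the single-molecule path (not part of the commit): it calls the same function the "Predict" button triggers, assuming the checkpoints load and a TOKEN environment variable is set. The example product SMILES is borrowed from the sample list commented out in utils.py below.

    # hypothetical local check; importing app builds the model at module level
    from app import predict_single_label

    text, status = predict_single_label(
        "N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]",  # a single product, for retrosynthesis
        "Reaction Retro Synthesis",               # display name, mapped back to 'full_retro'
    )
    print(status)  # "Prediction is done" on success
    print(text)    # numbered candidate precursors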
requirements.txt
ADDED
@@ -0,0 +1,9 @@
transformers
torch
huggingface_hub
pandas
peft
tqdm
datasets
rdkit
scikit-learn
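
The dependencies are unpinned, so the Space resolves the latest releases at build time. For a local run, `pip install -r requirements.txt` followed by `python app.py` should bring the demo up, provided a CUDA device is available (both models are moved to "cuda") and a TOKEN environment variable grants access to the ChemFM checkpoints (the code passes os.environ.get("TOKEN") to every Hub call).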
utils.py
ADDED
@@ -0,0 +1,280 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import os
from typing import Optional, Dict, Sequence
import transformers
from peft import PeftModel
import torch
from dataclasses import dataclass, field
from huggingface_hub import hf_hub_download
import json
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
import spaces

from rdkit import RDLogger, Chem
# Suppress RDKit INFO messages
RDLogger.DisableLog('rdApp.*')

DEFAULT_PAD_TOKEN = "[PAD]"
device_map = "cpu"

def compute_rank(prediction, raw=False, alpha=1.0):
    # each prediction[j] is the (ordered) list of beam candidates for one
    # decoding pass; empty strings mark candidates that failed SMILES parsing
    valid_score = [[k for k in range(len(prediction[j]))] for j in range(len(prediction))]
    invalid_rates = [0 for k in range(len(prediction[0]))]
    rank = {}
    highest = {}

    for j in range(len(prediction)):
        # push invalid candidates to the end and count them per beam position
        for k in range(len(prediction[j])):
            if prediction[j][k] == "":
                valid_score[j][k] = 10 + 1
                invalid_rates[k] += 1
        # drop invalid candidates and deduplicate while keeping beam order
        de_error = [i[0] for i in sorted(list(zip(prediction[j], valid_score[j])), key=lambda x: x[1]) if i[0] != ""]
        prediction[j] = list(set(de_error))
        prediction[j].sort(key=de_error.index)
        # reciprocal-rank-style aggregation across the candidate lists
        for k, data in enumerate(prediction[j]):
            if data in rank:
                rank[data] += 1 / (alpha * k + 1)
            else:
                rank[data] = 1 / (alpha * k + 1)
            if data in highest:
                highest[data] = min(k, highest[data])
            else:
                highest[data] = k
    return rank, invalid_rates


@dataclass
class DataCollatorForCausalLMEval(object):
    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    reactant_start_str: str
    product_start_str: str
    end_str: str

    def augment_molecule(self, molecule: str) -> str:
        # unused in this app; assumes a SMILES augmenter at self.sme
        return self.sme.augment([molecule])[0]

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:

        srcs = instances[0]['src']
        # each instance stores the task type as a plain string, so no extra
        # indexing is needed here
        task_type = instances[0]['task_type']

        if task_type == 'retrosynthesis':
            src_start_str = self.product_start_str
            tgt_start_str = self.reactant_start_str
        else:
            src_start_str = self.reactant_start_str
            tgt_start_str = self.product_start_str

        generation_prompts = []
        generation_prompt = f"{src_start_str}{srcs}{self.end_str}{tgt_start_str}"
        generation_prompts.append(generation_prompt)

        data_dict = {
            'generation_prompts': generation_prompts
        }

        return data_dict

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
    non_special_tokens=None,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + (tokenizer.add_tokens(non_special_tokens) if non_special_tokens else 0)
    num_old_tokens = model.get_input_embeddings().weight.shape[0]
    num_new_tokens = len(tokenizer) - num_old_tokens
    if num_new_tokens == 0:
        return

    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data

        # initialize the new embeddings with the mean of the existing ones
        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
    print(f"Resized tokenizer and embedding from {num_old_tokens} to {len(tokenizer)} tokens.")

class ReactionPredictionModel():
    def __init__(self, candidate_models):

        # both checkpoints share the same tokenizer, so load it once
        self.tokenizer = AutoTokenizer.from_pretrained(
            candidate_models[list(candidate_models.keys())[0]],
            padding_side="right",
            use_fast=True,
            trust_remote_code=True,
            token=os.environ.get("TOKEN")
        )
        for model in candidate_models:
            if "retro" in model:
                self.load_retro_model(candidate_models[model])
            else:
                self.load_forward_model(candidate_models[model])

        string_template_path = hf_hub_download(candidate_models[list(candidate_models.keys())[0]], filename="string_template.json", token=os.environ.get("TOKEN"))
        string_template = json.load(open(string_template_path, 'r'))
        reactant_start_str = string_template['REACTANTS_START_STRING']
        product_start_str = string_template['PRODUCTS_START_STRING']
        end_str = string_template['END_STRING']
        self.data_collator = DataCollatorForCausalLMEval(
            tokenizer=self.tokenizer,
            source_max_len=512,
            target_max_len=512,
            reactant_start_str=reactant_start_str,
            product_start_str=product_start_str,
            end_str=end_str,
        )

    def load_retro_model(self, model_path):
        # our retro model is a LoRA adapter on top of the ChemFM-3B base model
        config = AutoConfig.from_pretrained(
            "ChemFM/ChemFM-3B",
            trust_remote_code=True,
            token=os.environ.get("TOKEN")
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            "ChemFM/ChemFM-3B",
            config=config,
            trust_remote_code=True,
            device_map=device_map,
            token=os.environ.get("TOKEN")
        )

        # we should resize the embedding layer of the base model to match the adapter's tokenizer
        special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=special_tokens_dict,
            tokenizer=self.tokenizer,
            model=base_model
        )
        base_model.config.pad_token_id = self.tokenizer.pad_token_id

        # load the adapter model
        self.retro_model = PeftModel.from_pretrained(
            base_model,
            model_path,
            token=os.environ.get("TOKEN")
        )

        self.retro_model.to("cuda")

    def load_forward_model(self, model_path):
        config = AutoConfig.from_pretrained(
            model_path,
            device_map=device_map,
            trust_remote_code=True,
            token=os.environ.get("TOKEN")
        )

        self.forward_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            config=config,
            device_map=device_map,
            trust_remote_code=True,
            token=os.environ.get("TOKEN")
        )

        # the fine-tuned tokenizer can differ in size from the pretrained one, and we also need to add a PAD token
        special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=special_tokens_dict,
            tokenizer=self.tokenizer,
            model=self.forward_model
        )
        self.forward_model.config.pad_token_id = self.tokenizer.pad_token_id
        self.forward_model.to("cuda")

    @spaces.GPU(duration=20)
    def predict_single_smiles(self, smiles, task_type):
        # retrosynthesis expects a single product, not a mixture
        if task_type == "full_retro":
            if "." in smiles:
                return None

        task_type = "retrosynthesis" if task_type == "full_retro" else "synthesis"
        # canonicalize the smiles
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        smiles = Chem.MolToSmiles(mol)

        smiles_list = [smiles]
        task_type_list = [task_type]

        df = pd.DataFrame({"src": smiles_list, "task_type": task_type_list})
        test_dataset = Dataset.from_pandas(df)
        # construct the dataloader
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=1,
            collate_fn=self.data_collator,
        )

        predictions = []
        for batch in tqdm(test_loader, total=len(test_loader), desc="Evaluating"):
            with torch.no_grad():
                # both branches use identical generation settings, so select
                # the model once and move the inputs to its device
                generation_model = self.retro_model if task_type == "retrosynthesis" else self.forward_model
                generation_prompts = batch['generation_prompts'][0]
                inputs = self.tokenizer(generation_prompts, return_tensors="pt", padding=True, truncation=True).to(generation_model.device)
                inputs.pop('token_type_ids', None)  # not every tokenizer emits token_type_ids
                outputs = generation_model.generate(**inputs, max_length=512, num_return_sequences=10,
                                                    do_sample=False, num_beams=10,
                                                    eos_token_id=self.tokenizer.eos_token_id,
                                                    early_stopping='never',
                                                    pad_token_id=self.tokenizer.pad_token_id,
                                                    length_penalty=0.0,
                                                    )

                # decode only the newly generated tokens, not the prompt
                original_smiles_list = self.tokenizer.batch_decode(outputs[:, len(inputs['input_ids'][0]):],
                                                                   skip_special_tokens=True)
                original_smiles_list = map(lambda x: x.replace(" ", ""), original_smiles_list)
                # canonicalize the SMILES; invalid candidates become ""
                canonized_smiles_list = []
                for original_smiles in original_smiles_list:
                    try:
                        canonized_smiles_list.append(Chem.MolToSmiles(Chem.MolFromSmiles(original_smiles)))
                    except Exception:
                        canonized_smiles_list.append("")
                #canonized_smiles_list = \
                #['N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]', 'N#Cc1ccsc1Nc1cc(F)c([N+](=O)[O-])cc1F', 'N#Cc1ccsc1Nc1cc(Cl)c(F)cc1[N+](=O)[O-]', 'N#Cc1cnsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]', 'N#Cc1cc(F)c(F)cc1Nc1sccc1C#N', 'N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=N)[O-]', 'N#Cc1cc(C#N)c(Nc2cc(F)c(F)cc2[N+](=O)[O-])s1', 'N#Cc1ccsc1Nc1c(F)c(F)cc(F)c1[N+](=O)[O-]', 'Nc1sccc1CNc1cc(F)c(F)cc1[N+](=O)[O-]', 'N#Cc1ccsc1Nc1ccc(F)cc1[N+](=O)[O-]']
                predictions.append(canonized_smiles_list)

        rank, invalid_rate = compute_rank(predictions)
        return rank
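
For intuition, a small worked example of compute_rank's aggregation (made-up candidate lists; "" marks a candidate that failed SMILES parsing):

    preds = [["CCO", "CCN", ""],       # candidates from one decoding pass
             ["CCN", "CCO", "CCC"]]    # candidates from another pass
    rank, invalid_rates = compute_rank(preds)
    # with alpha=1.0 each list contributes 1/(position+1) per candidate:
    # rank          == {"CCO": 1.5, "CCN": 1.5, "CCC": 1/3}
    # invalid_rates == [0, 0, 1]  (one invalid candidate at beam position 2)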