feiyang-cai committed on
Commit
1d1d4f3
·
1 Parent(s): 1e001e8
Files changed (5)
  1. .gitignore +1 -0
  2. app.py +62 -83
  3. llama_customized_models.py +154 -0
  4. metric_calculator.py +213 -0
  5. utils.py +269 -190
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/*
app.py CHANGED
@@ -1,31 +1,11 @@
  import gradio as gr
  from huggingface_hub import HfApi, get_collection, list_collections, list_models
  #from utils import MolecularPropertyPredictionModel, dataset_task_types, dataset_descriptions, dataset_property_names, dataset_property_names_to_dataset
- from utils import ReactionPredictionModel
  import pandas as pd
  import os
  import spaces

- def get_models():
-     # we only support two models:
-     # 1. ChemFM/uspto_mit_synthesis
-     # 2. ChemFM/uspto_full_retro
-     models = dict()
-     models['mit_synthesis'] = 'ChemFM/uspto_mit_synthesis'
-     models['full_retro'] = 'ChemFM/uspto_full_retro'
-
-     #for item in collection.items:
-     #    if item.item_type == "model":
-     #        item_name = item.item_id.split("/")[-1]
-     #        models[item_name] = item.item_id
-     #        assert item_name in dataset_task_types, f"{item_name} is not in the task_types"
-     #        assert item_name in dataset_descriptions, f"{item_name} is not in the dataset_descriptions"
-
-     return models
-
  #candidate_models = get_models()
  #task_names = {
  #    'mit_synthesis': 'Reaction Synthesis',
@@ -46,16 +26,30 @@ def get_models():
  #}

  #property_names = list(candidate_models.keys())
- #model = ReactionPredictionModel(candidate_models)
- #model = MolecularPropertyPredictionModel(candidate_models)

- def predict_single_label(value_1, value_2, value_3, value_4):
-     print(value_1, value_2, value_3, value_4)

      try:
          running_status = None
          prediction = None

          #prediction = model.predict(smiles, property_name, adapter_id)
          #prediction = model.predict_single_smiles(smiles, task)
@@ -65,10 +59,10 @@ def predict_single_label(value_1, value_2, value_3, value_4):
      except Exception as e:
          # no matter what the error is, we should return
          print(e)
-         return "NA", "Prediction failed"

-     prediction = "\n".join([f"{idx+1}. {item}" for idx, item in enumerate(prediction)])
-     return prediction, "Prediction is done"

  """
  def get_description(task_name):
@@ -177,6 +171,13 @@ def clear_file(download_button):
      return gr.update(visible=False), gr.update(visible=False), None
  """

  def build_inference():

      with gr.Blocks() as demo:
@@ -184,7 +185,11 @@ def build_inference():
          #with gr.Row():
              #gr.Markdown(f"<span style='color: red;'>If you run out of your GPU quota, you can use the </span> <a href='https://huggingface.co/spaces/ChemFM/molecular_property_prediction'>CPU-powered space</a> but with much lower performance.")
          #dropdown = gr.Dropdown([task_names[key] for key in tasks], label="Task", value=task_names[tasks[0]])
-         description = f"Generate 10 possible molecules based on the given conditions. \n"

          description_box = gr.Textbox(label="Task description", lines=5,
                                       interactive=False,
@@ -192,80 +197,54 @@ def build_inference():
          # third row - Textbox input and prediction label
          with gr.Row(equal_height=True):
              with gr.Column():
-                 checkbox_1 = gr.Checkbox(label="qed")
-                 slider_1 = gr.Slider(2, 20, value=4, label="qed", info="Choose between 2 and 20")
              with gr.Column():
-                 checkbox_2 = gr.Checkbox(label="logp")
-                 slider_2 = gr.Slider(2, 20, value=4, label="logp", info="Choose between 2 and 20")
              with gr.Column():
-                 checkbox_3 = gr.Checkbox(label="sas")
-                 slider_3 = gr.Slider(2, 20, value=4, label="sas", info="Choose between 2 and 20")
              with gr.Column():
-                 checkbox_4 = gr.Checkbox(label="weight")
-                 slider_4 = gr.Slider(2, 20, value=4, label="weight", info="Choose between 2 and 20")

          predict_single_smiles_button = gr.Button("Generate", size='sm')
          #prediction = gr.Label("Prediction will appear here")
-         prediction = gr.Textbox(label="Predictions", type="text", placeholder=None, lines=10, interactive=False)

          running_terminal_label = gr.Textbox(label="Running status", type="text", placeholder=None, lines=10, interactive=False)

-         #input_file = gr.File(label="Molecule file",
-         #                     file_count='single',
-         #                     file_types=[".smi", ".csv"], height=300)
-         #predict_file_button = gr.Button("Predict", size='sm', visible=False)
-         #download_button = gr.DownloadButton("Download", size='sm', visible=False)
-         #stop_button = gr.Button("Stop", size='sm', visible=False)

          # dropdown change event
          # predict single button click event
          predict_single_smiles_button.click(lambda:(gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     ) , outputs=[slider_1, slider_2, slider_3, slider_4,
                                                                  predict_single_smiles_button, running_terminal_label])\
-         .then(predict_single_label, inputs=[slider_1, slider_2, slider_3, slider_4], outputs=[prediction, running_terminal_label])\
-         .then(lambda:(gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       ) , outputs=[slider_1, slider_2, slider_3, slider_4,
-                                    predict_single_smiles_button, running_terminal_label])
-         """
-         # input file upload event
-         file_status = gr.State()
-         input_file.upload(fn=validate_file, inputs=input_file, outputs=[file_status, input_file, predict_file_button, download_button]).success(raise_error, inputs=file_status, outputs=file_status)
-         # input file clear event
-         input_file.clear(fn=clear_file, inputs=[download_button], outputs=[predict_file_button, download_button, input_file])
-         # predict file button click event
-         predict_file_event = predict_file_button.click(lambda:(gr.update(interactive=False),
-                                                                gr.update(interactive=False),
-                                                                gr.update(interactive=False),
-                                                                gr.update(interactive=False, visible=True),
-                                                                gr.update(interactive=False),
-                                                                gr.update(interactive=True, visible=False),
-                                                                gr.update(interactive=False),
-                                                                gr.update(interactive=False),
-                                                                ) , outputs=[dropdown, textbox, predict_single_smiles_button, predict_file_button, download_button, stop_button, input_file, running_terminal_label])\
-         .then(predict_file, inputs=[input_file, dropdown], outputs=[predict_file_button, download_button, stop_button, input_file, running_terminal_label])\
-         .then(lambda:(gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       gr.update(interactive=True),
-                       ) , outputs=[dropdown, textbox, predict_single_smiles_button, predict_file_button, download_button, stop_button, input_file, running_terminal_label])

-         # stop button click event
-         #stop_button.click(fn=None, inputs=None, outputs=None, cancels=[predict_file_event])
-         """

      return demo
 
 
  import gradio as gr
  from huggingface_hub import HfApi, get_collection, list_collections, list_models
  #from utils import MolecularPropertyPredictionModel, dataset_task_types, dataset_descriptions, dataset_property_names, dataset_property_names_to_dataset
+ from utils import MolecularGenerationModel
  import pandas as pd
  import os
  import spaces

  #candidate_models = get_models()
  #task_names = {
  #    'mit_synthesis': 'Reaction Synthesis',

  #}

  #property_names = list(candidate_models.keys())
+ model = MolecularGenerationModel()
+
+ def predict_single_label(logp, tpsa, sas, qed, logp_choose, tpsa_choose, sas_choose, qed_choose):
+     input_dict = dict()
+     if logp_choose:
+         input_dict['logP'] = logp
+     if tpsa_choose:
+         input_dict['TPSA'] = tpsa
+     if sas_choose:
+         input_dict['SAS'] = sas
+     if qed_choose:
+         input_dict['QED'] = qed
+
+     if len(input_dict) == 0:
+         return "NA", "No input is selected"

+     print(input_dict)

      try:
          running_status = None
          prediction = None
+
+         prediction = model.predict_single_smiles(input_dict)

          #prediction = model.predict(smiles, property_name, adapter_id)
          #prediction = model.predict_single_smiles(smiles, task)

      except Exception as e:
          # no matter what the error is, we should return
          print(e)
+         return "NA", "Generation failed"

+     #prediction = "\n".join([f"{idx+1}. {item}" for idx, item in enumerate(prediction)])
+     return prediction, "Generation is done"

  """
  def get_description(task_name):
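The condition dictionary above is built from the checkbox flags, so only enabled properties reach the model. A hypothetical direct call, bypassing the Gradio UI (it assumes the module has been imported and `model` was constructed successfully):

```python
# Hypothetical direct invocation of predict_single_label (not part of the commit).
# With only the logP and TPSA flags enabled, input_dict == {'logP': 3.5, 'TPSA': 80.0}.
df, status = predict_single_label(3.5, 80.0, 3.0, 0.5,
                                  True, True, False, False)
print(status)  # "Generation is done" on success, "Generation failed" otherwise
```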
 
      return gr.update(visible=False), gr.update(visible=False), None
  """

+ def toggle_slider(checked):
+     return gr.update(interactive=checked)
+
+ def toggle_sliders_based_on_checkboxes(checked_values):
+     """Enable or disable sliders based on the corresponding checkbox values."""
+     return [gr.update(interactive=checked_values[i]) for i in range(4)]
+
  def build_inference():

      with gr.Blocks() as demo:
          #with gr.Row():
              #gr.Markdown(f"<span style='color: red;'>If you run out of your GPU quota, you can use the </span> <a href='https://huggingface.co/spaces/ChemFM/molecular_property_prediction'>CPU-powered space</a> but with much lower performance.")
          #dropdown = gr.Dropdown([task_names[key] for key in tasks], label="Task", value=task_names[tasks[0]])
+         description = f"This space allows you to generate ten possible molecules based on given conditions. \n" \
+                       f"1. You can enable or disable specific properties using checkboxes and adjust their values with sliders. \n" \
+                       f"2. The generated SMILES strings and their corresponding predicted properties will be displayed in the generations section. \n" \
+                       f"3. The properties include logP, TPSA, SAS, and QED. \n" \
+                       f"4. The model was trained on the GuacaMol dataset for molecular design."

          description_box = gr.Textbox(label="Task description", lines=5,
                                       interactive=False,

          # third row - Textbox input and prediction label
          with gr.Row(equal_height=True):
              with gr.Column():
+                 checkbox_1 = gr.Checkbox(label="logP", value=True)
+                 slider_1 = gr.Slider(1, 7, value=4, label="logP", info="Choose between 1 and 7")
+                 checkbox_1.change(toggle_slider, checkbox_1, slider_1)
              with gr.Column():
+                 checkbox_2 = gr.Checkbox(label="TPSA", value=True)
+                 slider_2 = gr.Slider(20, 140, value=80, label="TPSA", info="Choose between 20 and 140")
+                 checkbox_2.change(toggle_slider, checkbox_2, slider_2)
              with gr.Column():
+                 checkbox_3 = gr.Checkbox(label="SAS", value=True)
+                 slider_3 = gr.Slider(1, 5, value=3, label="SAS", info="Choose between 1 and 5")
+                 checkbox_3.change(toggle_slider, checkbox_3, slider_3)
              with gr.Column():
+                 checkbox_4 = gr.Checkbox(label="QED", value=True)
+                 slider_4 = gr.Slider(0.1, 0.9, value=0.5, label="QED", info="Choose between 0.1 and 0.9")
+                 checkbox_4.change(toggle_slider, checkbox_4, slider_4)

          predict_single_smiles_button = gr.Button("Generate", size='sm')
          #prediction = gr.Label("Prediction will appear here")
+         #prediction = gr.Textbox(label="Predictions", type="text", placeholder=None, lines=10, interactive=False)
+         prediction = gr.Dataframe(label="Generations", type="pandas", interactive=False)

          running_terminal_label = gr.Textbox(label="Running status", type="text", placeholder=None, lines=10, interactive=False)

          # dropdown change event
          # predict single button click event
          predict_single_smiles_button.click(lambda:(gr.update(interactive=False),
+                                                    gr.update(interactive=False),
+                                                    gr.update(interactive=False),
+                                                    gr.update(interactive=False),
+                                                    gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     gr.update(interactive=False),
                                                     ) , outputs=[slider_1, slider_2, slider_3, slider_4,
+                                                                 checkbox_1, checkbox_2, checkbox_3, checkbox_4,
                                                                  predict_single_smiles_button, running_terminal_label])\
+         .then(predict_single_label, inputs=[slider_1, slider_2, slider_3, slider_4,
+                                             checkbox_1, checkbox_2, checkbox_3, checkbox_4
+                                             ], outputs=[prediction, running_terminal_label])\
+         .then(lambda a, b, c, d: toggle_sliders_based_on_checkboxes([a, b, c, d]) +
+                                  [gr.update(interactive=True)] * 6,
+               inputs=[checkbox_1, checkbox_2, checkbox_3, checkbox_4],
+               outputs=[slider_1, slider_2, slider_3, slider_4,
+                        checkbox_1, checkbox_2, checkbox_3, checkbox_4,
+                        predict_single_smiles_button, running_terminal_label])

      return demo
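The checkbox/slider pairing relies on `gr.update` flipping a component's `interactive` flag from an event handler. A minimal self-contained sketch of that pattern (independent of the ChemFM model, so it runs on its own):

```python
# Minimal sketch of the checkbox-gates-slider pattern used in build_inference().
import gradio as gr

def toggle_slider(checked):
    # gr.update patches a component's properties from inside an event handler.
    return gr.update(interactive=checked)

with gr.Blocks() as demo:
    use_logp = gr.Checkbox(label="logP", value=True)
    logp = gr.Slider(1, 7, value=4, label="logP")
    # Disable the slider whenever its checkbox is unticked.
    use_logp.change(toggle_slider, use_logp, logp)

if __name__ == "__main__":
    demo.launch()
```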
 
llama_customized_models.py ADDED
@@ -0,0 +1,154 @@
+ from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaModel, LlamaPreTrainedModel
+ from transformers.models.llama.configuration_llama import LlamaConfig
+ import torch.nn as nn
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+ from transformers.modeling_outputs import (
+     BaseModelOutputWithPast,
+     CausalLMOutputWithPast,
+     QuestionAnsweringModelOutput,
+     SequenceClassifierOutputWithPast,
+ )
+ from transformers.cache_utils import Cache
+
+ from transformers.modeling_outputs import (
+     CausalLMOutputWithPast,
+ )
+ from transformers.utils import (
+     add_start_docstrings_to_model_forward,
+     logging,
+     replace_return_docstrings,
+ )
+ from dataclasses import dataclass
+
+ from transformers.utils import ModelOutput
+
+ import torch
+ from typing import List, Optional, Tuple, Union
+
+ logger = logging.get_logger(__name__)
+
+ _CONFIG_FOR_DOC = "LlamaConfig"
+
+ LLAMA_INPUTS_DOCSTRING = r"""
+     Args:
+         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+             it.
+
+             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+             [`PreTrainedTokenizer.__call__`] for details.
+
+             [What are input IDs?](../glossary#input-ids)
+         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+
+             [What are attention masks?](../glossary#attention-mask)
+
+             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+             [`PreTrainedTokenizer.__call__`] for details.
+
+             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+             `past_key_values`).
+
+             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+             information on the default strategy.
+
+             - 1 indicates the head is **not masked**,
+             - 0 indicates the head is **masked**.
+         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+             config.n_positions - 1]`.
+
+             [What are position IDs?](../glossary#position-ids)
+         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+             Two formats are allowed:
+             - a [`~cache_utils.Cache`] instance;
+             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+               cache format.
+
+             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+             legacy cache format will be returned.
+
+             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+             of shape `(batch_size, sequence_length)`.
+         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+             model's internal embedding lookup matrix.
+         use_cache (`bool`, *optional*):
+             If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+             `past_key_values`).
+         output_attentions (`bool`, *optional*):
+             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+             tensors for more detail.
+         output_hidden_states (`bool`, *optional*):
+             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+             more detail.
+         return_dict (`bool`, *optional*):
+             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+         cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+             Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+             this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+             the complete sequence length.
+ """
+
+ class LlamaForCausalLMWithNumericalEmbedding(LlamaForCausalLM):
+
+     def __init__(self, config: LlamaConfig):
+         super().__init__(config)
+         self.numerical_embedding = torch.nn.Linear(1, config.hidden_size, bias=True)
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         properties: List = None,
+         properties_index: List = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         cache_position=None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+         b, l = input_ids.size()
+         assert len(properties) == b, "The number of properties should be equal to the batch size."
+         assert len(properties_index) == b, "The number of properties_index should be equal to the batch size."
+
+         embeddings = self.model.embed_tokens(input_ids)
+
+         for i, (props, props_index, embeds) in enumerate(zip(properties, properties_index, embeddings)):
+             assert len(props) == len(props_index), "The number of properties should be equal to the number of properties_index."
+             props = torch.tensor(props, device=embeds.device, dtype=torch.float32).unsqueeze(1)
+             num_embeds = self.numerical_embedding(props)
+             if len(props_index) > 0:
+                 assert embeddings[i, props_index, :].shape == num_embeds.shape, "The shape of the embeddings and the numerical embeddings should be the same."
+                 embeddings[i, props_index, :] = num_embeds
+
+         return super().forward(
+             input_ids=None,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=embeddings,
+             labels=labels,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
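The subclass's only change to the base model happens at the embedding layer: scalar property values are projected to `hidden_size` and written over the placeholder-token embeddings before the usual forward pass. A toy sketch of that core operation, isolated from `transformers`:

```python
# Core idea of LlamaForCausalLMWithNumericalEmbedding, in isolation (toy sizes).
import torch

hidden_size = 8                                # the real model uses config.hidden_size
numerical_embedding = torch.nn.Linear(1, hidden_size, bias=True)

embeddings = torch.zeros(1, 10, hidden_size)   # (batch, seq_len, hidden) token embeddings
props = torch.tensor([[0.3], [-1.2]])          # two normalized property values, shape (2, 1)
props_index = [3, 7]                           # placeholder positions in the sequence

# Overwrite the placeholder-token embeddings with the numerical projections.
embeddings[0, props_index, :] = numerical_embedding(props)
print(embeddings[0, 3, :4])                    # no longer zeros
```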
metric_calculator.py ADDED
@@ -0,0 +1,213 @@
+ from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
+ from rdkit.Chem import QED, Crippen, MolFromSmiles, rdmolops, rdMolDescriptors, AllChem
+ from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
+ import networkx as nx
+ import os.path as op
+ import math
+ #from rdkit.six.moves import cPickle
+ import _pickle as cPickle
+ #from rdkit.six import iteritems
+ from rdkit import Chem
+ import pickle
+ import numpy as np
+
+ import sys
+ import os
+ from rdkit.Chem import RDConfig
+ sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
+ import sascorer
+
+ from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
+ from rdkit.Chem.Fingerprints import FingerprintMols
+
+ def compute_rmse(gt, pred):
+     return mean_squared_error(gt, pred, squared=False)
+
+ def compute_r2score(gt, pred):
+     return r2_score(gt, pred)
+
+ def compute_roc_auc(gt, pred):
+     return roc_auc_score(gt, pred)
+
+ def check_valid(smiles_list):
+     total_num = len(smiles_list)
+     empty_num = smiles_list.count("")
+     return 1 - empty_num / float(total_num)
+
+ def check_unique(smiles_list):
+     total_num = len(smiles_list)
+     smiles_set = set(smiles_list)
+     if "" in smiles_set:
+         smiles_set.remove("")
+     return len(smiles_set) / float(total_num)
+
+ def check_novelty(gen_smiles, train_smiles):
+     if len(gen_smiles) == 0:
+         novel_ratio = 0.
+     else:
+         duplicates = [1 for mol in gen_smiles if mol in train_smiles]
+         novel = len(gen_smiles) - sum(duplicates)
+         novel_ratio = novel * 100. / len(gen_smiles)
+     return novel_ratio
+
+ _fscores = None
+ def readFragmentScores(name='fpscores'):
+     import gzip
+     global _fscores
+     # generate the full path filename:
+     if name == "fpscores":
+         name = op.join(op.dirname(__file__), name)
+     _fscores = cPickle.load(gzip.open('%s.pkl.gz' % name))
+     outDict = {}
+     for i in _fscores:
+         for j in range(1, len(i)):
+             outDict[i[j]] = float(i[0])
+     _fscores = outDict
+
+ def numBridgeheadsAndSpiro(mol, ri=None):
+     nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
+     nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
+     return nBridgehead, nSpiro
+
+ def calculateScore(m):
+     if _fscores is None:
+         readFragmentScores()
+
+     # fragment score
+     fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # <- 2 is the *radius* of the circular fingerprint
+     fps = fp.GetNonzeroElements()
+     score1 = 0.
+     nf = 0
+     for bitId, v in fps.items():  # dict.items(); the rdkit.six iteritems import is commented out above
+         nf += v
+         sfp = bitId
+         score1 += _fscores.get(sfp, -4) * v
+     score1 /= nf
+
+     # features score
+     nAtoms = m.GetNumAtoms()
+     nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
+     ri = m.GetRingInfo()
+     nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
+     nMacrocycles = 0
+     for x in ri.AtomRings():
+         if len(x) > 8:
+             nMacrocycles += 1
+
+     sizePenalty = nAtoms**1.005 - nAtoms
+     stereoPenalty = math.log10(nChiralCenters + 1)
+     spiroPenalty = math.log10(nSpiro + 1)
+     bridgePenalty = math.log10(nBridgeheads + 1)
+     macrocyclePenalty = 0.
+     # ---------------------------------------
+     # This differs from the paper, which defines:
+     # macrocyclePenalty = math.log10(nMacrocycles+1)
+     # This form generates better results when 2 or more macrocycles are present
+     if nMacrocycles > 0:
+         macrocyclePenalty = math.log10(2)
+
+     score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
+
+     # correction for the fingerprint density
+     # not in the original publication, added in version 1.1
+     # to make highly symmetrical molecules easier to synthetise
+     score3 = 0.
+     if nAtoms > len(fps):
+         score3 = math.log(float(nAtoms) / len(fps)) * .5
+
+     sascore = score1 + score2 + score3
+
+     # need to transform "raw" value into scale between 1 and 10
+     min = -4.0
+     max = 2.5
+     sascore = 11. - (sascore - min + 1) / (max - min) * 9.
+     # smooth the 10-end
+     if sascore > 8.:
+         sascore = 8. + math.log(sascore + 1. - 9.)
+     if sascore > 10.:
+         sascore = 10.0
+     elif sascore < 1.:
+         sascore = 1.0
+
+     return sascore
+
+ def compute_plogp(mol):
+     #mol = MolFromSmiles(smiles_string)
+     #logp = (Crippen.MolLogP(mol) - np.mean(logP_values)) / np.std(logP_values)
+     logp = Crippen.MolLogP(mol)
+     #SA_score = (-sascorer.calculateScore(mol) - np.mean(SA_scores)) / np.std(SA_scores)
+     SA_score = -calculateScore(mol)
+     cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
+     if len(cycle_list) == 0:
+         cycle_length = 0
+     else:
+         cycle_length = max([len(j) for j in cycle_list])
+     if cycle_length <= 6:
+         cycle_length = 0
+     else:
+         cycle_length = cycle_length - 6
+
+     #cycle_score = (-cycle_length - np.mean(cycle_scores)) / np.std(cycle_scores)
+     cycle_score = -cycle_length
+     #plogp = -(logp + SA_score + cycle_score)
+     plogp = (logp + SA_score + cycle_score)
+     return plogp
+
+ clf_model = None
+ def load_model():
+     global clf_model
+     #name = op.join(op.dirname(__file__), 'clf_py36.pkl')
+     name = op.join(op.dirname(__file__), 'drd2_current.pkl')
+     with open(name, "rb") as f:
+         clf_model = pickle.load(f)
+
+ def fingerprints_from_mol(mol):
+     fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
+     size = 2048
+     nfp = np.zeros((1, size), np.int32)
+     for idx, v in fp.GetNonzeroElements().items():
+         nidx = idx % size
+         nfp[0, nidx] += int(v)
+     return nfp
+
+ def compute_drd2(mol):
+     if clf_model is None:
+         load_model()
+
+     #print(smile)
+     #mol = Chem.MolFromSmiles(smile)
+     if mol:
+         fp = fingerprints_from_mol(mol)
+         score = clf_model.predict_proba(fp)[:, 1]
+         return float(score)
+     return 0.0
+
+ def compute_qed(mol):
+     return QED.qed(mol)
+
+ def compute_logp(mol):
+     return Crippen.MolLogP(mol)
+
+ def compute_tpsa(mol):
+     return rdMolDescriptors.CalcTPSA(mol)
+
+ def compute_sas(mol):
+     return sascorer.calculateScore(mol)
+
+ def check_valid_unique(smiles_list):
+     total_num = len(smiles_list)
+     empty_num = smiles_list.count("")
+
+     smiles_set = set(smiles_list)
+     if "" in smiles_set:
+         smiles_set.remove("")
+     return 1 - empty_num / float(total_num), \
+            len(smiles_set) / float(total_num - empty_num)
+
+ def get_similarity(smiles1, smiles2):
+     if smiles1 == "" or smiles2 == "":
+         return np.nan
+     sim = TanimotoSimilarity(FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles1)),
+                              FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles2)))
+     return sim
+
+ def get_scaffold(smiles):
+     scaffold = MurckoScaffoldSmiles(smiles)
+     return scaffold
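The four `compute_*` helpers the app conditions on can be exercised directly; for example (aspirin as an arbitrary test molecule):

```python
# Example: scoring one molecule with the helpers above (aspirin SMILES).
from rdkit import Chem
import metric_calculator as mc

mol = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")
print("QED :", mc.compute_qed(mol))   # drug-likeness, 0..1
print("logP:", mc.compute_logp(mol))  # Crippen octanol-water partition coefficient
print("TPSA:", mc.compute_tpsa(mol))  # topological polar surface area, in A^2
print("SAS :", mc.compute_sas(mol))   # synthetic accessibility, 1 (easy) to 10 (hard)
```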
utils.py CHANGED
@@ -12,37 +12,62 @@ from datasets import Dataset
  from tqdm import tqdm
  import spaces

  from rdkit import RDLogger, Chem
  # Suppress RDKit INFO messages
  RDLogger.DisableLog('rdApp.*')

  DEFAULT_PAD_TOKEN = "[PAD]"
- device_map = "cpu"
-
- def compute_rank(prediction,raw=False,alpha=1.0):
-     valid_score = [[k for k in range(len(prediction[j]))] for j in range(len(prediction))]
-     invalid_rates = [0 for k in range(len(prediction[0]))]
-     rank = {}
-     highest = {}
-
-     for j in range(len(prediction)):
-         for k in range(len(prediction[j])):
-             if prediction[j][k] == "":
-                 valid_score[j][k] = 10 + 1
-                 invalid_rates[k] += 1
-         de_error = [i[0] for i in sorted(list(zip(prediction[j], valid_score[j])), key=lambda x: x[1]) if i[0] != ""]
-         prediction[j] = list(set(de_error))
-         prediction[j].sort(key=de_error.index)
-         for k, data in enumerate(prediction[j]):
-             if data in rank:
-                 rank[data] += 1 / (alpha * k + 1)
-             else:
-                 rank[data] = 1 / (alpha * k + 1)
-             if data in highest:
-                 highest[data] = min(k,highest[data])
              else:
-                 highest[data] = k
-     return rank,invalid_rates

  @dataclass
@@ -50,36 +75,98 @@ class DataCollatorForCausalLMEval(object):
      tokenizer: transformers.PreTrainedTokenizer
      source_max_len: int
      target_max_len: int
-     reactant_start_str: str
-     product_start_str: str
      end_str: str
-
-     def augment_molecule(self, molecule: str) -> str:
-         return self.sme.augment([molecule])[0]

      def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-
-         print(instances)
-         srcs = instances[0]['src']
-         task_type = instances[0]['task_type'][0]

-         if task_type == 'retrosynthesis':
-             src_start_str = self.product_start_str
-             tgt_start_str = self.reactant_start_str
-         else:
-             src_start_str = self.reactant_start_str
-             tgt_start_str = self.product_start_str

-         generation_prompts = []
-         generation_prompt = f"{src_start_str}{srcs}{self.end_str}{tgt_start_str}"
-         generation_prompts.append(generation_prompt)

          data_dict = {
-             'generation_prompts': generation_prompts
          }

          return data_dict

  def smart_tokenizer_and_embedding_resize(
      special_tokens_dict: Dict,
      tokenizer: transformers.PreTrainedTokenizer,
@@ -106,176 +193,168 @@ def smart_tokenizer_and_embedding_resize(
      input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
      print(f"Resized tokenizer and embedding from {num_old_tokens} to {len(tokenizer)} tokens.")

- class ReactionPredictionModel():
-     def __init__(self, candidate_models):
-
-         for model in candidate_models:
-             if "retro" in model:
-                 self.tokenizer = AutoTokenizer.from_pretrained(
-                     candidate_models[list(candidate_models.keys())[0]],
-                     padding_side="right",
-                     use_fast=True,
-                     trust_remote_code=True,
-                     token = os.environ.get("TOKEN")
-                 )
-                 self.load_retro_model(candidate_models[model])
-             else:
-                 self.tokenizer = AutoTokenizer.from_pretrained(
-                     candidate_models[list(candidate_models.keys())[0]],
-                     padding_side="right",
-                     use_fast=True,
-                     trust_remote_code=True,
-                     token = os.environ.get("TOKEN")
-                 )
-                 self.load_forward_model(candidate_models[model])
-
-         string_template_path = hf_hub_download(candidate_models[list(candidate_models.keys())[0]], filename="string_template.json", token = os.environ.get("TOKEN"))
-         string_template = json.load(open(string_template_path, 'r'))
-         reactant_start_str = string_template['REACTANTS_START_STRING']
-         product_start_str = string_template['PRODUCTS_START_STRING']
-         end_str = string_template['END_STRING']
-         self.data_collator = DataCollatorForCausalLMEval(
-             tokenizer=self.tokenizer,
-             source_max_len=512,
-             target_max_len=512,
-             reactant_start_str=reactant_start_str,
-             product_start_str=product_start_str,
-             end_str=end_str,
          )

-     def load_retro_model(self, model_path):
-         # our retro model is lora model
          config = AutoConfig.from_pretrained(
-             "ChemFM/ChemFM-3B",
              trust_remote_code=True,
-             token=os.environ.get("TOKEN")
          )

-         base_model = AutoModelForCausalLM.from_pretrained(
-             "ChemFM/ChemFM-3B",
              config=config,
-             trust_remote_code=True,
              device_map=device_map,
              token = os.environ.get("TOKEN")
          )
-
-         # we should resize the embedding layer of the base model to match the adapter's tokenizer
          special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
          smart_tokenizer_and_embedding_resize(
              special_tokens_dict=special_tokens_dict,
              tokenizer=self.tokenizer,
-             model=base_model
-         )
-         base_model.config.pad_token_id = self.tokenizer.pad_token_id
-
-         # load the adapter model
-         self.retro_model = PeftModel.from_pretrained(
-             base_model,
-             model_path,
-             token = os.environ.get("TOKEN")
          )

-         #self.retro_model.to("cuda")

-     def load_forward_model(self, model_path):
-         config = AutoConfig.from_pretrained(
-             model_path,
-             device_map=device_map,
-             trust_remote_code=True,
-             token = os.environ.get("TOKEN")
          )

-         self.forward_model = AutoModelForCausalLM.from_pretrained(
-             model_path,
-             config=config,
-             device_map=device_map,
-             trust_remote_code=True,
-             token = os.environ.get("TOKEN")
-         )

-         # the finetune tokenizer could be in different size with pretrain tokenizer, and also, we need to add PAD_TOKEN
-         special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
-         smart_tokenizer_and_embedding_resize(
-             special_tokens_dict=special_tokens_dict,
-             tokenizer=self.tokenizer,
-             model=self.forward_model
-         )
-         self.forward_model.config.pad_token_id = self.tokenizer.pad_token_id
-         #self.forward_model.to("cuda")

-     @spaces.GPU(duration=20)
-     def predict_single_smiles(self, smiles, task_type):
-         if task_type == "full_retro":
-             if "." in smiles:
-                 return None

-         task_type = "retrosynthesis" if task_type == "full_retro" else "synthesis"
-         # canonicalize the smiles
-         mol = Chem.MolFromSmiles(smiles)
-         if mol is None:
-             return None
-         smiles = Chem.MolToSmiles(mol)
-
-         smiles_list = [smiles]
-         task_type_list = [task_type]
-
-         df = pd.DataFrame({"src": smiles_list, "task_type": task_type_list})
-         test_dataset = Dataset.from_pandas(df)
-         # construct the dataloader
-         test_loader = torch.utils.data.DataLoader(
-             test_dataset,
-             batch_size=1,
-             collate_fn=self.data_collator,
-         )

-         predictions = []
-         for i, batch in tqdm(enumerate(test_loader), total=len(test_loader), desc="Evaluating"):
-             with torch.no_grad():
-                 generation_prompts = batch['generation_prompts'][0]
-                 inputs = self.tokenizer(generation_prompts, return_tensors="pt", padding=True, truncation=True).to(self.retro_model.device)
-                 print(inputs)
-                 del inputs['token_type_ids']
-                 """
-                 if task_type == "retrosynthesis":
-                     outputs = self.retro_model.generate(**inputs, max_length=512, num_return_sequences=10,
-                                                         do_sample=False, num_beams=10,
-                                                         eos_token_id=self.tokenizer.eos_token_id,
-                                                         early_stopping='never',
-                                                         pad_token_id=self.tokenizer.pad_token_id,
-                                                         length_penalty=0.0,
-                                                         )
-                 else:
-                     outputs = self.forward_model.generate(**inputs, max_length=512, num_return_sequences=10,
-                                                           do_sample=False, num_beams=10,
-                                                           eos_token_id=self.tokenizer.eos_token_id,
-                                                           early_stopping='never',
-                                                           pad_token_id=self.tokenizer.pad_token_id,
-                                                           length_penalty=0.0,
-                                                           )
-
-                 original_smiles_list = self.tokenizer.batch_decode(outputs[:, len(inputs['input_ids'][0]):],
-                                                                    skip_special_tokens=True)
-                 original_smiles_list = map(lambda x: x.replace(" ", ""), original_smiles_list)
-                 # canonize the SMILES
-                 canonized_smiles_list = []
-                 temp = []
-                 for original_smiles in original_smiles_list:
-                     temp.append(original_smiles)
-                     try:
-                         canonized_smiles_list.append(Chem.MolToSmiles(Chem.MolFromSmiles(original_smiles)))
-                     except:
-                         canonized_smiles_list.append("")
-                 """
-                 canonized_smiles_list = \
-                     ['N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]', 'N#Cc1ccsc1Nc1cc(F)c([N+](=O)[O-])cc1F', 'N#Cc1ccsc1Nc1cc(Cl)c(F)cc1[N+](=O)[O-]', 'N#Cc1cnsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]', 'N#Cc1cc(F)c(F)cc1Nc1sccc1C#N', 'N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=N)[O-]', 'N#Cc1cc(C#N)c(Nc2cc(F)c(F)cc2[N+](=O)[O-])s1', 'N#Cc1ccsc1Nc1c(F)c(F)cc(F)c1[N+](=O)[O-]', 'Nc1sccc1CNc1cc(F)c(F)cc1[N+](=O)[O-]', 'N#Cc1ccsc1Nc1ccc(F)cc1[N+](=O)[O-]']
-                 predictions.append(canonized_smiles_list)

-         rank, invalid_rate = compute_rank(predictions)
-         return rank
  from tqdm import tqdm
  import spaces

+ from llama_customized_models import LlamaForCausalLMWithNumericalEmbedding
+ from torch.nn.utils.rnn import pad_sequence
+ import numpy as np
+ from torch.utils.data.dataloader import DataLoader
+ from torch.nn import functional as F
+ import importlib
+
  from rdkit import RDLogger, Chem
  # Suppress RDKit INFO messages
  RDLogger.DisableLog('rdApp.*')

  DEFAULT_PAD_TOKEN = "[PAD]"
+ device_map = "cuda"
+
+ means = {"qed": 0.5559003125710424, "logp": 3.497542110420217, "sas": 2.889429694406497, "tpsa": 80.19717097706841}
+ stds = {"qed": 0.21339854620824716, "logp": 1.7923582437824368, "sas": 0.8081188219568571, "tpsa": 38.212259443049554}
+
+ def phrase_df(df):
+     metric_calculator = importlib.import_module("metric_calculator")
+
+     new_df = []
+     # iterate over the dataframe
+     for i in range(len(df)):
+         sub_df = dict()
+
+         # get the SMILES
+         smiles = df.iloc[i]['SMILES']
+         # get the property names
+         property_names = df.iloc[i]['property_names']
+         # get the non normalized properties
+         non_normalized_properties = df.iloc[i]['non_normalized_properties']
+
+         sub_df['SMILES'] = smiles
+
+         # compute the similarity between the scaffold and the SMILES
+         for j in range(len(property_names)):
+             # get the property name
+             property_name = property_names[j]
+             # get the non normalized property
+             non_normalized_property = non_normalized_properties[j]
+
+             sub_df[f'{property_name}_condition'] = non_normalized_property
+
+             if smiles == "":
+                 sub_df[f'{property_name}_measured'] = np.nan
              else:
+                 property_eval_func_name = f"compute_{property_name}"
+                 property_eval_func = getattr(metric_calculator, property_eval_func_name)
+                 sub_df[f'{property_name}_measured'] = property_eval_func(Chem.MolFromSmiles(smiles))
+
+         new_df.append(sub_df)
+
+     new_df = pd.DataFrame(new_df)
+     return new_df

  @dataclass

      tokenizer: transformers.PreTrainedTokenizer
      source_max_len: int
      target_max_len: int
+     molecule_target_aug_prob: float
+     molecule_start_str: str
+     scaffold_aug_prob: float
+     scaffold_start_str: str
+     property_start_str: str
+     property_inner_sep: str
+     property_inter_sep: str
      end_str: str
+     ignore_index: int
+     has_scaffold: bool
      def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+         # Extract elements
+         prop_token_map = {
+             'qed': '<qed>',
+             'logp': '<logp>',
+             'sas': '<SAS>',
+             'tpsa': '<TPSA>'
+         }

+         sources = []
+         props_list = []
+         non_normalized_props_list = []
+         prop_names_list = []
+         props_index_list = []
+         temperature_list = []
+         scaffold_list = []
+         for example in instances:
+             prop_names = example['property_name']
+             prop_values = example['property_value']
+             non_normalized_prop_values = example['non_normalized_property_value']
+             temperature = example['temperature']
+             # we need to convert the string to a list
+
+             # randomly choose the property and the scaffold combinations:
+             props_str = ""
+             scaffold_str = ""
+             props = []
+             non_normalized_props = []
+             props_index = []
+
+             if self.has_scaffold:
+                 scaffold = example['scaffold_smiles'].strip()
+                 scaffold_str = f"{self.scaffold_start_str}{scaffold}{self.end_str}"
+
+             props_str = f"{self.property_start_str}"
+             for i, prop in enumerate(prop_names):
+                 prop = prop.lower()
+                 props_str += f"{prop_token_map[prop]}{self.property_inner_sep}{self.molecule_start_str}{self.property_inter_sep}"
+                 props.append(prop_values[i])
+                 non_normalized_props.append(non_normalized_prop_values[i])
+                 props_index.append(3 + 4 * i)  # this is hard coded for the current template
+             props_str += f"{self.end_str}"
+
+             source = props_str + scaffold_str + "<->>" + self.molecule_start_str
+
+             sources.append(source)
+             props_list.append(props)
+             non_normalized_props_list.append(non_normalized_props)
+             props_index_list.append(props_index)
+             prop_names_list.append(prop_names)
+             temperature_list.append(temperature)
+
+         # Tokenize
+         tokenized_sources_with_prompt = self.tokenizer(
+             sources,
+             max_length=self.source_max_len,
+             truncation=True,
+             add_special_tokens=False,
+         )

+         # Build the input and labels for causal LM
+         input_ids = []
+         for tokenized_source in tokenized_sources_with_prompt['input_ids']:
+             input_ids.append(torch.tensor(tokenized_source))
+         # Apply padding
+         input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)

          data_dict = {
+             'input_ids': input_ids,
+             'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
+             'properties': props_list,
+             'non_normalized_properties': non_normalized_props_list,
+             'property_names': prop_names_list,
+             'properties_index': props_index_list,
+             'temperature': temperature_list,
          }

          return data_dict

+
  def smart_tokenizer_and_embedding_resize(
      special_tokens_dict: Dict,
      tokenizer: transformers.PreTrainedTokenizer,
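The collator's prompt layout, and the hard-coded `3 + 4 * i` placeholder arithmetic, are easier to see with concrete separator strings. A sketch (the real separators come from `string_template.json`; the tokens below are illustrative stand-ins, assuming each template string maps to a single token):

```python
# Illustrative prompt assembly; separator strings are stand-ins, not the real template.
prop_token_map = {'qed': '<qed>', 'logp': '<logp>', 'sas': '<SAS>', 'tpsa': '<TPSA>'}
property_start, inner_sep, mol_start, inter_sep, end = '<prop>', '<is>', '<mol>', '<its>', '<end>'

props_str = property_start
props_index = []
for i, prop in enumerate(['logp', 'tpsa']):
    # Each property contributes 4 tokens; the value placeholder sits at offset 3 + 4*i,
    # which is where the numerical embedding is swapped in at forward time.
    props_str += f"{prop_token_map[prop]}{inner_sep}{mol_start}{inter_sep}"
    props_index.append(3 + 4 * i)
props_str += end

print(props_str)    # <prop><logp><is><mol><its><TPSA><is><mol><its><end>
print(props_index)  # [3, 7]
```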
 
      input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
      print(f"Resized tokenizer and embedding from {num_old_tokens} to {len(tokenizer)} tokens.")

+ class MolecularGenerationModel():
+     def __init__(self):
+         model_id = "ChemFM/molecular_cond_generation_guacamol"
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_id,
+             padding_side="right",
+             use_fast=True,
+             trust_remote_code=True,
+             token = os.environ.get("TOKEN")
          )

+         # load model
          config = AutoConfig.from_pretrained(
+             model_id,
+             device_map=device_map,
              trust_remote_code=True,
+             token = os.environ.get("TOKEN")
          )

+         self.model = LlamaForCausalLMWithNumericalEmbedding.from_pretrained(
+             model_id,
              config=config,
              device_map=device_map,
+             trust_remote_code=True,
              token = os.environ.get("TOKEN")
          )
+
+         # the finetune tokenizer could be a different size from the pretrain tokenizer, and we also need to add PAD_TOKEN
          special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
          smart_tokenizer_and_embedding_resize(
              special_tokens_dict=special_tokens_dict,
              tokenizer=self.tokenizer,
+             model=self.model
          )
+         self.model.config.pad_token_id = self.tokenizer.pad_token_id

+         self.model.eval()
+
+         string_template_path = hf_hub_download(model_id, filename="string_template.json", token = os.environ.get("TOKEN"))
+         string_template = json.load(open(string_template_path, 'r'))
+         molecule_start_str = string_template['MOLECULE_START_STRING']
+         scaffold_start_str = string_template['SCAFFOLD_MOLECULE_START_STRING']
+         property_start_str = string_template['PROPERTY_START_STRING']
+         property_inner_sep = string_template['PROPERTY_INNER_SEP']
+         property_inter_sep = string_template['PROPERTY_INTER_SEP']
+         end_str = string_template['END_STRING']

+         self.data_collator = DataCollatorForCausalLMEval(
+             tokenizer=self.tokenizer,
+             source_max_len=512,
+             target_max_len=512,
+             molecule_target_aug_prob=1.0,
+             scaffold_aug_prob=0.0,
+             molecule_start_str=molecule_start_str,
+             scaffold_start_str=scaffold_start_str,
+             property_start_str=property_start_str,
+             property_inner_sep=property_inner_sep,
+             property_inter_sep=property_inter_sep,
+             end_str=end_str,
+             ignore_index=-100,
+             has_scaffold=False
          )

+     @spaces.GPU(duration=60)
+     def generate(self, loader):
+
+         df = []
+         pbar = tqdm(loader, desc=f"Evaluating...", leave=False)
+         for it, batch in enumerate(pbar):
+             sub_df = dict()
+
+             batch_size = batch['input_ids'].shape[0]
+             assert batch_size == 1, "The batch size should be 1"
+
+             temperature = batch['temperature'][0]
+             property_names = batch['property_names'][0]
+             non_normalized_properties = batch['non_normalized_properties'][0]
+
+             num_generations = 1
+             del batch['temperature']
+             del batch['property_names']
+             del batch['non_normalized_properties']
+
+             input_length = batch['input_ids'].shape[1]
+             steps = 1024 - input_length
+
+             with torch.set_grad_enabled(False):
+                 early_stop_flags = torch.zeros(num_generations, dtype=torch.bool).to(self.model.device)
+                 for k in range(steps):
+                     logits = self.model(**batch)['logits']
+                     logits = logits[:, -1, :] / temperature
+                     probs = F.softmax(logits, dim=-1)
+                     ix = torch.multinomial(probs, num_samples=num_generations)
+
+                     ix[early_stop_flags] = self.tokenizer.eos_token_id
+
+                     batch['input_ids'] = torch.cat([batch['input_ids'], ix], dim=-1)
+                     early_stop_flags |= (ix.squeeze() == self.tokenizer.eos_token_id)
+
+                     if torch.all(early_stop_flags):
+                         break
+
+             generations = self.tokenizer.batch_decode(batch['input_ids'][:, input_length:], skip_special_tokens=True)
+             generations = map(lambda x: x.replace(" ", ""), generations)
+
+             predictions = []
+             for generation in generations:
+                 try:
+                     predictions.append(Chem.MolToSmiles(Chem.MolFromSmiles(generation)))
+                 except:
+                     predictions.append("")
+
+             sub_df['SMILES'] = predictions[0]
+             sub_df['property_names'] = property_names
+             sub_df['property'] = batch['properties'][0]
+             sub_df['non_normalized_properties'] = non_normalized_properties
+
+             df.append(sub_df)
+
+         df = pd.DataFrame(df)
+         return df

+     def predict_single_smiles(self, input_dict: Dict):
+         # convert the keys to lower case
+         input_dict = {key.lower(): value for key, value in input_dict.items()}

+         properties = [key.lower() for key in input_dict.keys()]
+         property_means = [means[prop] for prop in properties]
+         property_stds = [stds[prop] for prop in properties]
+
+         sample_point = [input_dict[prop] for prop in properties]
+         non_normalized_sample_point = np.array(sample_point).reshape(-1)
+         sample_point = (np.array(sample_point) - np.array(property_means)) / np.array(property_stds)
+         sub_df = {
+             "property_name": properties,
+             "property_value": sample_point.tolist(),
+             "temperature": 1.0,
+             "non_normalized_property_value": non_normalized_sample_point.tolist()
+         }
+
+         test_dataset = [sub_df] * 10
+         test_dataset = pd.DataFrame(test_dataset)
+         test_dataset = Dataset.from_pandas(test_dataset)
+
+         test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=self.data_collator)
+         df = self.generate(test_loader)
+         new_df = phrase_df(df)
+         # delete the condition columns
+         new_df = new_df.drop(columns=[col for col in new_df.columns if "condition" in col])
+
+         # drop the empty smiles rows
+         new_df = new_df.dropna(subset=['SMILES'])
+
+         # round the measured values to 2 decimal places
+         new_df = new_df.round(2)

+         return new_df
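Upstream of the collator, `predict_single_smiles` z-scores each condition with the training-set statistics defined at the top of the file. A standalone sketch of that step:

```python
# Standalone sketch of the condition normalization in predict_single_smiles,
# using the means/stds constants defined at the top of utils.py.
import numpy as np

means = {"qed": 0.5559003125710424, "logp": 3.497542110420217,
         "sas": 2.889429694406497, "tpsa": 80.19717097706841}
stds = {"qed": 0.21339854620824716, "logp": 1.7923582437824368,
        "sas": 0.8081188219568571, "tpsa": 38.212259443049554}

input_dict = {"logp": 3.5, "tpsa": 80.0}  # user-selected conditions, lower-cased keys
props = list(input_dict.keys())
raw = np.array([input_dict[p] for p in props])
normalized = (raw - np.array([means[p] for p in props])) / np.array([stds[p] for p in props])
print(dict(zip(props, normalized.round(3))))  # z-scored values fed to the collator
```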