Spaces:

hmacdope-omsf
/

openadmet-predictions-demo

Sleeping

App Files Files Community

hmacdope commited on Mar 17

Commit

b1ec78c

1 Parent(s): 80a757e

fix

Browse files

Files changed (4) hide show

.streamlit/config.toml +2 -0
app.py +328 -0
models/ecfp4_mordred_lgbm/cyp3a4_model.json +24 -0
models/ecfp4_mordred_lgbm/cyp3a4_model.pkl +3 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [theme]
2	+ base="light"

app.py ADDED Viewed

	@@ -0,0 +1,328 @@

+import streamlit as st
+import os
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from rdkit import Chem
+from streamlit_ketcher import st_ketcher
+from io import StringIO
+from openadmet_models.models.gradient_boosting.lgbm import LGBMRegressorModel
+from openadmet_models.features.combine import FeatureConcatenator
+from openadmet_models.features.molfeat_properties import DescriptorFeaturizer
+from openadmet_models.features.molfeat_fingerprint import FingerprintFeaturizer
+def _is_valid_smiles(smi):
+    if smi is None or smi == "":
+        return False
+    try:
+        m = Chem.MolFromSmiles(smi)
+        if m is None:
+            return False
+        else:
+            return True
+    except:
+        return False
+def sdf_str_to_rdkit_mol(sdf):
+    from io import BytesIO
+    bio = BytesIO(sdf.encode())
+    suppl = Chem.ForwardSDMolSupplier(bio, removeHs=False)
+    mols = [mol for mol in suppl if mol is not None]
+    return mols
+@st.cache_data
+def convert_df(df):
+    # IMPORTANT: Cache the conversion to prevent computation on every rerun
+    return df.to_csv().encode("utf-8")
+def get_model(path, target, model_type):
+    model_path = os.path.join(path, f"{model_type}/{target.lower()}_model.json")
+    model_file = os.path.join(path, f"{model_type}/{target.lower()}_model.pkl")
+    print(model_path, model_file)
+    if not os.path.exists(model_path) or not os.path.exists(model_file):
+        return None
+    model = LGBMRegressorModel.deserialize(model_path, model_file)
+    featurizer = FeatureConcatenator(featurizers=[FingerprintFeaturizer(fp_type="ecfp:4"), DescriptorFeaturizer(descr_type="mordred")])
+    return model, featurizer
+# Set the title of the Streamlit app
+st.title("OpenADMET Streamlit DEMO")
+# Set the title of the Streamlit app
+st.title("OpenADMET Streamlit DEMO")
+st.markdown("## Background")
+st.markdown(
+    "**The [OpenADMET](https://openadmet.org) initiative provides a suite of open-source machine learning models to predict ADMET (Absorption, Distribution, Metabolism, Excretion, and Toxicity) properties, facilitating drug discovery and development.**"
+)
+st.markdown(
+    "This web app enables researchers and scientists to leverage OpenADMET’s models without needing to write or run code, making predictive analytics more accessible."
+)
+st.markdown("---")
+st.markdown("## Input :clipboard:")
+input = st.selectbox(
+    "How would you like to enter your input?",
+    ["Upload a CSV file", "Draw a molecule", "Enter SMILES", "Upload an SDF file"],
+    key="input",
+)
+multismiles = False
+if input == "Draw a molecule":
+    smiles = st_ketcher(None)
+    if _is_valid_smiles(smiles):
+        st.success("Valid molecule", icon="✅")
+    else:
+        st.error("Invalid molecule", icon="🚨")
+        st.stop()
+    smiles = [smiles]
+    queried_df = pd.DataFrame(smiles, columns=["SMILES"])
+    smiles_column_name = "SMILES"
+    smiles_column = queried_df[smiles_column_name]
+elif input == "Enter SMILES":
+    smiles = st.text_input("Enter a SMILES string", key="smiles_user_input")
+    if _is_valid_smiles(smiles):
+        st.success("Valid SMILES string", icon="✅")
+    else:
+        st.error("Invalid SMILES string", icon="🚨")
+        st.stop()
+    smiles = [smiles]
+    queried_df = pd.DataFrame(smiles, columns=["SMILES"])
+    smiles_column_name = "SMILES"
+    smiles_column = queried_df[smiles_column_name]
+elif input == "Upload a CSV file":
+    # Create a file uploader for CSV files
+    uploaded_file = st.file_uploader(
+        "Choose a CSV file to upload your predictions to", type="csv", key="csv_file"
+    )
+    # If a file is uploaded, parse it into a DataFrame
+    if uploaded_file is not None:
+        queried_df = pd.read_csv(uploaded_file)
+    else:
+        st.stop()
+    # Select a column from the DataFrame
+    smiles_column_name = st.selectbox("Select a SMILES column", queried_df.columns, key="df_smiles_column")
+    multismiles = True
+    smiles_column = queried_df[smiles_column_name]
+    # check if the smiles are valid
+    valid_smiles = [_is_valid_smiles(smi) for smi in smiles_column]
+    if not all(valid_smiles):
+        st.error(
+            "Some of the SMILES strings are invalid, please check the input", icon="🚨"
+        )
+        st.stop()
+    st.success(
+        f"All SMILES strings are valid (n={len(valid_smiles)}), proceeding with prediction",
+        icon="✅",
+    )
+elif input == "Upload an SDF file":
+    # Create a file uploader for SDF files
+    uploaded_file = st.file_uploader(
+        "Choose a SDF file to upload your predictions to", type="sdf"
+    )
+    # read with rdkit
+    if uploaded_file is not None:
+        # To convert to a string based IO:
+        try:
+            stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
+            # To read file as string:
+            string_data = stringio.read()
+            mols = sdf_str_to_rdkit_mol(string_data)
+            smiles = [Chem.MolToSmiles(m) for m in mols]
+            queried_df = pd.DataFrame(smiles, columns=["SMILES"])
+        except:
+            st.error("Error reading the SDF file, please check the input", icon="🚨")
+            st.stop()
+    else:
+        st.error("No file uploaded", icon="🚨")
+        st.stop()
+    st.success(
+        f"All molecule entries are valid (n={len(queried_df)}), proceeding with prediction",
+        icon="✅",
+    )
+    smiles_column_name = "SMILES"
+    smiles_column = queried_df[smiles_column_name]
+    multismiles = True
+st.markdown("## Model parameters :nut_and_bolt:")
+targets = ['CYP3A4', 'CYP2D6', 'CYP2C9']
+models = {"ecfp:4 Mordred LGBM":"ecfp4_mordred_lgbm", "ChemProp":"chemprop"}
+models_inversed = {v: k for k, v in models.items()}
+model_names = list(models.keys())
+endpoints = ["pIC50"]
+# Select a target value from the preset list
+target_value = st.selectbox("Select a biological target ", targets, key="target")
+# endpoints
+# Select a target value from the preset list
+endpoint_value = st.selectbox("Select a property ", endpoints, key="endpoint")
+model_value = st.selectbox("Select a model type ", model_names, key="model")
+if target_value != "CYP3A4":
+    st.write("Only CYP3A4 is currently supported")
+    st.stop()
+if endpoint_value != "pIC50":
+    st.write("Only pIC50 is currently supported")
+    st.stop()
+if model_value != "ecfp:4 Mordred LGBM":
+    st.write("Only ecfp:4 Mordred LGBM is currently supported")
+    st.stop()
+model, featurizer = get_model("./models", target_value, models[model_value])
+if model is None:
+    st.write(f"No model found for {target_value} {endpoint_value}")
+    st.stop()
+    # retry with a different target or endpoint
+st.markdown("## Prediction 🚀")
+st.write(
+    f"Predicting **{target_value} {endpoint_value}** using model:\n\n `{model_value}`"
+)
+# featurize the smiles
+X, _ = featurizer.featurize(smiles_column)
+# predict the properties
+preds = model.predict(X)
+# not implemented yet
+err = None
+pred_column_name = f"{target_value}_computed-{endpoint_value}"
+unc_column_name = f"{target_value}_computed-{endpoint_value}_uncertainty"
+queried_df[pred_column_name] = preds
+queried_df[unc_column_name] = err
+st.markdown("---")
+if multismiles:
+    # plot the predictions and errors
+    # Histogram first
+    fig, ax = plt.subplots()
+    sorted_df = queried_df.sort_values(by=pred_column_name)
+    n_bins = int(len(sorted_df[pred_column_name]) / 10)
+    if n_bins < 5:  # makes the histogram slightly more interpretable with low data
+        n_bins = 5
+    ax.hist(sorted_df[pred_column_name], bins=n_bins)
+    ax.set_ylabel("Count")
+    ax.set_xlabel(f"Computed {endpoint_value}")
+    ax.set_title(f"Histogram of computed {endpoint_value} for target: {target_value}")
+    st.pyplot(fig)
+    # then a barplot
+    fig, ax = plt.subplots()
+    ax.bar(range(len(sorted_df)), sorted_df[pred_column_name])
+    ax.set_xticks([])
+    ax.set_xlabel(f"Query compounds")
+    ax.set_ylabel(f"Computed {endpoint_value}")
+    ax.set_title(f"Barplot of computed {endpoint_value} for target: {target_value}")
+    st.pyplot(fig)
+    # if endpoint_value == "pIC50":
+    #     from rdkit.Chem.Descriptors import MolWt
+    #     import seaborn as sns
+    #     # then a scatterplot of uncertainty vs MW
+    #     queried_df["MW"] = [
+    #         MolWt(Chem.MolFromSmiles(smi)) for smi in sorted_df[smiles_column_name]
+    #     ]
+    #     fig, ax = plt.subplots()
+    #     ax = sns.scatterplot(
+    #         x="MW",
+    #         y=pred_column_name,
+    #         hue=unc_column_name,
+    #         palette="coolwarm",
+    #         data=queried_df,
+    #     )
+    #     norm = plt.Normalize(
+    #         queried_df[unc_column_name].min(), queried_df[unc_column_name].max()
+    #     )
+    #     sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=norm)
+    #     sm.set_array([])
+    #     # Remove the legend and add a colorbar
+    #     cbar = ax.figure.colorbar(sm, ax=ax)
+    #     ax.annotate(
+    #         f"Computed {endpoint_value} uncertainty",
+    #         xy=(1.2, 0.3),
+    #         xycoords="axes fraction",
+    #         rotation=270,
+    #     )
+    #     ax.set_title(
+    #         f"Scatterplot of predicted {endpoint_value} versus MW\ntarget: {target_value}"
+    #     )
+    #     ax.set_xlabel(f"Molecular weight (Da)")
+    #     ax.set_ylabel(f"Computed {endpoint_value}")
+    #     st.pyplot(fig)
+else:
+    # just print the prediction
+    preds = queried_df[pred_column_name].values[0]
+    smiles = queried_df["SMILES"].values[0]
+    if err:
+        err = queried_df[unc_column_name].values[0]
+        errstr = f"± {err:.2f}"
+    else:
+        errstr = ""
+    st.markdown(
+        f"Predicted {target_value} {endpoint_value} for {smiles} is {preds:.2f} {errstr}."
+    )
+# allow the user to download the predictions
+csv = convert_df(queried_df)
+st.download_button(
+    label="Download data as CSV",
+    data=csv,
+    file_name=f"predictions_{model_value}.csv",
+    mime="text/csv",
+)

models/ecfp4_mordred_lgbm/cyp3a4_model.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "model_params": {
+    "boosting_type": "gbdt",
+    "class_weight": null,
+    "colsample_bytree": 1.0,
+    "importance_type": "split",
+    "learning_rate": 0.05,
+    "max_depth": -1,
+    "min_child_samples": 20,
+    "min_child_weight": 0.001,
+    "min_split_gain": 0.0,
+    "n_estimators": 500,
+    "n_jobs": null,
+    "num_leaves": 31,
+    "objective": null,
+    "random_state": null,
+    "reg_alpha": 0.0,
+    "reg_lambda": 0.0,
+    "subsample": 1.0,
+    "subsample_for_bin": 200000,
+    "subsample_freq": 0,
+    "alpha": 0.005
+  }
+}

models/ecfp4_mordred_lgbm/cyp3a4_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c4dd3ba5e2deb461c7471ad8c6e957044e81225fe1f55faebe3ee5abc2e3e0
+size 1605595