Improving HF space (#1)

- Update interface (b4b15c962bc3f447dcb968b7cee4872dea3c6f7c)
- HMO updates (3687c401b7514461058d15607c9783942e793912)

Co-authored-by: Hugo MacDermott-Opeskin <[email protected]>

Files changed:
- about.py          +21 -0
- app.py            +178 -27
- evaluate.py       +211 -0
- requirements.txt  +3 -1
about.py
ADDED
@@ -0,0 +1,21 @@
+import os
+from huggingface_hub import HfApi
+
+ENDPOINTS = ["LogD",
+             "KSol",
+             "MLM CLint",
+             "HLM CLint",
+             "Caco-2 Permeability Efflux",
+             "Caco-2 Permeability Papp A>B",
+             "MPPB",
+             "MBPB",
+             "RLM CLint",
+             "MGMB"]
+
+TOKEN = os.environ.get("HF_TOKEN")
+CACHE_PATH = os.getenv("HF_HOME", ".")
+API = HfApi(token=TOKEN)
+organization = "OpenADMET"
+submissions_repo = f'{organization}/openadmet-challenge-submissions'  # private
+results_repo = f'{organization}/openadmet-challenge-results'  # public
+test_repo = f'{organization}/openadmet-challenge-test-data'  # private
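As a quick illustration of how these constants are meant to be consumed, the sketch below (not part of the diff; it assumes the HF_TOKEN in the environment can read the private repos) checks that the three challenge datasets are reachable with the configured client:

# repo_check.py -- minimal sketch, assuming HF_TOKEN grants access to the private repos
from about import API, submissions_repo, results_repo, test_repo

for repo in (submissions_repo, results_repo, test_repo):
    # repo_info raises if the dataset does not exist or the token lacks access
    info = API.repo_info(repo_id=repo, repo_type="dataset")
    print(repo, "->", "private" if info.private else "public")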
app.py
CHANGED
@@ -2,27 +2,85 @@ import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter
import pandas as pd

+from about import submissions_repo, results_repo
+from evaluate import submit_data, evaluate_data

+from datasets import load_dataset
from datetime import datetime
+from about import ENDPOINTS
+

+def get_leaderboard(dset):
+    dset = load_dataset(results_repo, split='train', download_mode="force_redownload")
+    full_df = pd.DataFrame(dset)
+    to_show = full_df.copy(deep=True)
+    to_show = to_show[to_show['user'] != 'test']
+    # The columns to display publicly
+    to_show = to_show[["user", "Model", "MAE", "R2", "Spearman R", "Kendall's Tau"]]

+    return to_show

def gradio_interface():
    with gr.Blocks(title="OpenADMET ADMET Challenge") as demo:

+        gr.Markdown("## Welcome to the OpenADMET + XXX Blind Challenge!")

        # --- Welcome markdown message ---
        welcome_md = """
+# OpenADMET + XXX
## Computational Blind Challenge in ADMET

+Welcome to the **XXX**, hosted by **OpenADMET** in collaboration with **XXX**.
+This is a community-driven initiative to benchmark predictive models for ADMET properties in drug discovery.

Your task is to develop and submit predictive models for key ADMET properties on a blinded test set of real world drug discovery data.

+## ADMET Properties
+*Absorption*, *Distribution*, *Metabolism*, *Excretion*, *Toxicology*--or **ADMET**--endpoints sit in the middle of the assay cascade and can make or break preclinical candidate molecules.
+For this blind challenge we selected several crucial endpoints for the community to predict:
+- LogD
+- Kinetic Solubility (**KSOL**): uM
+- Mouse Liver Microsomal (**MLM**) *CLint*: mL/min/kg
+- Human Liver Microsomal (**HLM**) *CLint*: mL/min/kg
+- Caco-2 Efflux Ratio
+- Caco-2 Papp A>B: 10^-6 cm/s
+- Mouse Plasma Protein Binding (**MPPB**): % Unbound
+- Mouse Brain Protein Binding (**MBPB**): % Unbound
+- Rat Liver Microsomal (**RLM**) *CLint*: mL/min/kg
+- Mouse Gastrocnemius Muscle Binding (**MGMB**): % Unbound
+
+## How to Participate
+1. **Register**: Create an account with Hugging Face.
+2. **Download the Public Dataset**: Clone the XXX dataset [link].
+3. **Train Your Model**: Use the provided training data for each ADMET property of your choice.
+4. **Submit Predictions**: Follow the instructions in the *Submit* tab to upload your predictions.
+5. Join the discussion on the [Challenge Discord](link)!
+
+## Data
+
+The training set will have the following variables:
+
+| Column                       | Unit       | Data type | Description                                       |
+|:-----------------------------|------------|-----------|:--------------------------------------------------|
+| Molecule Name                |            | str       | Identifier for the molecule                       |
+| Smiles                       |            | str       | Text representation of the 2D molecular structure |
+| LogD                         |            | float     | LogD calculation                                  |
+| KSol                         | uM         | float     | Kinetic Solubility                                |
+| MLM CLint                    | mL/min/kg  | float     | Mouse Liver Microsomal Stability                  |
+| HLM CLint                    | mL/min/kg  | float     | Human Liver Microsomal Stability                  |
+| Caco-2 Permeability Efflux   |            | float     | Caco-2 Permeability Efflux                        |
+| Caco-2 Permeability Papp A>B | 10^-6 cm/s | float     | Caco-2 Permeability Papp A>B                      |
+| MPPB                         | % Unbound  | float     | Mouse Plasma Protein Binding                      |
+| MBPB                         | % Unbound  | float     | Mouse Brain Protein Binding                       |
+| RLM CLint                    | mL/min/kg  | float     | Rat Liver Microsomal Stability                    |
+| MGMB                         | % Unbound  | float     | Mouse Gastrocnemius Muscle Binding                |
+
+At test time, we will only provide the Molecule Name and Smiles. Make sure your submission file has the same columns!
+
+## Evaluation
+The challenge will be judged based on the judging criteria outlined here.
+
+- TBD

**Timeline**:
- TBD
@@ -34,35 +92,128 @@ def gradio_interface():
        # --- Gradio Interface ---
        with gr.Tabs(elem_classes="tab-buttons"):

+            with gr.TabItem("About"):
                gr.Markdown(welcome_md)

+            with gr.TabItem("Leaderboard"):
+                gr.Markdown("View the leaderboard for each ADMET endpoint by selecting the appropriate tab.")
+                df1 = pd.DataFrame({
                    "user": ["User1", "User2", "User3"],
+                    "MAE": [0.1, 0.2, 0.15],
                    "R2": [0.94, 0.92, 0.89],
                    "Spearman R": [0.93, 0.91, 0.88],
+                    "Kendall's Tau": [0.90, 0.89, 0.85],
                })
+                df2 = pd.DataFrame({
+                    "user": ["User1", "User2", "User3"],
+                    "MAE": [0.2, 0.3, 0.15],
+                    "R2": [0.2, 0.72, 0.89],
+                    "Spearman R": [0.91, 0.71, 0.68],
+                    "Kendall's Tau": [0.90, 0.4, 0.7],
+                })
+                # Make separate leaderboards in separate tabs
+                mock_data = [df1, df1, df2, df1, df2, df1, df1, df2, df1, df2]
+                for i, endpoint in enumerate(ENDPOINTS):
+                    df = mock_data[i]
+                    with gr.TabItem(endpoint):
+                        Leaderboard(
+                            value=df,
+                            datatype=['str', 'number', 'number', 'number', 'number'],
+                            select_columns=["user", "MAE", "R2", "Spearman R", "Kendall's Tau"],
+                            search_columns=["user"],
+                            every=60,
+                        )

+            with gr.TabItem("Submit Predictions"):
+                gr.Markdown(
+                    """
+                    # ADME Endpoints Submission
+                    Upload your predictions here as a single CSV file.
+                    """
+                )
+                filename = gr.State(value=None)
+                eval_state = gr.State(value=None)
+                user_state = gr.State(value=None)
+
+                with gr.Row():
+
+                    with gr.Column():
+                        gr.Markdown(
+                            """
+                            ## Participant Information
+                            To participate, you must enter a Hugging Face username, or alias, which will be displayed on the leaderboard.
+                            Other information is optional but helps us track participation.
+                            If you wish to be included in Challenge discussions, please provide your Discord username and email.
+                            If you wish to be included in a future publication with the Challenge results, please provide your name and affiliation.
+                            """
+                        )
+                        # endpoint_type = gr.CheckboxGroup(
+                        #     ENDPOINTS,
+                        #     label="ADME Endpoints",
+                        #     info="Select the ADME endpoints you are submitting predictions for."),
+                        # Could also allow a display name in case HF username is not necessary?
+                        username_input = gr.Textbox(
+                            label="Username",
+                            placeholder="Enter your Hugging Face username",
+                            info="This will be displayed on the leaderboard."
+                        )
+                    with gr.Column():
+                        # Info to track the participant; this will not be displayed publicly
+                        participant_name = gr.Textbox(
+                            label="Participant Name",
+                            placeholder="Enter your name (optional)",
+                            info="This will not be displayed on the leaderboard but will be used for tracking participation."
+                        )
+                        discord_username = gr.Textbox(
+                            label="Discord Username",
+                            placeholder="Enter your Discord username (optional)",
+                            info="Enter the username you will use for the Discord channel (if you are planning to engage in the discussion)."
+                        )
+                        email = gr.Textbox(
+                            label="Email",
+                            placeholder="Enter your email (optional)",
+                        )
+                        affiliation = gr.Textbox(
+                            label="Affiliation",
+                            placeholder="Enter your school/company affiliation (optional)",
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown(
+                            """
+                            ## Submission Instructions
+                            Upload a single CSV file containing your predictions for all ligands in the test set.
+                            You can download the ligand test set here (link/to/download/smiles/csv).
+                            """
+                        )
+                    with gr.Column():
+                        predictions_file = gr.File(label="Single file with ADME predictions (.csv)",
+                                                   file_types=[".csv"],
+                                                   file_count="single")
+
+                username_input.change(
+                    fn=lambda x: x if x.strip() else None,
+                    inputs=username_input,
+                    outputs=user_state
+                )
+
+                submit_btn = gr.Button("Submit Predictions")
+                message = gr.Textbox(label="Status", lines=1, visible=False)
+
+                # submit_data returns (status message, uploaded CSV path); the path feeds the filename state used by evaluate_data
+                submit_btn.click(
+                    submit_data,
+                    inputs=[predictions_file, user_state, participant_name, discord_username, email, affiliation],
+                    outputs=[message, filename],
+                ).success(
+                    fn=lambda m: gr.update(value=m, visible=True),
+                    inputs=[message],
+                    outputs=[message],
+                ).success(
+                    fn=evaluate_data,
+                    inputs=[filename],
+                    outputs=[eval_state]
+                )
    return demo

if __name__ == "__main__":
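To make the required submission format concrete, here is a minimal sketch of preparing a prediction file with the expected columns. The molecule names, SMILES, and the constant placeholder predictions are made up for illustration; only the "Molecule Name"/"Smiles" column names and the endpoint names (from about.ENDPOINTS) come from the code above.

# make_submission.py -- illustrative sketch only
import pandas as pd
from about import ENDPOINTS

# Hypothetical blinded test set: only the two identifier columns are provided.
test_df = pd.DataFrame({
    "Molecule Name": ["MOL-001", "MOL-002"],
    "Smiles": ["CCO", "c1ccccc1O"],
})

submission = test_df.copy()
for endpoint in ENDPOINTS:
    # Replace this constant with your model's predictions for each endpoint.
    submission[endpoint] = 0.0

# One CSV with Molecule Name, Smiles, and a column per endpoint,
# matching the column check in evaluate.submit_data.
submission.to_csv("my_predictions.csv", index=False)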
evaluate.py
ADDED
@@ -0,0 +1,211 @@
+import gradio as gr
+import pandas as pd
+from pathlib import Path
+from scipy.stats import spearmanr, kendalltau
+from sklearn.metrics import mean_absolute_error, r2_score
+from typing import Optional
+from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo
+from huggingface_hub import hf_hub_download
+import datetime
+import io
+import json, tempfile
+import pydantic
+
+
+class ParticipantRecord(pydantic.BaseModel):
+    hf_username: Optional[str] = pydantic.Field(default=None, description="Hugging Face username")
+    participant_name: Optional[str] = pydantic.Field(default=None, description="Participant's real name")
+    discord_username: Optional[str] = pydantic.Field(default=None, description="Discord username")
+    email: Optional[str] = pydantic.Field(default=None, description="Email address")
+    affiliation: Optional[str] = pydantic.Field(default=None, description="Affiliation")
+    model_tag: Optional[str] = pydantic.Field(default=None, description="Model tag")
+
+
+class SubmissionMetadata(pydantic.BaseModel):
+    submission_time_utc: datetime.datetime
+    user: str
+    original_filename: str
+    evaluated: bool
+    participant: ParticipantRecord
+
+
+def _safeify_username(username: str) -> str:
+    return str(username.strip()).replace("/", "_").replace(" ", "_")
+
+def _unsafify_username(username: str) -> str:
+    # NOTE: currently identical to _safeify_username; only used to sanitise usernames for file paths
+    return str(username.strip()).replace("/", "_").replace(" ", "_")
+
+def submit_data(predictions_file: str,
+                user_state,
+                participant_name: str = "",
+                discord_username: str = "",
+                email: str = "",
+                affiliation: str = ""
+                ):
+
+    if user_state is None:
+        raise gr.Error("Username or alias is required for submission.")
+
+    file_path = Path(predictions_file).resolve()
+
+    if not file_path.exists():
+        raise gr.Error("Uploaded file object does not have a valid file path.")
+
+    # Read results file
+    try:
+        results_df = pd.read_csv(file_path)
+    except Exception as e:
+        raise gr.Error(f"Error reading results file: {e}")
+
+    if results_df.empty:
+        raise gr.Error("The uploaded file is empty.")
+    if not set(ENDPOINTS).issubset(set(results_df.columns)):
+        raise gr.Error(f"The uploaded file must contain all endpoint predictions {ENDPOINTS} as columns.")
+
+    # TODO: much more validation logic needed depending on the state of the final data
+
+    # Build destination filename in the dataset
+    ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")  # keep the default ISO format so it can be deserialized correctly
+    safe_user = _safeify_username(user_state)
+
+    destination_csv = f"submissions/{safe_user}_{ts}.csv"
+    destination_json = destination_csv.replace(".csv", ".json")
+    # Upload the CSV file
+    API.upload_file(
+        path_or_fileobj=str(file_path),
+        path_in_repo=destination_csv,
+        repo_id=submissions_repo,
+        repo_type="dataset",
+        commit_message=f"Add submission for {safe_user} at {ts}"
+    )
+
+    # Optional participant record
+    try:
+        participant_record = ParticipantRecord(
+            hf_username=user_state,
+            participant_name=participant_name,
+            discord_username=discord_username,
+            email=email,
+            affiliation=affiliation,
+        )
+    except pydantic.ValidationError as e:
+        raise gr.Error(f"Error in participant information: {e}")
+
+    try:
+        meta = SubmissionMetadata(
+            submission_time_utc=ts,
+            user=user_state,
+            original_filename=file_path.name,
+            evaluated=False,
+            participant=participant_record
+        )
+    except pydantic.ValidationError as e:
+        raise gr.Error(f"Error in metadata information: {e}")
+
+    meta_bytes = io.BytesIO(meta.model_dump_json(indent=2).encode("utf-8"))
+
+    API.upload_file(
+        path_or_fileobj=meta_bytes,
+        path_in_repo=destination_json,
+        repo_id=submissions_repo,
+        repo_type="dataset",
+        commit_message=f"Add metadata for {user_state} submission at {ts}"
+    )
+
+    return "Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv
+
+def evaluate_data(filename: str) -> None:
+
+    # Load the submission csv
+    try:
+        local_path = hf_hub_download(
+            repo_id=submissions_repo,
+            repo_type="dataset",
+            filename=filename,
+        )
+    except Exception as e:
+        raise gr.Error(f"Failed to download submission file: {e}")
+
+    # Load the test set
+    try:
+        test_path = hf_hub_download(
+            repo_id=test_repo,
+            repo_type="dataset",
+            filename="data/test_dataset.csv",
+        )
+    except Exception as e:
+        raise gr.Error(f"Failed to download test file: {e}")
+
+    data_df = pd.read_csv(local_path)
+    test_df = pd.read_csv(test_path)
+    try:
+        results_df = calculate_metrics(data_df, test_df)
+        if not isinstance(results_df, pd.DataFrame) or results_df.empty:
+            raise gr.Error("Evaluation produced no results.")
+    except Exception as e:
+        raise gr.Error(f"Evaluation failed: {e}. No results written to results dataset.")
+
+    # Load metadata file
+    meta_filename = filename.replace(".csv", ".json")
+    try:
+        meta_path = hf_hub_download(
+            repo_id=submissions_repo,
+            repo_type="dataset",
+            filename=meta_filename,
+        )
+        with open(meta_path, "r", encoding="utf-8") as f:
+            _meta = json.load(f)
+        meta = SubmissionMetadata(**_meta)
+        username = meta.participant.hf_username
+        timestamp = meta.submission_time_utc
+    except Exception as e:
+        raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.")
+
+    # Write results to results dataset
+    results_df['user'] = username
+    safe_user = _unsafify_username(username)
+    destination_path = f"results/{safe_user}_{timestamp}_results.csv"
+    tmp_name = None
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
+        results_df.to_csv(tmp, index=False)
+        tmp.flush()
+        tmp_name = tmp.name
+
+    API.upload_file(
+        path_or_fileobj=tmp_name,
+        path_in_repo=destination_path,
+        repo_id=results_repo,
+        repo_type="dataset",
+        commit_message=f"Add result data for {username}"
+    )
+    Path(tmp_name).unlink()
+
+
+def calculate_metrics(
+    results_dataframe: pd.DataFrame,
+    test_dataframe: pd.DataFrame
+):
+
+    def metrics_per_ep(pred, true):
+        mae = mean_absolute_error(true, pred)
+        r2 = r2_score(true, pred)
+        spr, _ = spearmanr(true, pred)
+        ktau, _ = kendalltau(true, pred)
+        return mae, r2, spr, ktau
+
+    df_results = pd.DataFrame(columns=["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"])
+    for i, measurement in enumerate(ENDPOINTS):
+        df_pred = results_dataframe[['Molecule Name', measurement]].dropna()
+        df_true = test_dataframe[['Molecule Name', measurement]].dropna()
+        # Make sure both have the same order (assumes predictions cover exactly the molecules measured for this endpoint)
+        pred = df_pred.sort_values(by='Molecule Name')[measurement]
+        true = df_true.sort_values(by='Molecule Name')[measurement]
+        mae, r2, spearman, ktau = metrics_per_ep(pred, true)
+        df_results.loc[i, 'endpoint'] = measurement
+        df_results.loc[i, 'MAE'] = mae
+        df_results.loc[i, 'R2'] = r2
+        df_results.loc[i, 'Spearman R'] = spearman
+        df_results.loc[i, "Kendall's Tau"] = ktau
+
+    return df_results
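As a small illustration of what calculate_metrics returns, the sketch below scores a toy prediction frame against a matching toy test frame. All values are synthetic; in the Space the test frame comes from the private test repo.

# metrics_demo.py -- toy example of the evaluation output
import pandas as pd
from about import ENDPOINTS
from evaluate import calculate_metrics

names = [f"MOL-{i:03d}" for i in range(5)]
true_df = pd.DataFrame({"Molecule Name": names})
pred_df = pd.DataFrame({"Molecule Name": names})
for endpoint in ENDPOINTS:
    true_df[endpoint] = [1.0, 2.0, 3.0, 4.0, 5.0]
    pred_df[endpoint] = [1.1, 1.9, 3.2, 3.8, 5.3]  # imperfect predictions

# One row per endpoint with MAE, R2, Spearman R, and Kendall's Tau.
metrics = calculate_metrics(pred_df, true_df)
print(metrics)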
requirements.txt
CHANGED
@@ -2,4 +2,6 @@ gradio
 datasets
 huggingface_hub
 gradio-leaderboard
-plotly
+plotly
+scipy
+scikit-learn
|