hmacdope-omsf hmacdope committed on
Commit
b77cb84
Β·
verified Β·
1 Parent(s): ac084ef

improving HF space (#1)

Browse files

- Update interface (b4b15c962bc3f447dcb968b7cee4872dea3c6f7c)
- HMO updates (3687c401b7514461058d15607c9783942e793912)


Co-authored-by: Hugo MacDermott-Opeskin <[email protected]>

Files changed (4) hide show
  1. about.py +21 -0
  2. app.py +178 -27
  3. evaluate.py +211 -0
  4. requirements.txt +3 -1
about.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
+ ENDPOINTS = ["LogD",
5
+ "KSol",
6
+ "MLM CLint",
7
+ "HLM CLint",
8
+ "Caco-2 Permeability Efflux",
9
+ "Caco-2 Permeability Papp A>B",
10
+ "MPPB",
11
+ "MBPB",
12
+ "RLM CLint",
13
+ "MGMB"]
14
+
15
+ TOKEN = os.environ.get("HF_TOKEN")
16
+ CACHE_PATH=os.getenv("HF_HOME", ".")
17
+ API = HfApi(token=TOKEN)
18
+ organization="OpenADMET"
19
+ submissions_repo = f'{organization}/openadmet-challenge-submissions' # private
20
+ results_repo = f'{organization}/openadmet-challenge-results' # public
21
+ test_repo = f'{organization}/openadmet-challenge-test-data' # private
app.py CHANGED
@@ -2,27 +2,85 @@ import gradio as gr
2
  from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import pandas as pd
4
 
5
- # dataset = load_dataset("your_dataset_name")
6
-
7
 
 
8
  from datetime import datetime
 
 
9
 
 
 
 
 
 
 
 
10
 
 
11
 
12
  def gradio_interface():
13
  with gr.Blocks(title="OpenADMET ADMET Challenge") as demo:
14
 
15
-
16
 
17
  # --- Welcome markdown message ---
18
  welcome_md = """
19
- # πŸ§ͺ OpenADMET + XXX
20
  ## Computational Blind Challenge in ADMET
21
 
22
- Welcome to the **XXX**, hosted by **OpenADMET** in collaboration with **XXX**.
 
23
 
24
  Your task is to develop and submit predictive models for key ADMET properties on a blinded test set of real world drug discovery data.
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  πŸ“… **Timeline**:
28
  - TBD
@@ -34,35 +92,128 @@ def gradio_interface():
34
  # --- Gradio Interface ---
35
  with gr.Tabs(elem_classes="tab-buttons"):
36
 
37
- with gr.TabItem("Welcome"):
38
  gr.Markdown(welcome_md)
39
 
40
- with gr.TabItem("Submit Predictions"):
41
- gr.Markdown("Upload your prediction files here.")
42
- filename = gr.State(value=None)
43
- eval_state = gr.State(value=None)
44
- user_state = gr.State(value=None)
45
-
46
- with gr.TabItem("Leaderboard"):
47
- gr.Markdown("View the leaderboard here.")
48
- df = pd.DataFrame({
49
  "user": ["User1", "User2", "User3"],
50
- "Model": ["A", "B", "C"],
51
  "R2": [0.94, 0.92, 0.89],
52
  "Spearman R": [0.93, 0.91, 0.88],
 
53
  })
54
- Leaderboard(
55
- value=df,
56
- # Optionally configure columns:
57
- select_columns=["Model", "R2", "Spearman R"],
58
- # Additional options: search_columns, filter_columns, hide_columns, etc.
59
- search_columns=["Model", "user"],
60
- )
61
-
62
-
63
- with gr.TabItem("About"):
64
- gr.Markdown("Learn more about the challenge and the organizers.")
 
 
 
 
 
 
 
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  return demo
67
 
68
  if __name__ == "__main__":
 
2
  from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import pandas as pd
4
 
5
+ from about import submissions_repo, results_repo
6
+ from evaluate import submit_data, evaluate_data
7
 
8
+ from datasets import load_dataset
9
  from datetime import datetime
10
+ from about import ENDPOINTS
11
+
12
 
13
+ def get_leaderboard(dset):
14
+ dset = load_dataset(results_repo, split='train', download_mode="force_redownload")
15
+ full_df = pd.DataFrame(dset)
16
+ to_show = full_df.copy(deep=True)
17
+ to_show = to_show[to_show['user'] != 'test']
18
+ # The columns to display publicly
19
+ to_show = to_show[["user", "Model", "MAE", "R2", "Spearman R", "Kendall's Tau"]]
20
 
21
+ return to_show
22
 
23
  def gradio_interface():
24
  with gr.Blocks(title="OpenADMET ADMET Challenge") as demo:
25
 
26
+ gr.Markdown("## Welcome to the OpenADMET + XXX Blind Challenge!")
27
 
28
  # --- Welcome markdown message ---
29
  welcome_md = """
30
+ # πŸ’Š OpenADMET + XXX
31
  ## Computational Blind Challenge in ADMET
32
 
33
+ Welcome to the **XXX**, hosted by **OpenADMET** in collaboration with **XXX**.
34
+ This is a community-driven initiative to benchmark predictive models for ADMET properties in drug discovery.
35
 
36
  Your task is to develop and submit predictive models for key ADMET properties on a blinded test set of real world drug discovery data.
37
 
38
+ ## ADMET Properties:
39
+ *Absorption*, *Distribution*, *Metabolism*, *Excretion*, *Toxicology*--or **ADMET**--endpoints sit in the middle of the assay cascade and can make or break preclinical candidate molecules.
40
+ For this blind challenge we selected several crucial endpoints for the community to predict:
41
+ - LogD
42
+ - Kinetic Solubility **KSOL**: uM
43
+ - Mouse Liver Microsomal (**MLM**) *CLint*: mL/min/kg
44
+ - Human Liver Microsomal (**HLM**) *Clint*: mL/min/kg
45
+ - Caco-2 Efflux Ratio
46
+ - Caco-2 Papp A>B (10^-6 cm/s)
47
+ - Mouse Plasma Protein Binding (**MPPB**): % Unbound
48
+ - Mouse Brain Protein Binding (**MBPB**): % Unbound
49
+ - Rat Liver Microsomal (**RLM**) *Clint*: mL/min/kg
50
+ - Mouse Gastrocnemius Muscle Binding (**MGMB**): % Unbound
51
+
52
+ ## βœ… How to Participate
53
+ 1. **Register**: Create an account with Hugging Face.
54
+ 2. **Download the Public Dataset**: Clone the XXX dataset [link]
55
+ 3. **Train Your Model**: Use the provided training data for each ADMET property of your choice.
56
+ 4. **Submit Predictions**: Follow the instructions in the *Submit* tab to upload your predictions.
57
+ 5. Join the discussion on the [Challenge Discord](link)!
58
+
59
+ ## πŸ“Š Data:
60
+
61
+ The training set will have the following variables:
62
+
63
+ | Column | Unit | data type | Description |
64
+ |:-----------------------------|-----------|-----------|:-------------|
65
+ | Molecule Name | | str | Identifier for the molecule |
66
+ | Smiles | | str | Text representation of the 2D molecular structure |
67
+ | LogD | | float | LogD calculation |
68
+ | KSol | uM | float | Kinetic Solubility |
69
+ | MLM CLint | mL/min/kg | float | Mouse Liver Microsomal |
70
+ | HLM CLint | mL/min/kg | float | Human Liver Microsomal |
71
+ | Caco-2 Permeability Efflux | | float | Caco-2 Permeability Efflux |
72
+ | Caco-2 Permeability Papp A>B | 10^-6 cm/s| float | Caco-2 Permeability Papp A>B |
73
+ | MPPB | % Unbound | float | Mouse Plasma Protein Binding |
74
+ | MBPB | % Unbound | float | Mouse Brain Protein Binding |
75
+ | RLM CLint | mL/min/kg | float | Rat Liver Microsomal Stability |
76
+ | MGMB. | % Unbound | float | Mouse Gastrocnemius Muscle Binding |
77
+
78
+ At test time, we will only provide the Molecule Name and Smiles. Make sure your submission file has the same columns!
79
+
80
+ ## πŸ“ Evaluation
81
+ The challenge will be judged based on the judging criteria outlined here.
82
+
83
+ - TBD
84
 
85
  πŸ“… **Timeline**:
86
  - TBD
 
92
  # --- Gradio Interface ---
93
  with gr.Tabs(elem_classes="tab-buttons"):
94
 
95
+ with gr.TabItem("πŸ“About"):
96
  gr.Markdown(welcome_md)
97
 
98
+ with gr.TabItem("πŸš€Leaderboard"):
99
+ gr.Markdown("View the leaderboard for each ADMET endpoint by selecting the appropiate tab.")
100
+ df1 = pd.DataFrame({
 
 
 
 
 
 
101
  "user": ["User1", "User2", "User3"],
102
+ "MAE": [0.1, 0.2, 0.15],
103
  "R2": [0.94, 0.92, 0.89],
104
  "Spearman R": [0.93, 0.91, 0.88],
105
+ "Kendall's Tau": [0.90, 0.89, 0.85],
106
  })
107
+ df2 = pd.DataFrame({
108
+ "user": ["User1", "User2", "User3"],
109
+ "MAE": [0.2, 0.3, 0.15],
110
+ "R2": [0.2, 0.72, 0.89],
111
+ "Spearman R": [0.91, 0.71, 0.68],
112
+ "Kendall's Tau": [0.90, 0.4, 0.7],
113
+ })
114
+ # Make separate leaderboards in separate tabs
115
+ mock_data = [df1, df1, df2, df1, df2, df1, df1, df2, df1, df2]
116
+ for i, endpoint in enumerate(ENDPOINTS):
117
+ df = mock_data[i]
118
+ with gr.TabItem(endpoint):
119
+ Leaderboard(
120
+ value=df,
121
+ datatype=['str', 'number', 'number', 'number', 'number'],
122
+ select_columns=["user", "MAE", "R2", "Spearman R", "Kendall's Tau"],
123
+ search_columns=["user"],
124
+ every=60,
125
+ )
126
 
127
+ with gr.TabItem("Submit Predictions"):
128
+ gr.Markdown(
129
+ """
130
+ # ADME Endpoints Submission
131
+ Upload your prediction files here as a csv file.
132
+ """
133
+ )
134
+ filename = gr.State(value=None)
135
+ eval_state = gr.State(value=None)
136
+ user_state = gr.State(value=None)
137
+
138
+ with gr.Row():
139
+
140
+ with gr.Column():
141
+ gr.Markdown(
142
+ """
143
+ ## Participant Information
144
+ To participate, you must enter a Hugging Face username, or alias, which will be displayed on the leaderboard.
145
+ Other information is optional but helps us track participation.
146
+ If you wish to be included in Challenge discussions, please provide your Discord username and email.
147
+ If you wish to be included in a future publication with the Challenge results, please provide your name and affiliation.
148
+ """
149
+ )
150
+ # endpoint_type = gr.CheckboxGroup(
151
+ ## ENDPOINTS,
152
+ # label="ADME Endpoints",
153
+ # info="Select the ADME endpoints you are submitting predictions for."),
154
+ # Could also allow a display name in case HF username is not necessary?
155
+ username_input = gr.Textbox(
156
+ label="Username",
157
+ placeholder="Enter your Hugging Face username",
158
+ info="This will be displayed on the leaderboard."
159
+ )
160
+ with gr.Column():
161
+ # Info to track participant, that will not be displayed publicly
162
+ participant_name = gr.Textbox(
163
+ label="Participant Name",
164
+ placeholder="Enter your name (optional)",
165
+ info="This will not be displayed on the leaderboard but will be used for tracking participation."
166
+ )
167
+ discord_username= gr.Textbox(
168
+ label="Discord Username",
169
+ placeholder="Enter your Discord username (optional)",
170
+ info="Enter the username you will use for the Discord channel (if you are planning to engage in the discussion)."
171
+ )
172
+ email = gr.Textbox(
173
+ label="Email",
174
+ placeholder="Enter your email (optional)",
175
+ )
176
+ affiliation = gr.Textbox(
177
+ label="Affiliation",
178
+ placeholder="Enter your school/company affiliation (optional)",
179
+ )
180
+
181
+ with gr.Row():
182
+ with gr.Column():
183
+ gr.Markdown(
184
+ """
185
+ ## Submission Instructions
186
+ Upload a single CSV file containing your predictions for all ligands in the test set.
187
+ You can download the ligand test set here (lik/to/download/smiles/csv).
188
+ """
189
+ )
190
+ with gr.Column():
191
+ predictions_file = gr.File(label="Single file with ADME predictions (.csv)",
192
+ file_types=[".csv"],
193
+ file_count="single",)
194
+
195
+ username_input.change(
196
+ fn=lambda x: x if x.strip() else None,
197
+ inputs=username_input,
198
+ outputs=user_state
199
+ )
200
+
201
+ submit_btn = gr.Button("Submit Predictions")
202
+ message = gr.Textbox(label="Status", lines=1, visible=False)
203
+
204
+ submit_btn.click(
205
+ submit_data,
206
+ inputs=[predictions_file, user_state, participant_name, discord_username, email, affiliation],
207
+ outputs=[message],
208
+ ).success(
209
+ fn=lambda m: gr.update(value=m, visible=True),
210
+ inputs=[message],
211
+ outputs=[message],
212
+ ).success(
213
+ fn=evaluate_data,
214
+ inputs=[filename],
215
+ outputs=[eval_state]
216
+ )
217
  return demo
218
 
219
  if __name__ == "__main__":
evaluate.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from scipy.stats import spearmanr, kendalltau
5
+ from sklearn.metrics import mean_absolute_error, r2_score
6
+ from typing import Optional
7
+ from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo
8
+ from huggingface_hub import hf_hub_download
9
+ import datetime
10
+ import io
11
+ import json, tempfile
12
+ import pydantic
13
+
14
+
15
+ class ParticipantRecord(pydantic.BaseModel):
16
+ hf_username: Optional[str] = pydantic.Field(default=None, description="Hugging Face username")
17
+ participant_name: Optional[str] = pydantic.Field(default=None, description="Participant's real name")
18
+ discord_username: Optional[str] = pydantic.Field(default=None, description="Discord username")
19
+ email: Optional[str] = pydantic.Field(default=None, description="Email address")
20
+ affiliation: Optional[str] = pydantic.Field(default=None, description="Affiliation")
21
+ model_tag: Optional[str] = pydantic.Field(default=None, description="Model tag")
22
+
23
+
24
+ class SubmissionMetadata(pydantic.BaseModel):
25
+ submission_time_utc: datetime.datetime
26
+ user: str
27
+ original_filename: str
28
+ evaluated: bool
29
+ participant: ParticipantRecord
30
+
31
+
32
+ def _safeify_username(username: str) -> str:
33
+ return str(username.strip()).replace("/", "_").replace(" ", "_")
34
+
35
+ def _unsafify_username(username: str) -> str:
36
+ return str(username.strip()).replace("/", "_").replace(" ", "_")
37
+
38
+ def submit_data(predictions_file: str,
39
+ user_state,
40
+ participant_name: str = "",
41
+ discord_username: str = "",
42
+ email: str = "",
43
+ affiliation: str = ""
44
+ ):
45
+
46
+ if user_state is None:
47
+ raise gr.Error("Username or alias is required for submission.")
48
+
49
+ file_path = Path(predictions_file).resolve()
50
+
51
+ if not file_path.exists():
52
+ raise gr.Error("Uploaded file object does not have a valid file path.")
53
+
54
+ # Read results file
55
+ try:
56
+ results_df = pd.read_csv(file_path)
57
+ except Exception as e:
58
+ return f"❌ Error reading results file: {str(e)}"
59
+
60
+ if results_df.empty:
61
+ return gr.Error("The uploaded file is empty.")
62
+ if not set(ENDPOINTS).issubset(set(results_df.columns)):
63
+ return gr.Error(f"The uploaded file must contain all endpoint predictions {ENDPOINTS} as columns.")
64
+
65
+ # TODO, much more validation logic needed depending on the state of final data
66
+
67
+ # Build destination filename in the dataset
68
+ ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds") # should keep default time so can be deserialized correctly
69
+ safe_user = _safeify_username(user_state)
70
+
71
+ destination_csv = f"submissions/{safe_user}_{ts}.csv"
72
+ destination_json = destination_csv.replace(".csv", ".json")
73
+ # Upload the CSV file
74
+ API.upload_file(
75
+ path_or_fileobj=str(file_path),
76
+ path_in_repo=destination_csv,
77
+ repo_id=submissions_repo,
78
+ repo_type="dataset",
79
+ commit_message=f"Add submission for {safe_user} at {ts}"
80
+ )
81
+
82
+ # Optional participant record
83
+ try:
84
+
85
+ participant_record = ParticipantRecord(
86
+ hf_username=user_state,
87
+ participant_name=participant_name,
88
+ discord_username=discord_username,
89
+ email=email,
90
+ affiliation=affiliation,
91
+ )
92
+ except pydantic.ValidationError as e:
93
+ return f"❌ Error in participant information: {str(e)}"
94
+
95
+
96
+ try:
97
+ meta = SubmissionMetadata(
98
+ submission_time_utc=ts,
99
+ original_filename=file_path.name,
100
+ evaluated=False,
101
+ participant=participant_record
102
+ )
103
+ except pydantic.ValidationError as e:
104
+ return f"❌ Error in metadata information: {str(e)}"
105
+
106
+ meta_bytes = io.BytesIO(json.dumps(meta.model_dump(), indent=2).encode("utf-8"))
107
+
108
+ API.upload_file(
109
+ path_or_fileobj=meta_bytes,
110
+ path_in_repo=destination_json,
111
+ repo_id=submissions_repo,
112
+ repo_type="dataset",
113
+ commit_message=f"Add metadata for {user_state} submission at {ts}"
114
+ )
115
+
116
+ return "βœ… Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv
117
+
118
+ def evaluate_data(filename: str) -> None:
119
+
120
+ # Load the submission csv
121
+ try:
122
+ local_path = hf_hub_download(
123
+ repo_id=submissions_repo,
124
+ repo_type="dataset",
125
+ filename=filename,
126
+ )
127
+ except Exception as e:
128
+ raise gr.Error(f"Failed to download submission file: {e}")
129
+
130
+ # Load the test set
131
+ try:
132
+ test_path = hf_hub_download(
133
+ repo_id=test_repo,
134
+ repo_type="dataset",
135
+ filename="data/test_dataset.csv",
136
+ )
137
+ except Exception as e:
138
+ raise gr.Error(f"Failed to download test file: {e}")
139
+
140
+ data_df = pd.read_csv(local_path)
141
+ test_df = pd.read_csv(test_path)
142
+ try:
143
+ results_df = calculate_metrics(data_df, test_df)
144
+ if not isinstance(results_df, pd.DataFrame) or results_df.empty:
145
+ raise gr.Error("Evaluation produced no results.")
146
+ except Exception as e:
147
+ raise gr.Error(f'Evaluation failed: {e}. No results written to results dataset.')
148
+
149
+ # Load metadata file
150
+ meta_filename = filename.replace(".csv", ".json")
151
+ try:
152
+ meta_path = hf_hub_download(
153
+ repo_id=submissions_repo,
154
+ repo_type="dataset",
155
+ filename=meta_filename,
156
+ )
157
+ with open(meta_path, "r", encoding="utf-8") as f:
158
+ _meta = json.load(f)
159
+ meta = SubmissionMetadata(**_meta)
160
+ username = meta.participant.hf_username
161
+ timestamp = meta.submission_time_utc
162
+ except Exception as e:
163
+ raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.")
164
+
165
+ # Write results to results dataset
166
+ results_df['user'] = username
167
+ safe_user = _unsafify_username(username)
168
+ destination_path = f"results/{safe_user}_{timestamp}_results.csv"
169
+ tmp_name = None
170
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
171
+ results_df.to_csv(tmp, index=False)
172
+ tmp.flush()
173
+ tmp_name = tmp.name
174
+
175
+ API.upload_file(
176
+ path_or_fileobj=tmp_name,
177
+ path_in_repo=destination_path,
178
+ repo_id=results_repo,
179
+ repo_type="dataset",
180
+ commit_message=f"Add result data for {username}"
181
+ )
182
+ Path(tmp_name).unlink()
183
+
184
+
185
+ def calculate_metrics(
186
+ results_dataframe: pd.DataFrame,
187
+ test_dataframe: pd.DataFrame
188
+ ):
189
+
190
+ def metrics_per_ep(pred, true):
191
+ mae = mean_absolute_error(true, pred)
192
+ r2 = r2_score(true, pred)
193
+ spr, _ = spearmanr(true, pred)
194
+ ktau, _ = kendalltau(true, pred)
195
+ return mae, r2, spr, ktau
196
+
197
+ df_results = pd.DataFrame(columns=["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"])
198
+ for i, measurement in enumerate(ENDPOINTS):
199
+ df_pred = results_dataframe[['Molecule Name', measurement]].dropna()
200
+ df_true = test_dataframe[['Molecule Name', measurement]].dropna()
201
+ # Make sure both have the same order
202
+ pred = df_pred.sort_values(by='Molecule Name')[measurement]
203
+ true = df_true.sort_values(by='Molecule Name')[measurement]
204
+ mae, r2, spearman, ktau = metrics_per_ep(pred, true)
205
+ df_results.loc[i, 'endpoint'] = measurement
206
+ df_results.loc[i, 'MAE'] = mae
207
+ df_results.loc[i, 'R2'] = r2
208
+ df_results.loc[i, 'Spearman R'] = spearman
209
+ df_results.loc[i, "Kendall's Tau"] = ktau
210
+
211
+ return df_results
requirements.txt CHANGED
@@ -2,4 +2,6 @@ gradio
2
  datasets
3
  huggingface_hub
4
  gradio-leaderboard
5
- plotly
 
 
 
2
  datasets
3
  huggingface_hub
4
  gradio-leaderboard
5
+ plotly
6
+ scipy
7
+ scikit-learn