import datetime
import io
import json
import tempfile
from pathlib import Path
from typing import Optional

import gradio as gr
import pandas as pd
import pydantic
from huggingface_hub import hf_hub_download
from scipy.stats import spearmanr, kendalltau
from sklearn.metrics import mean_absolute_error, r2_score

from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo

class ParticipantRecord(pydantic.BaseModel):
    hf_username: Optional[str] = pydantic.Field(default=None, description="Hugging Face username")
    participant_name: Optional[str] = pydantic.Field(default=None, description="Participant's real name")
    discord_username: Optional[str] = pydantic.Field(default=None, description="Discord username")
    email: Optional[str] = pydantic.Field(default=None, description="Email address")
    affiliation: Optional[str] = pydantic.Field(default=None, description="Affiliation")
    model_tag: Optional[str] = pydantic.Field(default=None, description="Model tag")


class SubmissionMetadata(pydantic.BaseModel):
    submission_time_utc: datetime.datetime
    user: str
    original_filename: str
    evaluated: bool
    participant: ParticipantRecord
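
# For reference, the metadata JSON stored next to each submission CSV has the
# shape below (illustrative values only; nullable participant fields omitted):
# {
#   "submission_time_utc": "2025-01-01T00:00:00+00:00",
#   "user": "some-user",
#   "original_filename": "predictions.csv",
#   "evaluated": false,
#   "participant": {"hf_username": "some-user", "participant_name": null, ...}
# }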


def _safeify_username(username: str) -> str:
    return str(username.strip()).replace("/", "_").replace(" ", "_")
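
# Note: the transform above is lossy, e.g. _safeify_username("org/user name")
# returns "org_user_name", so it cannot be reliably inverted; callers that need
# a filesystem-safe name re-apply it rather than trying to undo it.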


def submit_data(predictions_file: str,
                user_state,
                participant_name: str = "",
                discord_username: str = "",
                email: str = "",
                affiliation: str = ""
                ):
    if user_state is None:
        raise gr.Error("Username or alias is required for submission.")
    file_path = Path(predictions_file).resolve()
    if not file_path.exists():
        raise gr.Error("Uploaded file object does not have a valid file path.")
    # Read results file; raise gr.Error consistently rather than returning
    # mixed string/Error values, so Gradio surfaces failures the same way.
    try:
        results_df = pd.read_csv(file_path)
    except Exception as e:
        raise gr.Error(f"❌ Error reading results file: {e}")
    if results_df.empty:
        raise gr.Error("The uploaded file is empty.")
    if not set(ENDPOINTS).issubset(set(results_df.columns)):
        raise gr.Error(f"The uploaded file must contain all endpoint predictions {ENDPOINTS} as columns.")
    # TODO: much more validation logic needed depending on the state of final data
    # Build destination filename in the dataset
    ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")  # keep the default ISO format so it deserializes correctly
    safe_user = _safeify_username(user_state)
    destination_csv = f"submissions/{safe_user}_{ts}.csv"
    destination_json = destination_csv.replace(".csv", ".json")
    # Upload the CSV file
    API.upload_file(
        path_or_fileobj=str(file_path),
        path_in_repo=destination_csv,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add submission for {safe_user} at {ts}",
    )
    # Optional participant record
    try:
        participant_record = ParticipantRecord(
            hf_username=user_state,
            participant_name=participant_name,
            discord_username=discord_username,
            email=email,
            affiliation=affiliation,
        )
    except pydantic.ValidationError as e:
        raise gr.Error(f"❌ Error in participant information: {e}")
    try:
        meta = SubmissionMetadata(
            submission_time_utc=ts,
            user=user_state,  # required field; omitting it fails validation
            original_filename=file_path.name,
            evaluated=False,
            participant=participant_record,
        )
    except pydantic.ValidationError as e:
        raise gr.Error(f"❌ Error in metadata information: {e}")
    # model_dump_json handles the datetime field; json.dumps(meta.model_dump())
    # would fail because datetime objects are not JSON-serializable.
    meta_bytes = io.BytesIO(meta.model_dump_json(indent=2).encode("utf-8"))
    API.upload_file(
        path_or_fileobj=meta_bytes,
        path_in_repo=destination_json,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add metadata for {user_state} submission at {ts}",
    )
    return "✅ Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv
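
# A sketch of how submit_data is typically wired into the Blocks UI (component
# names here are hypothetical; the actual layout lives in the app module):
#
#     submit_btn.click(
#         submit_data,
#         inputs=[file_upload, user_state, name_box, discord_box, email_box, affiliation_box],
#         outputs=[status_markdown, submission_path_state],
#     )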


def evaluate_data(filename: str) -> None:
    # Load the submission csv
    try:
        local_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=filename,
        )
    except Exception as e:
        raise gr.Error(f"Failed to download submission file: {e}")
    # Load the test set
    try:
        test_path = hf_hub_download(
            repo_id=test_repo,
            repo_type="dataset",
            filename="data/test_dataset.csv",
        )
    except Exception as e:
        raise gr.Error(f"Failed to download test file: {e}")
    data_df = pd.read_csv(local_path)
    test_df = pd.read_csv(test_path)
    try:
        results_df = calculate_metrics(data_df, test_df)
    except Exception as e:
        raise gr.Error(f"Evaluation failed: {e}. No results written to results dataset.")
    # Checked outside the try block so this gr.Error is not swallowed and
    # re-wrapped by the generic handler above.
    if not isinstance(results_df, pd.DataFrame) or results_df.empty:
        raise gr.Error("Evaluation produced no results.")
    # Load metadata file
    meta_filename = filename.replace(".csv", ".json")
    try:
        meta_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=meta_filename,
        )
        with open(meta_path, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        meta = SubmissionMetadata(**_meta)
        username = meta.participant.hf_username
        timestamp = meta.submission_time_utc
    except Exception as e:
        raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.")
    # Write results to results dataset
    results_df['user'] = username
    safe_user = _safeify_username(username)
    # timestamp is a datetime after pydantic parsing, so format it explicitly
    destination_path = f"results/{safe_user}_{timestamp.isoformat()}_results.csv"
    tmp_name = None
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
        results_df.to_csv(tmp, index=False)
        tmp.flush()
        tmp_name = tmp.name
    API.upload_file(
        path_or_fileobj=tmp_name,
        path_in_repo=destination_path,
        repo_id=results_repo,
        repo_type="dataset",
        commit_message=f"Add result data for {username}",
    )
    Path(tmp_name).unlink()
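
# evaluate_data consumes the repo-relative path that submit_data returns
# (e.g. "submissions/<user>_<ts>.csv"); presumably the app chains it after a
# successful upload or runs it from a background job.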


def calculate_metrics(
    results_dataframe: pd.DataFrame,
    test_dataframe: pd.DataFrame
) -> pd.DataFrame:
    def metrics_per_ep(pred, true):
        mae = mean_absolute_error(true, pred)
        r2 = r2_score(true, pred)
        spr, _ = spearmanr(true, pred)
        ktau, _ = kendalltau(true, pred)
        return mae, r2, spr, ktau

    df_results = pd.DataFrame(columns=["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"])
    for i, measurement in enumerate(ENDPOINTS):
        df_pred = results_dataframe[['Molecule Name', measurement]].dropna()
        df_true = test_dataframe[['Molecule Name', measurement]].dropna()
        # Align predictions and ground truth on molecule name; an inner merge
        # guarantees the same molecules in the same order even when one side
        # dropped rows as NaN, which sorting each side separately does not.
        merged = df_pred.merge(df_true, on='Molecule Name', suffixes=('_pred', '_true'))
        pred = merged[f'{measurement}_pred']
        true = merged[f'{measurement}_true']
        mae, r2, spearman, ktau = metrics_per_ep(pred, true)
        df_results.loc[i, 'endpoint'] = measurement
        df_results.loc[i, 'MAE'] = mae
        df_results.loc[i, 'R2'] = r2
        df_results.loc[i, 'Spearman R'] = spearman
        df_results.loc[i, "Kendall's Tau"] = ktau
    return df_results
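

if __name__ == "__main__":
    # Minimal local smoke test for calculate_metrics on synthetic data. This is
    # a sketch, not part of the Space's runtime path; it assumes every entry in
    # ENDPOINTS is a numeric column and that 'Molecule Name' is the join key.
    import numpy as np

    rng = np.random.default_rng(0)
    names = [f"mol_{i}" for i in range(10)]
    test_df = pd.DataFrame({"Molecule Name": names})
    pred_df = pd.DataFrame({"Molecule Name": names})
    for ep in ENDPOINTS:
        truth = rng.normal(size=len(names))
        test_df[ep] = truth
        pred_df[ep] = truth + rng.normal(scale=0.1, size=len(names))  # noisy "predictions"
    print(calculate_metrics(pred_df, test_df))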