|
import gradio as gr |
|
import pandas as pd |
|
from pathlib import Path |
|
from scipy.stats import spearmanr, kendalltau |
|
from sklearn.metrics import mean_absolute_error, r2_score |
|
from typing import Optional |
|
from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo |
|
from huggingface_hub import hf_hub_download |
|
import datetime |
|
import io |
|
import json, tempfile |
|
import pydantic |
|
|
|
|
|
class ParticipantRecord(pydantic.BaseModel):
    """Contact/identity details for a challenge participant.

    All fields are optional; values come from the submission form and are
    stored inside each submission's sidecar metadata JSON.
    """

    hf_username: Optional[str] = pydantic.Field(default=None, description="Hugging Face username")

    participant_name: Optional[str] = pydantic.Field(default=None, description="Participant's real name")

    discord_username: Optional[str] = pydantic.Field(default=None, description="Discord username")

    email: Optional[str] = pydantic.Field(default=None, description="Email address")

    affiliation: Optional[str] = pydantic.Field(default=None, description="Affiliation")

    # NOTE(review): never populated by submit_data in this file — confirm it
    # is set elsewhere or intended for future use.
    model_tag: Optional[str] = pydantic.Field(default=None, description="Model tag")
|
|
|
|
|
class SubmissionMetadata(pydantic.BaseModel):
    """Sidecar metadata stored as JSON next to each submitted predictions CSV.

    Loaded back by evaluate_data to recover the submitting user and the
    submission timestamp.
    """

    # Coerced by pydantic from an ISO-8601 string when constructed/loaded.
    submission_time_utc: datetime.datetime

    # Required: owner username for the submission.
    user: str

    # Name of the file the participant originally uploaded.
    original_filename: str

    # Set to False at submission time; flipped/consumed by the evaluation
    # flow — NOTE(review): not updated anywhere in this file, confirm.
    evaluated: bool

    # Contact details captured from the submission form.
    participant: ParticipantRecord
|
|
|
|
|
def _safeify_username(username: str) -> str: |
|
return str(username.strip()).replace("/", "_").replace(" ", "_") |
|
|
|
def _unsafify_username(username: str) -> str: |
|
return str(username.strip()).replace("/", "_").replace(" ", "_") |
|
|
|
def submit_data(predictions_file: str,
                user_state,
                participant_name: str = "",
                discord_username: str = "",
                email: str = "",
                affiliation: str = ""
                ):
    """Validate an uploaded predictions CSV and store it with its metadata.

    Parameters
    ----------
    predictions_file : str
        Local filesystem path of the uploaded CSV.
    user_state :
        Hugging Face username/alias held in gradio session state; required.
    participant_name, discord_username, email, affiliation : str
        Optional contact details from the submission form.

    Returns
    -------
    tuple[str, str]
        Success message and the in-repo path of the stored CSV.

    Raises
    ------
    gr.Error
        On any validation, parsing, or metadata problem. (Previously some
        failure paths *returned* strings or gr.Error objects — a returned
        gr.Error does nothing in gradio, and a single-value error return
        breaks the two-output success contract — so all failures now raise.)
    """
    if user_state is None:
        raise gr.Error("Username or alias is required for submission.")

    file_path = Path(predictions_file).resolve()
    if not file_path.exists():
        raise gr.Error("Uploaded file object does not have a valid file path.")

    try:
        results_df = pd.read_csv(file_path)
    except Exception as e:
        raise gr.Error(f"Error reading results file: {e}")

    if results_df.empty:
        raise gr.Error("The uploaded file is empty.")
    if not set(ENDPOINTS).issubset(set(results_df.columns)):
        raise gr.Error(f"The uploaded file must contain all endpoint predictions {ENDPOINTS} as columns.")

    ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
    safe_user = _safeify_username(user_state)

    destination_csv = f"submissions/{safe_user}_{ts}.csv"
    destination_json = destination_csv.replace(".csv", ".json")

    # Build and validate ALL metadata before uploading anything, so a
    # validation failure cannot leave an orphaned CSV in the repo.
    try:
        participant_record = ParticipantRecord(
            hf_username=user_state,
            participant_name=participant_name,
            discord_username=discord_username,
            email=email,
            affiliation=affiliation,
        )
        meta = SubmissionMetadata(
            submission_time_utc=ts,
            # Bug fix: `user` is a required field of SubmissionMetadata and
            # was never passed, so every submission failed validation here.
            user=user_state,
            original_filename=file_path.name,
            evaluated=False,
            participant=participant_record,
        )
    except pydantic.ValidationError as e:
        raise gr.Error(f"Error in submission information: {e}")

    API.upload_file(
        path_or_fileobj=str(file_path),
        path_in_repo=destination_csv,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add submission for {safe_user} at {ts}"
    )

    # mode="json" renders the datetime as an ISO string; a plain
    # model_dump() hands json.dumps a datetime object and raises TypeError.
    meta_bytes = io.BytesIO(
        json.dumps(meta.model_dump(mode="json"), indent=2).encode("utf-8")
    )

    API.upload_file(
        path_or_fileobj=meta_bytes,
        path_in_repo=destination_json,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add metadata for {user_state} submission at {ts}"
    )

    return ("✅ Your submission has been received! Your scores will appear on the leaderboard shortly.",
            destination_csv)
|
|
|
def evaluate_data(filename: str) -> None:
    """Download a submission, score it against the test set, upload results.

    Parameters
    ----------
    filename : str
        In-repo path of the submission CSV inside the submissions dataset.

    Raises
    ------
    gr.Error
        If any download, evaluation, or metadata step fails; nothing is
        written to the results dataset in that case.
    """
    # Fetch the participant's predictions CSV.
    try:
        local_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=filename,
        )
    except Exception as e:
        raise gr.Error(f"Failed to download submission file: {e}")

    # Fetch the held-out ground-truth test set.
    try:
        test_path = hf_hub_download(
            repo_id=test_repo,
            repo_type="dataset",
            filename="data/test_dataset.csv",
        )
    except Exception as e:
        raise gr.Error(f"Failed to download test file: {e}")

    data_df = pd.read_csv(local_path)
    test_df = pd.read_csv(test_path)
    try:
        results_df = calculate_metrics(data_df, test_df)
    except Exception as e:
        raise gr.Error(f'Evaluation failed: {e}. No results written to results dataset.')
    # Checked outside the try: previously this gr.Error was raised inside
    # it and immediately re-wrapped by the generic handler above.
    if not isinstance(results_df, pd.DataFrame) or results_df.empty:
        raise gr.Error("Evaluation produced no results.")

    # The sidecar JSON carries the submitting user and the timestamp.
    meta_filename = filename.replace(".csv", ".json")
    try:
        meta_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=meta_filename,
        )
        with open(meta_path, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        meta = SubmissionMetadata(**_meta)
        username = meta.participant.hf_username
        timestamp = meta.submission_time_utc
    except Exception as e:
        raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.")

    results_df['user'] = username
    # Sanitize for a repo path. (_unsafify_username was an identically
    # implemented duplicate; _safeify_username states the intent.)
    safe_user = _safeify_username(username)
    # pydantic parsed the timestamp into a datetime; interpolating it
    # directly would embed a space ("YYYY-MM-DD HH:MM:SS..."), so render
    # it back to ISO-8601 to match the submission filename format.
    ts = timestamp.isoformat(timespec="seconds")
    destination_path = f"results/{safe_user}_{ts}_results.csv"

    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
            results_df.to_csv(tmp, index=False)
            tmp.flush()
            tmp_name = tmp.name

        API.upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo=destination_path,
            repo_id=results_repo,
            repo_type="dataset",
            commit_message=f"Add result data for {username}"
        )
    finally:
        # Clean up the temp file even when the upload raises (it previously
        # leaked on failure).
        if tmp_name is not None:
            Path(tmp_name).unlink(missing_ok=True)
|
|
|
|
|
def calculate_metrics(
        results_dataframe: pd.DataFrame,
        test_dataframe: pd.DataFrame
):
    """Score predictions against ground truth for every endpoint.

    Parameters
    ----------
    results_dataframe : pd.DataFrame
        Participant predictions; must contain 'Molecule Name' and one
        column per endpoint in ENDPOINTS.
    test_dataframe : pd.DataFrame
        Ground-truth values with the same columns.

    Returns
    -------
    pd.DataFrame
        One row per endpoint with MAE, R2, Spearman R and Kendall's Tau.
    """

    def metrics_per_ep(pred, true):
        # MAE/R2 from sklearn; rank correlations from scipy (p-values dropped).
        mae = mean_absolute_error(true, pred)
        r2 = r2_score(true, pred)
        spr, _ = spearmanr(true, pred)
        ktau, _ = kendalltau(true, pred)
        return mae, r2, spr, ktau

    rows = []
    for measurement in ENDPOINTS:
        df_pred = results_dataframe[['Molecule Name', measurement]].dropna()
        df_true = test_dataframe[['Molecule Name', measurement]].dropna()

        # Bug fix: align predictions and truth BY MOLECULE via an inner
        # join. The previous positional pairing of two independently
        # dropna()-ed, sorted frames crashed on a length mismatch — or
        # worse, silently scored mismatched molecules when the lengths
        # happened to agree.
        merged = df_pred.merge(df_true, on='Molecule Name',
                               suffixes=('_pred', '_true'))
        pred = merged[f'{measurement}_pred']
        true = merged[f'{measurement}_true']

        mae, r2, spearman, ktau = metrics_per_ep(pred, true)
        rows.append({
            'endpoint': measurement,
            'MAE': mae,
            'R2': r2,
            'Spearman R': spearman,
            "Kendall's Tau": ktau,
        })

    return pd.DataFrame(rows, columns=["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"])
|
|