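"""Submission and evaluation utilities for the leaderboard Space.

Covers uploading prediction CSVs (with JSON metadata sidecars) to the
submissions dataset, scoring them against the held-out test set, and
publishing per-endpoint metrics to the results dataset.
"""
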
import datetime
import io
import json
import tempfile
from pathlib import Path
from typing import Optional

import gradio as gr
import pandas as pd
import pydantic
from huggingface_hub import hf_hub_download
from scipy.stats import spearmanr, kendalltau
from sklearn.metrics import mean_absolute_error, r2_score

from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo


class ParticipantRecord(pydantic.BaseModel):
    hf_username: Optional[str] = pydantic.Field(default=None, description="Hugging Face username")
    participant_name: Optional[str] = pydantic.Field(default=None, description="Participant's real name")
    discord_username: Optional[str] = pydantic.Field(default=None, description="Discord username")
    email: Optional[str] = pydantic.Field(default=None, description="Email address")
    affiliation: Optional[str] = pydantic.Field(default=None, description="Affiliation")
    model_tag: Optional[str] = pydantic.Field(default=None, description="Model tag")


class SubmissionMetadata(pydantic.BaseModel):
    submission_time_utc: datetime.datetime
    user: str
    original_filename: str
    evaluated: bool
    participant: ParticipantRecord


def _safeify_username(username: str) -> str:
    """Normalize a username so it is safe to embed in a repo file path."""
    return str(username.strip()).replace("/", "_").replace(" ", "_")

def submit_data(predictions_file: str,
                user_state: Optional[str],
                participant_name: str = "",
                discord_username: str = "",
                email: str = "",
                affiliation: str = ""
):
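    """Validate the uploaded predictions CSV and push it, together with a
    JSON metadata sidecar, to the submissions dataset."""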
    
    if user_state is None:
        raise gr.Error("Username or alias is required for submission.")
    
    file_path = Path(predictions_file).resolve()

    if not file_path.exists():
        raise gr.Error("Uploaded file object does not have a valid file path.")

    # Read the uploaded predictions file
    try:
        results_df = pd.read_csv(file_path)
    except Exception as e:
        raise gr.Error(f"Error reading results file: {e}")

    if results_df.empty:
        raise gr.Error("The uploaded file is empty.")
    if not set(ENDPOINTS).issubset(set(results_df.columns)):
        raise gr.Error(f"The uploaded file must contain all endpoint predictions {ENDPOINTS} as columns.")
    
    # TODO, much more validation logic needed depending on the state of final data

    # Build destination filename in the dataset
    ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")  # ISO format so pydantic can parse it back to a datetime
    safe_user = _safeify_username(user_state)

    destination_csv = f"submissions/{safe_user}_{ts}.csv"
    destination_json = destination_csv.replace(".csv", ".json")
    # Upload the CSV file
    API.upload_file(
        path_or_fileobj=str(file_path),
        path_in_repo=destination_csv,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add submission for {safe_user} at {ts}"
    )

    # Optional participant record
    try:
        participant_record = ParticipantRecord(
            hf_username=user_state,
            participant_name=participant_name,
            discord_username=discord_username,
            email=email,
            affiliation=affiliation,
        )
    except pydantic.ValidationError as e:
        raise gr.Error(f"Error in participant information: {e}")


    try:
        meta = SubmissionMetadata(
            submission_time_utc=ts,
            user=user_state,
            original_filename=file_path.name,
            evaluated=False,
            participant=participant_record,
        )
    except pydantic.ValidationError as e:
        raise gr.Error(f"Error in metadata information: {e}")

    # model_dump_json serializes the datetime field; json.dumps(model_dump()) would fail on it
    meta_bytes = io.BytesIO(meta.model_dump_json(indent=2).encode("utf-8"))

    API.upload_file(
        path_or_fileobj=meta_bytes,
        path_in_repo=destination_json,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add metadata for {user_state} submission at {ts}"
    )

    return "βœ… Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv

def evaluate_data(filename: str) -> None:
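    """Score a submission CSV against the held-out test set and publish the
    per-endpoint metrics to the results dataset."""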

    # Load the submission csv
    try:
        local_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=filename,
        )
    except Exception as e:
        raise gr.Error(f"Failed to download submission file: {e}")
    
    # Load the test set
    try: 
        test_path = hf_hub_download(
            repo_id=test_repo,
            repo_type="dataset",
            filename="data/test_dataset.csv",
        )
    except Exception as e:
        raise gr.Error(f"Failed to download test file: {e}")
    
    data_df = pd.read_csv(local_path)
    test_df = pd.read_csv(test_path)
    try:
        results_df = calculate_metrics(data_df, test_df)
    except Exception as e:
        raise gr.Error(f"Evaluation failed: {e}. No results written to results dataset.")
    if not isinstance(results_df, pd.DataFrame) or results_df.empty:
        raise gr.Error("Evaluation produced no results. No results written to results dataset.")
    
    # Load metadata file
    meta_filename = filename.replace(".csv", ".json")
    try:
        meta_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=meta_filename,
        )
        with open(meta_path, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        meta = SubmissionMetadata(**_meta)
        # hf_username is optional in ParticipantRecord; fall back to the required user field
        username = meta.participant.hf_username or meta.user
        # Reuse the ISO format from the submission filename for the results path
        timestamp = meta.submission_time_utc.isoformat(timespec="seconds")
    except Exception as e:
        raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.")

    # Write results to results dataset
    results_df['user'] = username
    safe_user = _safeify_username(username)
    destination_path = f"results/{safe_user}_{timestamp}_results.csv"
    tmp_name = None
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
        results_df.to_csv(tmp, index=False)
        tmp_name = tmp.name

    try:
        API.upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo=destination_path,
            repo_id=results_repo,
            repo_type="dataset",
            commit_message=f"Add result data for {username}"
        )
    finally:
        # Clean up the temp file even if the upload fails
        Path(tmp_name).unlink()


def calculate_metrics(
        results_dataframe: pd.DataFrame,
        test_dataframe: pd.DataFrame
    ) -> pd.DataFrame:
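    """Compute MAE, R2, Spearman R, and Kendall's Tau for each endpoint,
    aligning predictions and ground truth by molecule name."""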

    def metrics_per_ep(pred, true):
        mae = mean_absolute_error(true, pred)
        r2 = r2_score(true, pred)
        spr, _ = spearmanr(true, pred)
        ktau, _ = kendalltau(true, pred)
        return mae, r2, spr, ktau

    df_results = pd.DataFrame(columns=["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"])
    for i, measurement in enumerate(ENDPOINTS):
        df_pred = results_dataframe[['Molecule Name', measurement]].dropna()
        df_true = test_dataframe[['Molecule Name', measurement]].dropna()
        # Align predictions and ground truth on molecule, not just row order;
        # an inner merge drops molecules missing from either side
        merged = df_pred.merge(df_true, on='Molecule Name', suffixes=('_pred', '_true'))
        mae, r2, spearman, ktau = metrics_per_ep(
            merged[f'{measurement}_pred'], merged[f'{measurement}_true']
        )
        df_results.loc[i, 'endpoint'] = measurement
        df_results.loc[i, 'MAE'] = mae
        df_results.loc[i, 'R2'] = r2
        df_results.loc[i, 'Spearman R'] = spearman
        df_results.loc[i, "Kendall's Tau"] = ktau

    return df_results
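

# Minimal local sanity check (illustrative only): the CSV paths below are
# assumptions for demonstration, not files shipped with this Space. Both
# files are expected to share a 'Molecule Name' column plus the ENDPOINTS
# prediction columns.
if __name__ == "__main__":
    preds = pd.read_csv("example_predictions.csv")    # hypothetical predictions
    truth = pd.read_csv("example_test_dataset.csv")   # hypothetical ground truth
    print(calculate_metrics(preds, truth))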