Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py  CHANGED  +26 -28
@@ -161,6 +161,7 @@ class EvalResult_MIB:
         # return data_dict


+
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
@@ -168,48 +169,45 @@ class EvalResult_MIB:
             "Method": self.method_name,
         }

+        # Initialize all possible columns with '-'
+        expected_models = ["meta_llama", "qwen", "gpt2"]
+        expected_tasks = ["ioi", "mcqa"]
+        for task in expected_tasks:
+            for model in expected_models:
+                data_dict[f"{task}_{model}"] = '-'
+
         all_scores = []
-        required_entries = {
-            'ioi_meta_llama': False,
-            'ioi_qwen': False,
-            'ioi_gpt2': False,
-            'mcqa_meta_llama': False,
-            'mcqa_qwen': False,
-            'mcqa_gpt2': False
-        }
-
         for task, task_results in self.results.items():
-
-
-            for model in models:
+            for model, metrics in task_results.items():
                 col_name = f"{task}_{model}"
-                metrics = task_results[model]

-                # Handle empty lists case
                 if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
-                    data_dict[col_name] = '-'
                     continue

                 faithfulness = metrics["faithfulness"]
                 if isinstance(faithfulness[0], list):
                     faithfulness = faithfulness[0]
-
+
                 result = compute_area(metrics["edge_counts"], faithfulness)
                 if result is None or result[0] is None:
-
-
-
-
-
-
-                required_entries[col_name] = True
+                    continue
+
+                area_under, _, _ = result
+                score = area_under * 100
+                data_dict[col_name] = round(score, 2)
+                all_scores.append(score)

-                #
-
-                data_dict[
-
-                data_dict[
+        # All entries must be present for average
+        required_entries = [
+            data_dict['ioi_meta_llama'] != '-',
+            data_dict['ioi_qwen'] != '-',
+            data_dict['ioi_gpt2'] != '-',
+            data_dict['mcqa_meta_llama'] != '-',
+            data_dict['mcqa_qwen'] != '-',
+            data_dict['mcqa_gpt2'] != '-'
+        ]

+        data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
         return data_dict

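For context, the updated to_dict only relies on compute_area returning a 3-tuple whose first element is the area under the faithfulness-vs-edge-count curve (or None when it cannot be computed). The sketch below is hypothetical and not taken from this repository: the stand-in compute_area, its normalization by the edge-count range, and the example results dict are illustrative assumptions. It reproduces the new behavior that "Average" stays '-' unless all six task/model cells receive a score.

import numpy as np

# Hypothetical stand-in for compute_area (the real helper in read_evals.py may
# differ): trapezoidal area of faithfulness over edge counts, normalized by the
# edge-count range so it stays in [0, 1]. Returns a 3-tuple whose first element
# is the area, matching how to_dict() unpacks it.
def compute_area(edge_counts, faithfulness):
    if not edge_counts or not faithfulness or len(edge_counts) < 2:
        return None, None, None
    area = sum(
        (faithfulness[i] + faithfulness[i + 1]) / 2 * (edge_counts[i + 1] - edge_counts[i])
        for i in range(len(edge_counts) - 1)
    )
    return area / (edge_counts[-1] - edge_counts[0]), edge_counts, faithfulness

# Hypothetical results keyed like EvalResult_MIB.results: ioi/gpt2 has no data,
# so its cell stays '-' and the average is withheld.
results = {
    "ioi": {
        "meta_llama": {"edge_counts": [100, 200, 400], "faithfulness": [0.61, 0.72, 0.80]},
        "qwen":       {"edge_counts": [100, 200, 400], "faithfulness": [0.58, 0.66, 0.74]},
        "gpt2":       {"edge_counts": [], "faithfulness": []},
    },
    "mcqa": {
        "meta_llama": {"edge_counts": [100, 200, 400], "faithfulness": [0.50, 0.55, 0.63]},
        "qwen":       {"edge_counts": [100, 200, 400], "faithfulness": [0.47, 0.52, 0.59]},
        "gpt2":       {"edge_counts": [100, 200, 400], "faithfulness": [0.44, 0.49, 0.57]},
    },
}

data_dict = {f"{t}_{m}": '-' for t in ("ioi", "mcqa") for m in ("meta_llama", "qwen", "gpt2")}
all_scores = []
for task, task_results in results.items():
    for model, metrics in task_results.items():
        if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
            continue
        result = compute_area(metrics["edge_counts"], metrics["faithfulness"])
        if result is None or result[0] is None:
            continue
        score = result[0] * 100
        data_dict[f"{task}_{model}"] = round(score, 2)
        all_scores.append(score)

# All six cells must be filled before an Average is reported.
complete = all(v != '-' for v in data_dict.values())
data_dict["Average"] = round(np.mean(all_scores), 2) if complete else '-'
print(data_dict["Average"])  # '-' here; give ioi/gpt2 data and it becomes the mean of six scores

Pre-filling every expected column with '-' before the loop is what lets the completeness check at the end double as both the "all entries present" gate for the average and the display placeholder for missing cells.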