Spaces: Running on CPU Upgrade
keep old Average
Browse files
- app.py +1 -1
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +7 -1
app.py
CHANGED
|
@@ -76,7 +76,7 @@ def style_df(df: pd.DataFrame) -> Styler:
|
|
| 76 |
rounding = {'#Params (B)': "{:.1f}"}
|
| 77 |
for task in Tasks:
|
| 78 |
rounding[task.value.col_name] = "{:.2f}"
|
| 79 |
-
for column_name in ["Average ⬆️", "Avg g", "Avg mc"]:
|
| 80 |
rounding[column_name] = "{:.2f}"
|
| 81 |
leaderboard_df_styled = leaderboard_df_styled.format(rounding)
|
| 82 |
return leaderboard_df_styled
|
|
|
|
| 76 |
rounding = {'#Params (B)': "{:.1f}"}
|
| 77 |
for task in Tasks:
|
| 78 |
rounding[task.value.col_name] = "{:.2f}"
|
| 79 |
+
for column_name in ["Average ⬆️", "Avg g", "Avg mc", "Average old"]:
|
| 80 |
rounding[column_name] = "{:.2f}"
|
| 81 |
leaderboard_df_styled = leaderboard_df_styled.format(rounding)
|
| 82 |
return leaderboard_df_styled
|
src/display/utils.py
CHANGED
|
@@ -30,6 +30,7 @@ auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str"
|
|
| 30 |
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
|
| 31 |
#Scores
|
| 32 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
|
|
|
| 33 |
auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
|
| 34 |
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
|
| 35 |
for task in Tasks:
|
|
|
|
| 30 |
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
|
| 31 |
#Scores
|
| 32 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 33 |
+
auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
|
| 34 |
auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
|
| 35 |
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
|
| 36 |
for task in Tasks:
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -157,10 +157,11 @@ class EvalResult:
|
|
| 157 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
| 158 |
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
|
| 159 |
all_tasks = g_tasks + mc_tasks
|
|
|
|
| 160 |
|
| 161 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
| 162 |
|
| 163 |
-
|
| 164 |
# average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
| 165 |
# average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
| 166 |
# print('XXXXXXXXXXXX')
|
|
@@ -249,6 +250,11 @@ class EvalResult:
|
|
| 249 |
except AttributeError:
|
| 250 |
print(f"AttributeError revision")
|
| 251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
try:
|
| 253 |
data_dict[AutoEvalColumn.average.name] = average
|
| 254 |
except KeyError:
|
|
|
|
| 157 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
| 158 |
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
|
| 159 |
all_tasks = g_tasks + mc_tasks
|
| 160 |
+
all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
|
| 161 |
|
| 162 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
| 163 |
|
| 164 |
+
average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
|
| 165 |
# average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
| 166 |
# average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
| 167 |
# print('XXXXXXXXXXXX')
|
|
|
|
| 250 |
except AttributeError:
|
| 251 |
print(f"AttributeError revision")
|
| 252 |
|
| 253 |
+
try:
|
| 254 |
+
data_dict[AutoEvalColumn.average_old.name] = average_old
|
| 255 |
+
except KeyError:
|
| 256 |
+
print(f"Could not find average_old")
|
| 257 |
+
|
| 258 |
try:
|
| 259 |
data_dict[AutoEvalColumn.average.name] = average
|
| 260 |
except KeyError:
|