change to MASE and geometric mean to aggregate results
- app.py +4 -4
- requirements.txt +2 -1
- src/utils.py +12 -6
app.py
CHANGED
@@ -119,15 +119,15 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
     merged_df = get_merged_df(ori_dataframe, model_info_df)
     new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
     merged_df = merged_df[new_cols]
-    print('Merged df: ', merged_df)
     if sort_val:
         if sort_val in merged_df.columns:
             merged_df = merged_df.sort_values(by=[sort_val])
         else:
             print(f'Warning: cannot sort by {sort_val}')
+    print('Merged df: ', merged_df)
     # get the data type
     datatype_list = [col2type_dict[col] if col in col2type_dict else 'number' for col in merged_df.columns]
-    print('datatype_list: ', datatype_list)
+    # print('datatype_list: ', datatype_list)
     # print('merged_df.column: ', merged_df.columns)
     # ipdb.set_trace()
     return Leaderboard(
@@ -164,7 +164,7 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
             ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
         ],
         # bool_checkboxgroup_label="",
-        column_widths=[40, 150] + [
+        column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns)-2)],
         interactive=False,
     )
 
@@ -176,7 +176,7 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem('🏅 Overall', elem_id="llm-benchmark-tab-table", id=5):
-            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='
+            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
             print(f'FINAL Overall LEADERBOARD {overall_df}')
         with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(domain_df, model_info_df)
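The two functional changes in this file are sorting the overall table by its `Rank` column and sizing columns from the width of the merged DataFrame. A minimal sketch with a hypothetical `merged_df` (the real one comes from `get_merged_df`; column names here are illustrative only), showing what the new `column_widths` expression evaluates to:

import pandas as pd

# Hypothetical stand-in for merged_df; values are made up for illustration.
merged_df = pd.DataFrame({
    'T': ['🟢', '🔵'],
    'Model': ['model-a', 'model-b'],
    'MASE': [0.9, 1.1],
    'CRPS': [0.05, 0.06],
    'Rank': [2, 1],
})

sort_val = 'Rank'
if sort_val in merged_df.columns:
    merged_df = merged_df.sort_values(by=[sort_val])  # model-b now sorts first

# 40 for the type column, 150 for the model column, 180 for every remaining column.
column_widths = [40, 150] + [180 for _ in range(len(merged_df.columns) - 2)]
print(column_widths)  # [40, 150, 180, 180, 180]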
requirements.txt
CHANGED
@@ -14,4 +14,5 @@ tqdm
 transformers
 tokenizers>=0.15.0
 sentencepiece
-ipdb
+ipdb
+scipy
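`scipy` is added because the utils changes below aggregate metrics with `scipy.stats.gmean`. A one-line sanity check of that function:

from scipy import stats

# Geometric mean: the n-th root of the product of n values.
print(stats.gmean([1.0, 4.0]))  # 2.0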
src/utils.py
CHANGED
@@ -2,6 +2,7 @@ import ipdb
 import pandas as pd
 import os
 import re
+from scipy import stats
 
 # Define the formatting function
 def format_number(num):
@@ -45,7 +46,7 @@ def pivot_df(file_name, tab_name):
 
 def rename_metrics(df):
     df = df.rename(columns={
-        'eval_metrics/
+        'eval_metrics/MASE[0.5]': 'MASE',
         'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
         'rank': 'Rank'
     })
@@ -89,7 +90,7 @@ def pivot_existed_df(df, tab_name):
     print('columns', df.columns)
     df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
     df_melted['metric'] = df_melted['metric'].replace({
-        'eval_metrics/
+        'eval_metrics/MASE[0.5]': 'MASE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
         'rank': 'Rank',
     })
@@ -168,9 +169,12 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     df['univariate'] = df['num_variates'] == 1
 
     # group by domain
-    METRIC_CHOICES = ["eval_metrics/
-    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].mean()
+    METRIC_CHOICES = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]
+    # ipdb.set_trace()
+    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].agg(stats.gmean)
+    grouped_results_overall_rank = df.groupby(['model'])[['rank']].mean()
+    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank], axis=1)
 
     # grouped_results_overall = grouped_results_overall.rename(columns={'model':'Model'})
     # grouped_results.to_csv(f'artefacts/grouped_results_by_model.csv')
     grouped_dfs = {}
@@ -236,8 +240,10 @@ def standardize_df(df):
     return df
 
 def group_by(df, col_name):
-    METRIC_CHOICES = ["eval_metrics/
-    grouped_results = df.groupby([col_name, 'model'])[METRIC_CHOICES].
+    METRIC_CHOICES = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]
+    grouped_results = df.groupby([col_name, 'model'])[METRIC_CHOICES].agg(stats.gmean)
+    grouped_results_rank = df.groupby([col_name, 'model'])[['rank']].mean()
+    grouped_results = pd.concat([grouped_results, grouped_results_rank], axis=1)
     # Display the results
     # Write the results to a csv file
     # grouped_results.to_csv(f'grouped_results_by_{col_name}.csv')
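A minimal sketch (not part of the repo) of the aggregation the updated `group_by` performs, using a hypothetical toy results table with the same column names: geometric mean over the two error metrics, arithmetic mean over the rank, joined side by side per (group, model):

import pandas as pd
from scipy import stats

METRIC_CHOICES = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]

# Hypothetical toy table: one domain, two datasets, two models; values made up.
df = pd.DataFrame({
    'domain': ['energy', 'energy', 'energy', 'energy'],
    'dataset': ['ds1', 'ds1', 'ds2', 'ds2'],
    'model': ['A', 'B', 'A', 'B'],
    'eval_metrics/MASE[0.5]': [0.8, 1.2, 1.0, 0.9],
    'eval_metrics/mean_weighted_sum_quantile_loss': [0.05, 0.07, 0.06, 0.04],
    'rank': [1, 2, 2, 1],
})

# Geometric mean of the error metrics per (domain, model) ...
grouped = df.groupby(['domain', 'model'])[METRIC_CHOICES].agg(stats.gmean)
# ... arithmetic mean of the rank, then concatenated, as in group_by.
grouped_rank = df.groupby(['domain', 'model'])[['rank']].mean()
result = pd.concat([grouped, grouped_rank], axis=1)
print(result)  # model A: MASE ≈ 0.894, model B: MASE ≈ 1.039, both rank 1.5

Compared with the arithmetic mean used before, the geometric mean is less dominated by a single dataset with a large error, which is presumably the motivation for the switch named in the commit title.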