mateusz-aveni committed on
Commit
6c63009
·
1 Parent(s): d048ec3

Add borda count instead of an average.

Browse files
Files changed (1) hide show
  1. app.py +90 -67
app.py CHANGED
@@ -16,7 +16,7 @@ initialize_file(project_repo=RESULTS_REPO, file_path=EVAL_RESULTS_PATH)
16
  LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")
17
 
18
  columns = LEADERBOARD_DF.columns.tolist()
19
- demo = gr.Blocks()
20
 
21
  # Choices for the filters
22
  unselectable_columns = ["model"]
@@ -39,64 +39,71 @@ filter_skill_choices = [
39
 
40
  with demo:
41
  gr.HTML(TITLE)
42
-
 
 
 
 
 
43
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
44
- with gr.TabItem("🏅 FinLLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):
45
- with gr.Column():
46
- with gr.Row():
47
- with gr.Column():
48
- with gr.Row():
49
- filter_task = gr.CheckboxGroup(
50
- label="Select Tasks",
51
- choices=filter_task_choices,
52
- interactive=True,
53
- value=filter_task_choices,
54
- elem_id="filter_task",
55
- )
56
- with gr.Column():
57
- select_all_tasks = gr.Button(
58
- value="Select all tasks",
59
- elem_id="select-all-tasks",
60
- interactive=True,
61
- size="sm",
62
- )
63
- deselect_all_tasks = gr.Button(
64
- value="Deselect all tasks",
65
- elem_id="deselect-all-tasks",
66
- interactive=True,
67
- size="sm",
68
- )
69
-
70
- with gr.Row():
71
- filter_skills = gr.CheckboxGroup(
72
- label="Select Skills",
73
- choices=filter_skill_choices,
74
- value=filter_skill_choices,
75
- interactive=True,
76
- elem_id="filter-language"
77
- )
78
- with gr.Column():
79
- select_all_skills = gr.Button(
80
- value="Select all skills",
81
- elem_id="select-all-skills",
82
- interactive=True,
83
- size="sm",
84
- )
85
- deselect_all_skills = gr.Button(
86
- value="Deselect all skills",
87
- elem_id="deselect-all-skills",
88
- interactive=True,
89
- size="sm",
90
- )
91
 
 
 
 
 
 
 
 
 
 
92
  with gr.Column():
93
- leaderboard_table = gr.Dataframe(
94
- value=LEADERBOARD_DF,
95
- interactive=False,
96
- visible=True,
97
- label="Leaderboard",
98
- elem_id="leaderboard-title"
99
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  with gr.Row():
102
  with gr.Accordion("📙 Citation", open=False):
@@ -110,31 +117,49 @@ with demo:
110
 
111
 
112
  def update_leaderboard(filter_task_items, filter_skills_items):
 
 
 
 
113
  filtered_df: pd.DataFrame = LEADERBOARD_DF.copy()
114
 
115
  filtered_df = filtered_df[filtered_df["task"].isin(filter_task_items)]
116
- filtered_df = filtered_df[filtered_df["skill"].isin(filter_skills_items)]
 
 
 
117
 
118
  cols = ["model", "task", "score"]
119
  filtered_df = filtered_df[cols]
120
 
121
- # Create average column
122
  current_task_items = filtered_df["task"].unique().tolist()
123
- filtered_df = filtered_df.pivot(index="model", columns="task", values="score").reset_index()
124
- filtered_df["average"] = filtered_df[current_task_items].mean(axis=1)
 
 
 
 
 
 
125
 
126
  # Reorder columns
127
- filtered_df = filtered_df[["model", "average"] + current_task_items]
 
128
 
129
- # Sort by average
130
- filtered_df = filtered_df.sort_values(by="average", ascending=False)
131
 
132
- # Rename average with symbol
133
- filtered_df = filtered_df.rename(columns={"average": "Average ⬆️"})
 
 
 
 
134
 
135
  # Round values
136
  for col in filtered_df.columns:
137
- if col not in ["model"]:
138
  filtered_df[col] = filtered_df[col].round(2)
139
 
140
  return filtered_df
@@ -149,9 +174,7 @@ with demo:
149
  )
150
 
151
  select_all_tasks.click(lambda: filter_task_choices, inputs=[], outputs=[filter_task])
152
- deselect_all_tasks.click(lambda: [], inputs=[], outputs=[filter_task])
153
  select_all_skills.click(lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])
154
- deselect_all_skills.click(lambda: [], inputs=[], outputs=[filter_skills])
155
 
156
  gr.Blocks.load(
157
  block=demo,
 
16
  LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")
17
 
18
  columns = LEADERBOARD_DF.columns.tolist()
19
+ demo = gr.Blocks(theme=gr.themes.Monochrome())
20
 
21
  # Choices for the filters
22
  unselectable_columns = ["model"]
 
39
 
40
  with demo:
41
  gr.HTML(TITLE)
42
+ gr.Markdown(
43
+ "This is a collection of AveniBench results - a permissively licensed benchmark that tests a group of six key "
44
+ "finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, "
45
+ "summarisation and dialogue.", elem_classes="markdown-text",
46
+ )
47
+ gr.Markdown("Open an issue or contact the Authors to include your model into the leaderboard.", elem_classes="markdown-text")
48
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
49
+ with gr.TabItem("🏅 AveniBench Benchmark", elem_id="llm-benchmark-tab-table", id=0):
50
+ with gr.Row():
51
+ filter_task = gr.CheckboxGroup(
52
+ label="Select Tasks",
53
+ choices=filter_task_choices,
54
+ interactive=True,
55
+ value=filter_task_choices,
56
+ elem_id="filter_task",
57
+ scale=6
58
+ )
59
+ with gr.Column():
60
+ select_all_tasks = gr.Button(
61
+ value="Select all tasks",
62
+ elem_id="select-all-tasks",
63
+ size="sm",
64
+ scale=1
65
+ )
66
+ deselect_all_tasks = gr.ClearButton(
67
+ filter_task,
68
+ value="Deselect all tasks",
69
+ elem_id="deselect-all-tasks",
70
+ size="sm",
71
+ scale=1
72
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ with gr.Row():
75
+ filter_skills = gr.CheckboxGroup(
76
+ label="Select Skills",
77
+ choices=filter_skill_choices,
78
+ value=filter_skill_choices,
79
+ interactive=True,
80
+ elem_id="filter-language",
81
+ scale=6
82
+ )
83
  with gr.Column():
84
+ select_all_skills = gr.Button(
85
+ value="Select all skills",
86
+ elem_id="select-all-skills",
87
+ size="sm",
88
+ scale=1
 
89
  )
90
+ deselect_all_skills = gr.ClearButton(
91
+ filter_skills,
92
+ value="Deselect all skills",
93
+ elem_id="deselect-all-skills",
94
+ size="sm",
95
+ scale=1
96
+ )
97
+
98
+ with gr.Column():
99
+ leaderboard_table = gr.Dataframe(
100
+ value=LEADERBOARD_DF,
101
+ interactive=False,
102
+ type="pandas",
103
+ visible=True,
104
+ label="Leaderboard",
105
+ elem_id="leaderboard-title",
106
+ )
107
 
108
  with gr.Row():
109
  with gr.Accordion("📙 Citation", open=False):
 
117
 
118
 
119
  def update_leaderboard(filter_task_items, filter_skills_items):
120
+ # Empty tasks/skills set:
121
+ if not filter_task_items or not filter_skills_items:
122
+ return pd.DataFrame([], columns=["model", "Borda Count"])
123
+
124
  filtered_df: pd.DataFrame = LEADERBOARD_DF.copy()
125
 
126
  filtered_df = filtered_df[filtered_df["task"].isin(filter_task_items)]
127
+
128
+ filtered_df = filtered_df[filtered_df["skill"].apply(
129
+ lambda x: any(skill in x for skill in filter_skills_items)
130
+ )]
131
 
132
  cols = ["model", "task", "score"]
133
  filtered_df = filtered_df[cols]
134
 
135
+ # Calculate borda count
136
  current_task_items = filtered_df["task"].unique().tolist()
137
+ filtered_df["borda-score"] = 0
138
+ for task in current_task_items:
139
+ filtered_df["borda-score"] += (filtered_df['score'].where(filtered_df["task"] == task)
140
+ .rank(ascending=True, method="max") - 1).fillna(0)
141
+
142
+ filtered_df = filtered_df.pivot(index="model", columns="task", values=["borda-score", "score"]).reset_index()
143
+ filtered_df["borda-score-sum"] = filtered_df["borda-score"].sum(axis=1)
144
+ filtered_df["borda-count"] = filtered_df["borda-score-sum"].rank(ascending=False, method="min")
145
 
146
  # Reorder columns
147
+ filtered_df = filtered_df[["model", "borda-count", "score"]]
148
+ filtered_df.columns = ["model", "borda-count"] + sorted(filtered_df.columns.droplevel(level=0)[2:].tolist())
149
 
150
+ # Sort by borda count
151
+ filtered_df = filtered_df.sort_values(by="borda-count", ascending=True)
152
 
153
+ # Rename borda count with symbol
154
+ filtered_df = filtered_df.rename(columns={
155
+ "borda-count": "Borda Count",
156
+ "MultiHiertt EASY": "MHiertt EASY",
157
+ "MultiHiertt HARD": "MHiertt HARD",
158
+ })
159
 
160
  # Round values
161
  for col in filtered_df.columns:
162
+ if col not in ["model", "Borda Count"]:
163
  filtered_df[col] = filtered_df[col].round(2)
164
 
165
  return filtered_df
 
174
  )
175
 
176
  select_all_tasks.click(lambda: filter_task_choices, inputs=[], outputs=[filter_task])
 
177
  select_all_skills.click(lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])
 
178
 
179
  gr.Blocks.load(
180
  block=demo,