ajude committed on
Commit
a22854d
·
1 Parent(s): 5d35aca

feat(MT-Bench): Added MT-Bench to the leaderboard.

Browse files
Files changed (2) hide show
  1. app.py +23 -0
  2. core.py +5 -2
app.py CHANGED
@@ -98,6 +98,12 @@ with demo:
98
  id=1,
99
  ) as misc:
100
  leaderboard_table_misc = gr.Dataframe()
 
 
 
 
 
 
101
 
102
  demo.load(
103
  core.update_task_groups_and_fewshot,
@@ -119,6 +125,11 @@ with demo:
119
  inputs=[gr.State(value=1), fewshot],
120
  outputs=[shown_tasks, fewshot, selected_tab],
121
  )
 
 
 
 
 
122
  for comp, fn in [
123
  (search_bar, "submit"),
124
  (langs_bar, "change"),
@@ -136,6 +147,11 @@ with demo:
136
  [shown_tasks, search_bar, langs_bar, model_types, fewshot],
137
  leaderboard_table_misc,
138
  )
 
 
 
 
 
139
 
140
 
141
  gr.Blocks.load(
@@ -152,4 +168,11 @@ with demo:
152
  outputs=leaderboard_table_misc,
153
  )
154
 
 
 
 
 
 
 
 
155
  demo.launch()
 
98
  id=1,
99
  ) as misc:
100
  leaderboard_table_misc = gr.Dataframe()
101
+ with gr.TabItem(
102
+ "🌐 LLM MT-Bench benchmark",
103
+ elem_id="llm-benchmark-tab-table-mtbench",
104
+ id=2,
105
+ ) as mtbench:
106
+ leaderboard_table_mtbench = gr.Dataframe()
107
 
108
  demo.load(
109
  core.update_task_groups_and_fewshot,
 
125
  inputs=[gr.State(value=1), fewshot],
126
  outputs=[shown_tasks, fewshot, selected_tab],
127
  )
128
+ mtbench.select(
129
+ core.update_task_groups_and_fewshot,
130
+ inputs=[gr.State(value=2), fewshot],
131
+ outputs=[shown_tasks, fewshot, selected_tab],
132
+ )
133
  for comp, fn in [
134
  (search_bar, "submit"),
135
  (langs_bar, "change"),
 
147
  [shown_tasks, search_bar, langs_bar, model_types, fewshot],
148
  leaderboard_table_misc,
149
  )
150
+ getattr(comp, fn)(
151
+ core.update_df,
152
+ [shown_tasks, search_bar, langs_bar, model_types, fewshot],
153
+ leaderboard_table_mtbench,
154
+ )
155
 
156
 
157
  gr.Blocks.load(
 
168
  outputs=leaderboard_table_misc,
169
  )
170
 
171
+ gr.Blocks.load(
172
+ block=demo,
173
+ fn=core.update_df,
174
+ inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
175
+ outputs=leaderboard_table_mtbench,
176
+ )
177
+
178
  demo.launch()
core.py CHANGED
@@ -9,7 +9,7 @@ from datasets import load_dataset
9
 
10
  import style
11
 
12
- ZERO_SHOT_ONLY = ["BELEBELE"]
13
  FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
14
 
15
 
@@ -141,6 +141,9 @@ def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current
141
  elif current_selected_tab == 1:
142
  is_fewshot_new = False
143
  fewshot_available = False
 
 
 
144
 
145
  fewshot_radio_update = gr.Radio(
146
  value=is_fewshot_new,
@@ -151,7 +154,7 @@ def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current
151
 
152
 
153
  def get_selected_task_type(task_type_id):
154
- task_types = {0: "accuracy", 1: "misc"}
155
  selected_task_type = task_types[task_type_id]
156
  return selected_task_type
157
 
 
9
 
10
  import style
11
 
12
+ ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
13
  FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
14
 
15
 
 
141
  elif current_selected_tab == 1:
142
  is_fewshot_new = False
143
  fewshot_available = False
144
+ elif current_selected_tab == 2:
145
+ is_fewshot_new = False
146
+ fewshot_available = False
147
 
148
  fewshot_radio_update = gr.Radio(
149
  value=is_fewshot_new,
 
154
 
155
 
156
  def get_selected_task_type(task_type_id):
157
+ task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score"}
158
  selected_task_type = task_types[task_type_id]
159
  return selected_task_type
160