Klaudia Thellmann committed on
Commit e974a37 · unverified · 2 Parent(s): 6ede6ba 2ecf642

Merge pull request #14 from OpenGPTX/mt_bench

Files changed (2):
  1. app.py +29 -13
  2. core.py +7 -2
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 
 import core as core
-from style import CSS, T_SYMBOLS, TITLE, LANG_SYMBOLS
+from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE
 
 demo = gr.Blocks(css=CSS)
 with demo:
@@ -38,7 +38,7 @@ with demo:
         )
     with gr.Row():
         langs_bar = gr.CheckboxGroup(
-            choices=[(LANG_SYMBOLS.get(l,l),l) for l in core.languages_list],
+            choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
             value=core.languages_list,
             label="Select languages to average over",
             elem_id="column-select",
@@ -52,13 +52,11 @@ with demo:
                 size="sm",
                 scale=1,
             )
-            select = gr.Button(
-                value="Select all languages", size="sm", scale=1
-            )
+            select = gr.Button(value="Select all languages", size="sm", scale=1)
 
         def update_bar():
             langs_bar = gr.CheckboxGroup(
-                choices=[(LANG_SYMBOLS.get(l,l),l) for l in core.languages_list],
+                choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
                 value=core.languages_list,
                 label="Select languages to average over",
                 elem_id="column-select",
@@ -83,14 +81,10 @@ with demo:
                 label="Select evaluation type",
                 scale=29,
             )
-            clear = gr.ClearButton(
-                shown_tasks, value="Deselect all tasks", size="sm", scale=21
-            )
+            clear = gr.ClearButton(shown_tasks, value="Deselect all tasks", size="sm", scale=21)
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem(
-            "🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
-        ) as acc:
+        with gr.TabItem("🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0) as acc:
             leaderboard_table = gr.Dataframe()
         with gr.TabItem(
             "🌐 LLM translation benchmark",
@@ -98,6 +92,12 @@ with demo:
             id=1,
         ) as misc:
             leaderboard_table_misc = gr.Dataframe()
+        with gr.TabItem(
+            "🌐 LLM MT-Bench benchmark",
+            elem_id="llm-benchmark-tab-table-mtbench",
+            id=2,
+        ) as mtbench:
+            leaderboard_table_mtbench = gr.Dataframe()
 
     demo.load(
         core.update_task_groups_and_fewshot,
@@ -119,6 +119,11 @@ with demo:
         inputs=[gr.State(value=1), fewshot],
         outputs=[shown_tasks, fewshot, selected_tab],
     )
+    mtbench.select(
+        core.update_task_groups_and_fewshot,
+        inputs=[gr.State(value=2), fewshot],
+        outputs=[shown_tasks, fewshot, selected_tab],
+    )
     for comp, fn in [
         (search_bar, "submit"),
         (langs_bar, "change"),
@@ -136,7 +141,11 @@ with demo:
             [shown_tasks, search_bar, langs_bar, model_types, fewshot],
             leaderboard_table_misc,
         )
-
+        getattr(comp, fn)(
+            core.update_df,
+            [shown_tasks, search_bar, langs_bar, model_types, fewshot],
+            leaderboard_table_mtbench,
+        )
 
     gr.Blocks.load(
         block=demo,
@@ -152,4 +161,11 @@ with demo:
         outputs=leaderboard_table_misc,
     )
 
+    gr.Blocks.load(
+        block=demo,
+        fn=core.update_df,
+        inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
+        outputs=leaderboard_table_mtbench,
+    )
+
 demo.launch()
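
The app.py changes reuse the wiring pattern already in place for the accuracy and translation tabs: each gr.TabItem registers a select handler that passes its fixed tab id in through a gr.State input, and the (comp, fn) event loop re-renders every leaderboard table, now including leaderboard_table_mtbench. A minimal, self-contained sketch of that pattern under stock Gradio (component and handler names here are illustrative, not the app's own):

# Sketch of the tab-select wiring used above; names are hypothetical.
import gradio as gr

def describe_tab(tab_id: int) -> str:
    # Like core.update_task_groups_and_fewshot, the handler receives the
    # tab id through a constant gr.State input, not from the event payload.
    task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score"}
    return f"Selected task type: {task_types[tab_id]}"

with gr.Blocks() as sketch:
    status = gr.Textbox(label="Task type")
    with gr.Tabs():
        with gr.TabItem("Accuracy", id=0) as acc_tab:
            gr.Dataframe()
        with gr.TabItem("MT-Bench", id=2) as mtbench_tab:
            gr.Dataframe()
    # One select handler per tab, each bound to its own fixed id,
    # mirroring the acc/misc/mtbench handlers in app.py.
    acc_tab.select(describe_tab, inputs=[gr.State(value=0)], outputs=status)
    mtbench_tab.select(describe_tab, inputs=[gr.State(value=2)], outputs=status)

sketch.launch()

Feeding the id through gr.State keeps a single shared handler for all tabs instead of one closure per tab.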
core.py CHANGED
@@ -9,7 +9,7 @@ from datasets import load_dataset
 
 import style
 
-ZERO_SHOT_ONLY = ["BELEBELE"]
+ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
 FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
 
 
@@ -141,6 +141,11 @@ def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current
     elif current_selected_tab == 1:
         is_fewshot_new = False
         fewshot_available = False
+    elif current_selected_tab == 2:
+        is_fewshot_new = False
+        fewshot_available = False
+    else:
+        raise ValueError(f"Unknown tab id {current_selected_tab}")
 
     fewshot_radio_update = gr.Radio(
         value=is_fewshot_new,
@@ -151,7 +156,7 @@ def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current
 
 
 def get_selected_task_type(task_type_id):
-    task_types = {0: "accuracy", 1: "misc"}
+    task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score"}
     selected_task_type = task_types[task_type_id]
     return selected_task_type
 
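
The core.py side is a pure lookup extension: MT-Bench joins ZERO_SHOT_ONLY, tab id 2 disables few-shot exactly as the translation tab does, and get_selected_task_type now resolves id 2 to "mtbench_score". For a quick check, the function below is copied verbatim from the diff; note that an unknown id still raises a plain KeyError here, since only update_task_groups_and_fewshot gained the explicit ValueError:

def get_selected_task_type(task_type_id):
    task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score"}
    selected_task_type = task_types[task_type_id]
    return selected_task_type

assert get_selected_task_type(2) == "mtbench_score"  # id of the new MT-Bench tab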