Merge pull request #14 from OpenGPTX/mt_bench
app.py CHANGED

@@ -1,7 +1,7 @@
 import gradio as gr
 
 import core as core
-from style import CSS, T_SYMBOLS, TITLE
+from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE
 
 demo = gr.Blocks(css=CSS)
 with demo:
@@ -38,7 +38,7 @@ with demo:
         )
         with gr.Row():
             langs_bar = gr.CheckboxGroup(
-                choices=[(LANG_SYMBOLS.get(l,l),l) for l in core.languages_list],
+                choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
                 value=core.languages_list,
                 label="Select languages to average over",
                 elem_id="column-select",
@@ -52,13 +52,11 @@ with demo:
                 size="sm",
                 scale=1,
             )
-            select = gr.Button(
-                value="Select all languages", size="sm", scale=1
-            )
+            select = gr.Button(value="Select all languages", size="sm", scale=1)
 
     def update_bar():
         langs_bar = gr.CheckboxGroup(
-            choices=[(LANG_SYMBOLS.get(l,l),l) for l in core.languages_list],
+            choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
             value=core.languages_list,
             label="Select languages to average over",
             elem_id="column-select",
@@ -83,14 +81,10 @@ with demo:
             label="Select evaluation type",
             scale=29,
         )
-        clear = gr.ClearButton(
-            shown_tasks, value="Deselect all tasks", size="sm", scale=21
-        )
+        clear = gr.ClearButton(shown_tasks, value="Deselect all tasks", size="sm", scale=21)
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem(
-            "π LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
-        ) as acc:
+        with gr.TabItem("π LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0) as acc:
             leaderboard_table = gr.Dataframe()
         with gr.TabItem(
             "π LLM translation benchmark",
@@ -98,6 +92,12 @@ with demo:
             id=1,
         ) as misc:
             leaderboard_table_misc = gr.Dataframe()
+        with gr.TabItem(
+            "π LLM MT-Bench benchmark",
+            elem_id="llm-benchmark-tab-table-mtbench",
+            id=2,
+        ) as mtbench:
+            leaderboard_table_mtbench = gr.Dataframe()
 
     demo.load(
         core.update_task_groups_and_fewshot,
@@ -119,6 +119,11 @@ with demo:
         inputs=[gr.State(value=1), fewshot],
         outputs=[shown_tasks, fewshot, selected_tab],
     )
+    mtbench.select(
+        core.update_task_groups_and_fewshot,
+        inputs=[gr.State(value=2), fewshot],
+        outputs=[shown_tasks, fewshot, selected_tab],
+    )
     for comp, fn in [
         (search_bar, "submit"),
         (langs_bar, "change"),
@@ -136,7 +141,11 @@ with demo:
            [shown_tasks, search_bar, langs_bar, model_types, fewshot],
            leaderboard_table_misc,
        )
-
+        getattr(comp, fn)(
+            core.update_df,
+            [shown_tasks, search_bar, langs_bar, model_types, fewshot],
+            leaderboard_table_mtbench,
+        )
 
     gr.Blocks.load(
         block=demo,
@@ -152,4 +161,11 @@ with demo:
         outputs=leaderboard_table_misc,
     )
 
+    gr.Blocks.load(
+        block=demo,
+        fn=core.update_df,
+        inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
+        outputs=leaderboard_table_mtbench,
+    )
+
 demo.launch()
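The last two hunks extend the existing event-wiring loop: each (component, event-name) pair gets an extra getattr(comp, fn)(...) call so the same filter inputs also refresh leaderboard_table_mtbench, and a third gr.Blocks.load populates that table on page load. A minimal sketch of the getattr dispatch pattern used here, with illustrative component names and a hypothetical stand-in for core.update_df:

import gradio as gr
import pandas as pd

# Hypothetical stand-in for core.update_df: rebuild the table from the inputs.
def update_df(query, langs):
    return pd.DataFrame({"model": ["example"], "match": [query], "n_langs": [len(langs)]})

with gr.Blocks() as sketch:
    search_bar = gr.Textbox(label="Search")
    langs_bar = gr.CheckboxGroup(choices=["en", "de"], value=["en"])
    table = gr.Dataframe()

    # getattr(comp, fn) looks up the event registrar by name, so one loop body
    # can bind the same callback to heterogeneous events (.submit, .change, ...).
    for comp, fn in [(search_bar, "submit"), (langs_bar, "change")]:
        getattr(comp, fn)(update_df, [search_bar, langs_bar], table)

Because every control is wired in one loop, the PR only needs one additional call per iteration to keep the new MT-Bench table in sync with all existing filters.
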
core.py CHANGED

@@ -9,7 +9,7 @@ from datasets import load_dataset
 
 import style
 
-ZERO_SHOT_ONLY = ["BELEBELE"]
+ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
 FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
 
 
@@ -141,6 +141,11 @@ def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current
     elif current_selected_tab == 1:
         is_fewshot_new = False
         fewshot_available = False
+    elif current_selected_tab == 2:
+        is_fewshot_new = False
+        fewshot_available = False
+    else:
+        raise ValueError(f"Unknown tab id {current_selected_tab}")
 
     fewshot_radio_update = gr.Radio(
         value=is_fewshot_new,
@@ -151,7 +156,7 @@ def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current
 
 
 def get_selected_task_type(task_type_id):
-    task_types = {0: "accuracy", 1: "misc"}
+    task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score"}
     selected_task_type = task_types[task_type_id]
     return selected_task_type
 
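Both core.py changes key off the integer tab id that app.py passes in via gr.State(value=2): tab 2 must force zero-shot mode (MT-Bench is also added to ZERO_SHOT_ONLY) and map to the new task-type key. A compressed sketch of that dispatch; get_selected_task_type matches the diff, while the tab-0 flag values and the helper name fewshot_flags_for_tab are assumptions for illustration:

ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]

def fewshot_flags_for_tab(current_selected_tab: int):
    """Return (is_fewshot_new, fewshot_available) for a tab id."""
    if current_selected_tab == 0:
        return True, True  # assumption: the accuracy tab allows few-shot
    elif current_selected_tab in (1, 2):
        # Translation (1) and the new MT-Bench (2) tabs are zero-shot only.
        return False, False
    raise ValueError(f"Unknown tab id {current_selected_tab}")

def get_selected_task_type(task_type_id):
    task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score"}
    return task_types[task_type_id]

assert get_selected_task_type(2) == "mtbench_score"
assert fewshot_flags_for_tab(2) == (False, False)

The explicit else/raise added in the diff turns an unknown tab id into an immediate error instead of letting stale few-shot settings leak through.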