Spaces:

Eurolingua
/

european-llm-leaderboard

Running

ajude committed on Jul 24, 2024

Commit

8e1a43b

1 Parent(s): 2b03fdd

fix(MT-BENCH): Added the following fixes:

1. The model type is now fixed to "chat" for MT-BENCH. Pretrained models are not shown and cannot be selected.
2. Language selection in the MT-BENCH tab is limited to EN, DE, ES, FR, and IT.

Files changed (3) hide show

app.py +10 -10
core.py +34 -4
style.py +7 -0

app.py CHANGED Viewed

@@ -101,28 +101,28 @@ with demo:
         demo.load(
             core.update_task_groups_and_fewshot,
-            [gr.State(value=0), fewshot],
-            [shown_tasks, fewshot, selected_tab],
         )
         fewshot.change(
             core.update_task_groups_and_fewshot,
-            [selected_tab, fewshot],
-            [shown_tasks, fewshot, selected_tab],
         )
         acc.select(
             core.update_task_groups_and_fewshot,
-            inputs=[gr.State(value=0), fewshot],
-            outputs=[shown_tasks, fewshot, selected_tab],
         )
         misc.select(
             core.update_task_groups_and_fewshot,
-            inputs=[gr.State(value=1), fewshot],
-            outputs=[shown_tasks, fewshot, selected_tab],
         )
         mtbench.select(
             core.update_task_groups_and_fewshot,
-            inputs=[gr.State(value=2), fewshot],
-            outputs=[shown_tasks, fewshot, selected_tab],
         )
         for comp, fn in [
             (search_bar, "submit"),

         demo.load(
             core.update_task_groups_and_fewshot,
+            [gr.State(value=0), model_types, langs_bar,fewshot],
+            [shown_tasks, fewshot, selected_tab, model_types, langs_bar],
         )
         fewshot.change(
             core.update_task_groups_and_fewshot,
+            [selected_tab, model_types, langs_bar, fewshot],
+            [shown_tasks, fewshot, selected_tab, model_types, langs_bar],
         )
         acc.select(
             core.update_task_groups_and_fewshot,
+            inputs=[gr.State(value=0), model_types, langs_bar, fewshot],
+            outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
         )
         misc.select(
             core.update_task_groups_and_fewshot,
+            inputs=[gr.State(value=1), model_types, langs_bar, fewshot],
+            outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
         )
         mtbench.select(
             core.update_task_groups_and_fewshot,
+            inputs=[gr.State(value=2), model_types, langs_bar, fewshot],
+            outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
         )
         for comp, fn in [
             (search_bar, "submit"),

core.py CHANGED Viewed

@@ -4,17 +4,17 @@ import os
 import gradio as gr
 import numpy as np
 import pandas as pd
-import plotly.express as px
 from datasets import load_dataset
 import style
 ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
 FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
 def init():
-    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict
     repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
     config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
@@ -29,6 +29,7 @@ def init():
     task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
     task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
     languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
     model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
     model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()
@@ -127,7 +128,7 @@ def update_df(
         return sort_cols(df, fewshot)
-def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current: bool = False):
     selected_task_type = get_selected_task_type(current_selected_tab)
     available_tasks = get_available_task_groups(selected_task_type, is_fewshot_current)
     new_selected_tasks = available_tasks.copy()
@@ -154,7 +155,36 @@ def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current
         interactive=fewshot_available,
     )
-    return [tasks_checkbox_group_update, fewshot_radio_update, current_selected_tab]
 def get_selected_task_type(task_type_id):

 import gradio as gr
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
 import style
+from style import T_SYMBOLS, MT_BENCH_LANG_SYMBOLS, LANG_SYMBOLS
 ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
 FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
 def init():
+    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict, mt_bench_language_list
     repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
     config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
     task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
     task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
     languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
+    mt_bench_language_list = hidden_df[hidden_df['Task_Group'] == "MTBENCH"]["Language"].drop_duplicates().str.upper().tolist()
     model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
     model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()
         return sort_cols(df, fewshot)
+def update_task_groups_and_fewshot(current_selected_tab: int, model_types, langs_bar, is_fewshot_current: bool = False, ):
     selected_task_type = get_selected_task_type(current_selected_tab)
     available_tasks = get_available_task_groups(selected_task_type, is_fewshot_current)
     new_selected_tasks = available_tasks.copy()
         interactive=fewshot_available,
     )
+    if current_selected_tab == 2:
+        model_types = gr.CheckboxGroup(
+            value=[T_SYMBOLS['chat']],
+            interactive=False
+        )
+        langs_bar = gr.CheckboxGroup(
+            choices=[(MT_BENCH_LANG_SYMBOLS.get(l, l), l) for l in mt_bench_language_list],
+            value=mt_bench_language_list,
+            interactive=True,
+        )
+    else:
+        model_types = gr.CheckboxGroup(
+            label="Select model type",
+            choices=[
+                (
+                    f"Pretrained {T_SYMBOLS['pretrained']}",
+                    T_SYMBOLS["pretrained"],
+                ),
+                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
+            ],
+            value=list(T_SYMBOLS.values()),
+            interactive=True
+        )
+        langs_bar = gr.CheckboxGroup(
+            choices=[(LANG_SYMBOLS.get(l, l), l) for l in languages_list],
+            value=languages_list,
+            interactive=True,
+        )
+    return [tasks_checkbox_group_update, fewshot_radio_update, current_selected_tab, model_types, langs_bar]
 def get_selected_task_type(task_type_id):

style.py CHANGED Viewed

@@ -40,3 +40,10 @@ LANG_SYMBOLS = {
     "SV": "🇸🇪 SV"
 }

     "SV": "🇸🇪 SV"
 }
+MT_BENCH_LANG_SYMBOLS = {
+    "ES": "🇪🇸 ES",
+    "EN": "🇬🇧 EN",
+    "DE": "🇩🇪 DE",
+    "FR": "🇫🇷 FR",
+    "IT": "🇮🇹 IT"
+}