File size: 6,815 Bytes
9346f1c
 
2a5f9fb
1ffc326
8c49cb6
 
 
 
d048ec3
 
 
 
 
 
 
 
 
 
6c63009
d048ec3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d084b26
01233b7
58733e4
6c63009
 
 
 
 
 
8cb7546
6c63009
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0227006
6c63009
 
 
 
 
 
 
 
 
6e8f400
6c63009
 
 
 
 
b323764
6c63009
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0227006
d16cee2
 
 
 
 
67109fc
d16cee2
adb0416
 
d16cee2
d048ec3
 
6c63009
 
 
 
d048ec3
 
 
6c63009
 
 
 
d048ec3
 
 
 
6c63009
d048ec3
6c63009
 
 
 
 
 
 
 
d048ec3
 
6c63009
 
d048ec3
6c63009
 
d048ec3
6c63009
 
 
 
 
 
d048ec3
 
 
6c63009
d048ec3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import gradio as gr
import pandas as pd

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    TITLE,
)
from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO
from src.populate import get_leaderboard_df
from src.utils import initialize_file

# Initialize the results file before anything reads from it
initialize_file(project_repo=RESULTS_REPO, file_path=EVAL_RESULTS_PATH)
# Load the leaderboard from the results TSV
LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")

# Root Gradio container; the layout and events are attached to it further down
demo = gr.Blocks(theme=gr.themes.Monochrome())

# Leaderboard column names, and the subset left once the unselectable ones are removed
columns = LEADERBOARD_DF.columns.tolist()
unselectable_columns = ["model"]
select_column_choices = columns.copy()
for unselectable_column in unselectable_columns:
    select_column_choices.remove(unselectable_column)

# Options offered by the filters: distinct models/tasks in first-seen order,
# plus the fixed list of skills
filter_model_choices = LEADERBOARD_DF["model"].drop_duplicates().tolist()
filter_task_choices = LEADERBOARD_DF["task"].drop_duplicates().tolist()
filter_skill_choices = [
    "Dialogue",
    "Long Context",
    "Numerical Reasoning",
    "Question Answering",
    "Summarisation",
    "Tabular Reasoning",
]

with demo:
    # Page header and a short description of the benchmark.
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of AveniBench results - a permissively licensed benchmark that tests a group of six key "
        "finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, "
        "summarisation and dialogue.", elem_classes="markdown-text",
    )
    gr.Markdown("Open an issue or contact the Authors to include your model into the leaderboard.", elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # The label emoji is written as an escape so that it survives any
        # re-encoding of this file (it previously showed up as mojibake).
        with gr.TabItem("\U0001F3C5 AveniBench Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            # Task filter: every task is selected initially.
            with gr.Row():
                filter_task = gr.CheckboxGroup(
                    label="Select Tasks",
                    choices=filter_task_choices,
                    interactive=True,
                    value=filter_task_choices,
                    elem_id="filter_task",
                    scale=6,
                )
                with gr.Column():
                    select_all_tasks = gr.Button(
                        value="Select all tasks",
                        elem_id="select-all-tasks",
                        size="sm",
                        scale=1,
                    )
                    # ClearButton resets the component it is given (the task checkboxes).
                    deselect_all_tasks = gr.ClearButton(
                        filter_task,
                        value="Deselect all tasks",
                        elem_id="deselect-all-tasks",
                        size="sm",
                        scale=1,
                    )

            # Skill filter: every skill is selected initially.
            with gr.Row():
                filter_skills = gr.CheckboxGroup(
                    label="Select Skills",
                    choices=filter_skill_choices,
                    value=filter_skill_choices,
                    interactive=True,
                    elem_id="filter-language",
                    scale=6,
                )
                with gr.Column():
                    select_all_skills = gr.Button(
                        value="Select all skills",
                        elem_id="select-all-skills",
                        size="sm",
                        scale=1,
                    )
                    # ClearButton resets the component it is given (the skill checkboxes).
                    deselect_all_skills = gr.ClearButton(
                        filter_skills,
                        value="Deselect all skills",
                        elem_id="deselect-all-skills",
                        size="sm",
                        scale=1,
                    )

            # Read-only table that displays the leaderboard.
            with gr.Column():
                leaderboard_table = gr.Dataframe(
                    value=LEADERBOARD_DF,
                    interactive=False,
                    type="pandas",
                    visible=True,
                    label="Leaderboard",
                    elem_id="leaderboard-title",
                )

    # Collapsed citation box with a copy button.
    with gr.Row():
        with gr.Accordion("\U0001F4D9 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )


    def update_leaderboard(filter_task_items, filter_skills_items):
        """Build the leaderboard table for the selected tasks and skills.

        Rows of ``LEADERBOARD_DF`` are kept when their ``task`` is one of
        ``filter_task_items`` and their ``skill`` value contains at least one
        of ``filter_skills_items``. For every remaining task each model gets
        Borda points equal to the rank of its score within that task
        (ascending, ties share the highest rank) minus one. The points are
        summed per model and the models are ranked by that total, best first.

        Args:
            filter_task_items: Task names selected in the task filter.
            filter_skills_items: Skill names selected in the skill filter.

        Returns:
            A DataFrame with a ``model`` column, a ``Borda Count`` column
            (1 = best, ties share the lowest rank) and one score column per
            task rounded to two decimals, sorted by ``Borda Count``. When
            nothing is selected, or no row matches the selection, an empty
            frame with only the ``model`` and ``Borda Count`` columns.
        """
        empty_columns = ["model", "Borda Count"]

        # Empty tasks/skills set:
        if not filter_task_items or not filter_skills_items:
            return pd.DataFrame([], columns=empty_columns)

        selected = LEADERBOARD_DF[LEADERBOARD_DF["task"].isin(filter_task_items)]
        if not selected.empty:
            selected = selected[selected["skill"].apply(
                lambda row_skills: any(skill in row_skills for skill in filter_skills_items)
            )]

        # The chosen tasks and skills may have no rows in common; return the
        # empty board instead of pivoting a frame without data.
        if selected.empty:
            return pd.DataFrame([], columns=empty_columns)

        # Work on an explicit copy so the new column is not assigned to a slice.
        selected = selected[["model", "task", "score"]].copy()

        # Borda points: rank within each task, minus one; unranked (NaN) scores get 0.
        per_task_rank = selected.groupby("task")["score"].rank(ascending=True, method="max")
        selected["borda-score"] = (per_task_rank - 1).fillna(0)

        # One row per model; columns are (value, task) pairs.
        pivoted = selected.pivot(index="model", columns="task", values=["borda-score", "score"])
        borda_count = pivoted["borda-score"].sum(axis=1).rank(ascending=False, method="min")

        # Order the score columns by task name by reordering the data itself,
        # so labels and values can never get out of step.
        scores = pivoted["score"]
        scores = scores[sorted(scores.columns)]

        board = scores.reset_index()
        board.columns = list(board.columns)
        # borda_count shares the pivot's index, so positional insertion is aligned.
        board.insert(1, "Borda Count", borda_count.to_numpy())

        # Sort by borda count
        board = board.sort_values(by="Borda Count", ascending=True)

        # Shorten the long task names for display
        board = board.rename(columns={
            "MultiHiertt EASY": "MHiertt EASY",
            "MultiHiertt HARD": "MHiertt HARD",
        })

        # Round values
        for col in board.columns:
            if col not in ("model", "Borda Count"):
                board[col] = board[col].round(2)

        return board

    # Both filters feed the same callback, which writes to the leaderboard table.
    inputs = [filter_task, filter_skills]
    outputs = [leaderboard_table]
    refresh_event = {"fn": update_leaderboard, "inputs": inputs, "outputs": outputs}

    # Recompute the table whenever either checkbox group changes.
    for component in inputs:
        component.change(**refresh_event)

    # The "select all" buttons put every choice back into their checkbox group.
    select_all_tasks.click(fn=lambda: filter_task_choices, inputs=[], outputs=[filter_task])
    select_all_skills.click(fn=lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])

    # Also run the callback on the app's load event.
    demo.load(**refresh_event)

# Enable the request queue, then start the app.
demo.queue()
demo.launch()