# NOTE(review): lines of scraped Hugging Face web-page chrome (Space status,
# file size, commit hashes, line-number gutter) were removed here — they were
# extraction residue, not part of the source file.
import gradio as gr
import pandas as pd
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
TITLE,
)
from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO
from src.populate import get_leaderboard_df
from src.utils import initialize_file
# Make sure the results file is present locally before it is read.
initialize_file(project_repo=RESULTS_REPO, file_path=EVAL_RESULTS_PATH)

# Load the full leaderboard table from the results TSV.
LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")
columns = LEADERBOARD_DF.columns.tolist()

demo = gr.Blocks(theme=gr.themes.Monochrome())

# Columns that must never be offered as selectable choices.
unselectable_columns = ["model"]
select_column_choices = list(columns)
for hidden_column in unselectable_columns:
    select_column_choices.remove(hidden_column)

# Options for the filter widgets, taken from the loaded data.
filter_model_choices = LEADERBOARD_DF["model"].unique().tolist()
filter_task_choices = LEADERBOARD_DF["task"].unique().tolist()
# The six skills covered by AveniBench (fixed list, not data-derived).
filter_skill_choices = [
    "Dialogue",
    "Long Context",
    "Numerical Reasoning",
    "Question Answering",
    "Summarisation",
    "Tabular Reasoning",
]
# Build the UI: title, intro text, task/skill filters, leaderboard table
# and citation accordion. The event wiring happens further down the file.
with demo:
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of AveniBench results - a permissively licensed benchmark that tests a group of six key "
        "finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, "
        "summarisation and dialogue.",
        elem_classes="markdown-text",
    )
    gr.Markdown(
        "Open an issue or contact the Authors to include your model into the leaderboard.",
        elem_classes="markdown-text",
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # FIX: the original tab label was a string literal broken across two
        # lines (a syntax error) with a garbled leading emoji ("π").
        # NOTE(review): "🏅" restored from the standard HF leaderboard
        # template — confirm against the deployed Space.
        with gr.TabItem("🏅 AveniBench Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                # Task filter: all tasks selected by default.
                filter_task = gr.CheckboxGroup(
                    label="Select Tasks",
                    choices=filter_task_choices,
                    interactive=True,
                    value=filter_task_choices,
                    elem_id="filter_task",
                    scale=6,
                )
                with gr.Column():
                    select_all_tasks = gr.Button(
                        value="Select all tasks",
                        elem_id="select-all-tasks",
                        size="sm",
                        scale=1,
                    )
                    # ClearButton empties the checkbox group on its own;
                    # no extra callback is needed for deselection.
                    deselect_all_tasks = gr.ClearButton(
                        filter_task,
                        value="Deselect all tasks",
                        elem_id="deselect-all-tasks",
                        size="sm",
                        scale=1,
                    )
            with gr.Row():
                # Skill filter: all six skills selected by default.
                filter_skills = gr.CheckboxGroup(
                    label="Select Skills",
                    choices=filter_skill_choices,
                    value=filter_skill_choices,
                    interactive=True,
                    elem_id="filter-language",
                    scale=6,
                )
                with gr.Column():
                    select_all_skills = gr.Button(
                        value="Select all skills",
                        elem_id="select-all-skills",
                        size="sm",
                        scale=1,
                    )
                    deselect_all_skills = gr.ClearButton(
                        filter_skills,
                        value="Deselect all skills",
                        elem_id="deselect-all-skills",
                        size="sm",
                        scale=1,
                    )
            with gr.Column():
                # Read-only table; its value is refreshed by update_leaderboard.
                leaderboard_table = gr.Dataframe(
                    value=LEADERBOARD_DF,
                    interactive=False,
                    type="pandas",
                    visible=True,
                    label="Leaderboard",
                    elem_id="leaderboard-title",
                )
    with gr.Row():
        # NOTE(review): the accordion label's emoji was garbled in the source
        # ("π"); "📙" restored from the standard HF leaderboard template.
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
def update_leaderboard(filter_task_items, filter_skills_items, leaderboard_df=None):
    """Filter the leaderboard and recompute the Borda-count ranking.

    Args:
        filter_task_items: task names selected in the task checkbox group.
        filter_skills_items: skill names selected in the skill checkbox group.
        leaderboard_df: optional source frame with "model", "task", "skill"
            and "score" columns. Defaults to the module-level LEADERBOARD_DF,
            so existing callers (the Gradio event handlers) are unaffected.

    Returns:
        A wide DataFrame with one row per model: "model", "Borda Count"
        (rank 1 = best) and one score column per selected task, sorted by
        rank. Empty filter selections yield an empty two-column frame.
    """
    if leaderboard_df is None:
        leaderboard_df = LEADERBOARD_DF
    # Empty tasks/skills selection: nothing to rank.
    if not filter_task_items or not filter_skills_items:
        return pd.DataFrame([], columns=["model", "Borda Count"])
    filtered_df: pd.DataFrame = leaderboard_df.copy()
    filtered_df = filtered_df[filtered_df["task"].isin(filter_task_items)]
    # A row matches if any selected skill occurs in its "skill" field.
    filtered_df = filtered_df[filtered_df["skill"].apply(
        lambda x: any(skill in x for skill in filter_skills_items)
    )]
    # FIX: .copy() so the column assignments below write to a real frame,
    # not a view of the filtered slice (avoids SettingWithCopyWarning and
    # potentially lost writes).
    cols = ["model", "task", "score"]
    filtered_df = filtered_df[cols].copy()
    # Borda score: within each task, a model earns one point per model it
    # beats (rank within the task minus one); rows of other tasks rank NaN
    # and contribute 0.
    current_task_items = filtered_df["task"].unique().tolist()
    filtered_df["borda-score"] = 0
    for task in current_task_items:
        filtered_df["borda-score"] += (filtered_df["score"].where(filtered_df["task"] == task)
                                       .rank(ascending=True, method="max") - 1).fillna(0)
    # Wide format: one row per model with ("borda-score"|"score", task) columns.
    filtered_df = filtered_df.pivot(index="model", columns="task", values=["borda-score", "score"]).reset_index()
    filtered_df["borda-score-sum"] = filtered_df["borda-score"].sum(axis=1)
    # Rank 1 goes to the highest total Borda score.
    filtered_df["borda-count"] = filtered_df["borda-score-sum"].rank(ascending=False, method="min")
    # Keep only the model, the rank and the per-task score columns.
    filtered_df = filtered_df[["model", "borda-count", "score"]]
    # Flatten the MultiIndex header; pivot emits task columns in sorted
    # order, so the sorted task names line up with the underlying columns.
    filtered_df.columns = ["model", "borda-count"] + sorted(filtered_df.columns.droplevel(level=0)[2:].tolist())
    # Sort best model (rank 1) first.
    filtered_df = filtered_df.sort_values(by="borda-count", ascending=True)
    # Display names: rank symbol and shortened MultiHiertt task labels.
    filtered_df = filtered_df.rename(columns={
        "borda-count": "Borda Count",
        "MultiHiertt EASY": "MHiertt EASY",
        "MultiHiertt HARD": "MHiertt HARD",
    })
    # Round every score column; leave the model name and the rank untouched.
    for col in filtered_df.columns:
        if col not in ["model", "Borda Count"]:
            filtered_df[col] = filtered_df[col].round(2)
    return filtered_df
# Re-enter the Blocks context to wire the interactive behaviour.
with demo:
    inputs = [filter_task, filter_skills]
    outputs = [leaderboard_table]

    # Any change to either checkbox group refreshes the leaderboard table.
    for component in inputs:
        component.change(fn=update_leaderboard, inputs=inputs, outputs=outputs)

    # The "select all" buttons simply restore the full choice lists; the
    # resulting value change re-triggers update_leaderboard via .change above.
    select_all_tasks.click(lambda: filter_task_choices, inputs=[], outputs=[filter_task])
    select_all_skills.click(lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])

    # Populate the table once when the page first loads.
    gr.Blocks.load(block=demo, fn=update_leaderboard, inputs=inputs, outputs=outputs)

demo.queue().launch()