# aveni-bench / app.py
# Last commit 6c63009 by mateusz-aveni: "Add borda count instead of an average."
import gradio as gr
import pandas as pd
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
TITLE,
)
from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO
from src.populate import get_leaderboard_df
from src.utils import initialize_file
# Initialise the results file, then load the leaderboard from the TSV stored
# under EVAL_RESULTS_PATH.
initialize_file(project_repo=RESULTS_REPO, file_path=EVAL_RESULTS_PATH)
LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")
columns = LEADERBOARD_DF.columns.tolist()

# Root Gradio container; the UI is declared later inside `with demo:`.
demo = gr.Blocks(theme=gr.themes.Monochrome())

# Column choices: every leaderboard column except the ones that must always
# stay visible.
unselectable_columns = ["model"]
select_column_choices = columns[:]
for unselectable_column in unselectable_columns:
    select_column_choices.remove(unselectable_column)

# Filter options. Models and tasks are read from the data; the skill list is
# fixed.
filter_model_choices = LEADERBOARD_DF["model"].unique().tolist()
filter_task_choices = LEADERBOARD_DF["task"].unique().tolist()
filter_skill_choices = [
    "Dialogue",
    "Long Context",
    "Numerical Reasoning",
    "Question Answering",
    "Summarisation",
    "Tabular Reasoning",
]
# Declarative UI layout. Components created inside a context manager are
# attached to that container.
with demo:
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of AveniBench results - a permissively licensed benchmark that tests a group of six key "
        "finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, "
        "summarisation and dialogue.", elem_classes="markdown-text",
    )
    gr.Markdown("Open an issue or contact the Authors to include your model into the leaderboard.", elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # The tab label emoji was mis-encoded ("πŸ…"); restored to the medal.
        with gr.TabItem("🏅 AveniBench Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            # Task filter, with "select all" / "deselect all" shortcuts.
            with gr.Row():
                filter_task = gr.CheckboxGroup(
                    label="Select Tasks",
                    choices=filter_task_choices,
                    interactive=True,
                    value=filter_task_choices,
                    elem_id="filter_task",
                    scale=6
                )
                with gr.Column():
                    select_all_tasks = gr.Button(
                        value="Select all tasks",
                        elem_id="select-all-tasks",
                        size="sm",
                        scale=1
                    )
                    # ClearButton resets the components it is given, so it
                    # needs no explicit click handler.
                    deselect_all_tasks = gr.ClearButton(
                        filter_task,
                        value="Deselect all tasks",
                        elem_id="deselect-all-tasks",
                        size="sm",
                        scale=1
                    )

            # Skill filter, same layout as the task filter.
            with gr.Row():
                filter_skills = gr.CheckboxGroup(
                    label="Select Skills",
                    choices=filter_skill_choices,
                    value=filter_skill_choices,
                    interactive=True,
                    elem_id="filter-language",
                    scale=6
                )
                with gr.Column():
                    select_all_skills = gr.Button(
                        value="Select all skills",
                        elem_id="select-all-skills",
                        size="sm",
                        scale=1
                    )
                    deselect_all_skills = gr.ClearButton(
                        filter_skills,
                        value="Deselect all skills",
                        elem_id="deselect-all-skills",
                        size="sm",
                        scale=1
                    )

            # Read-only results table, seeded with the unfiltered leaderboard.
            with gr.Column():
                leaderboard_table = gr.Dataframe(
                    value=LEADERBOARD_DF,
                    interactive=False,
                    type="pandas",
                    visible=True,
                    label="Leaderboard",
                    elem_id="leaderboard-title",
                )

    with gr.Row():
        # The accordion label emoji was mis-encoded ("πŸ“™"); restored to the
        # book.
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
def update_leaderboard(filter_task_items, filter_skills_items, leaderboard_df=None):
    """Build the Borda-count leaderboard for the selected tasks and skills.

    Rows are kept when their ``task`` is one of ``filter_task_items`` and
    their ``skill`` value contains at least one of ``filter_skills_items``
    (tested with ``in``). Within each remaining task a model earns
    ``rank - 1`` points, where the rank is ascending by ``score`` and ties
    share the highest rank. Points are summed per model, and the models are
    ranked by that total (1 = best, ties share the best rank).

    Args:
        filter_task_items: Task names to include.
        filter_skills_items: Skill names to include.
        leaderboard_df: Optional long-format frame with ``model``, ``task``,
            ``skill`` and ``score`` columns. Defaults to the module-level
            ``LEADERBOARD_DF``. The frame is never modified.

    Returns:
        A DataFrame with a ``model`` column, a ``Borda Count`` column and one
        column per remaining task (scores rounded to two decimals, task
        columns in alphabetical order), sorted by ``Borda Count`` ascending.
        When nothing is selected, or no row survives the filters, an empty
        frame with only ``model`` and ``Borda Count`` columns is returned.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` if a (model, task) pair
            occurs more than once among the selected rows.
    """
    empty_result = pd.DataFrame([], columns=["model", "Borda Count"])

    # Empty tasks/skills set: nothing to rank.
    if not filter_task_items or not filter_skills_items:
        return empty_result

    source_df = LEADERBOARD_DF if leaderboard_df is None else leaderboard_df

    # Filter by task first, then by skill on the surviving rows only.
    by_task = source_df[source_df["task"].isin(filter_task_items)]
    skill_mask = pd.Series(
        [
            any(skill in row_skills for skill in filter_skills_items)
            for row_skills in by_task["skill"]
        ],
        index=by_task.index,
        dtype=bool,
    )
    # Explicit copy so the column added below never writes into a view of
    # the shared leaderboard frame.
    selected = by_task.loc[skill_mask, ["model", "task", "score"]].copy()

    # The chosen tasks may have none of the chosen skills; pivoting an empty
    # frame would fail, so return the empty table instead.
    if selected.empty:
        return empty_result

    # Per-task Borda points: the lowest score in a task gets 0; missing
    # scores get 0.
    selected["borda-score"] = (
        selected.groupby("task")["score"].rank(ascending=True, method="max") - 1
    ).fillna(0)

    wide = selected.pivot(index="model", columns="task", values=["borda-score", "score"])
    borda_total = wide["borda-score"].sum(axis=1)

    # Reorder the score columns together with their labels so a label can
    # never end up on another task's data.
    scores = wide["score"]
    scores = scores[sorted(scores.columns)].round(2)

    result = scores.rename(columns={
        "MultiHiertt EASY": "MHiertt EASY",
        "MultiHiertt HARD": "MHiertt HARD",
    })
    result.insert(0, "Borda Count", borda_total.rank(ascending=False, method="min"))
    result = result.reset_index()
    result.columns.name = None  # drop the "task" axis label left by pivot

    # Best (lowest) Borda rank first; a stable sort keeps tied models in a
    # deterministic order.
    return result.sort_values(by="Borda Count", ascending=True, kind="stable")
# NOTE(review): SOURCE has lost its indentation; the event registrations below
# presumably belong inside the `with demo:` block - confirm when re-indenting.
inputs = [filter_task, filter_skills]
outputs = [leaderboard_table]

# Recompute the table whenever either filter changes.
for component in inputs:
    component.change(fn=update_leaderboard, inputs=inputs, outputs=outputs)

# "Select all" buttons write the full choice list back into their filter.
select_all_tasks.click(lambda: filter_task_choices, inputs=[], outputs=[filter_task])
select_all_skills.click(lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])

# Run the same callback on page load, using the filters' initial values.
gr.Blocks.load(block=demo, fn=update_leaderboard, inputs=inputs, outputs=outputs)

demo.queue().launch()