# app.py — AveniBench leaderboard (Hugging Face Spaces entry point).
import gradio as gr | |
import pandas as pd | |
from src.about import ( | |
CITATION_BUTTON_LABEL, | |
CITATION_BUTTON_TEXT, | |
TITLE, | |
) | |
from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO | |
from src.populate import get_leaderboard_df | |
from src.utils import initialize_file | |
# Initialize the results file.
# NOTE(review): presumably fetches/refreshes results from the RESULTS_REPO
# dataset into EVAL_RESULTS_PATH — confirm against src/utils.py.
initialize_file(project_repo=RESULTS_REPO, file_path=EVAL_RESULTS_PATH)

# Get the raw leaderboard table.
# Assumes results.tsv yields at least "model", "task", "skill" and "score"
# columns — these are read by the filters and update_leaderboard below.
LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")
columns = LEADERBOARD_DF.columns.tolist()

# Top-level Gradio app container; populated in the `with demo:` block below.
demo = gr.Blocks(theme=gr.themes.Monochrome())
# Choices for the filters.
# Columns that must always be shown and therefore are not user-selectable.
unselectable_columns = ["model"]
# Comprehension instead of list.remove() in a loop: remove() raises
# ValueError if an expected column is absent from the results file.
select_column_choices = [col for col in columns if col not in unselectable_columns]

# Options for the filters, derived from the loaded results.
filter_model_choices = LEADERBOARD_DF["model"].unique().tolist()
filter_task_choices = LEADERBOARD_DF["task"].unique().tolist()
# Fixed skill taxonomy; each row's "skill" field is matched against these
# via substring membership in update_leaderboard.
filter_skill_choices = [
    "Dialogue",
    "Long Context",
    "Numerical Reasoning",
    "Question Answering",
    "Summarisation",
    "Tabular Reasoning",
]
with demo:
    # Page header and intro text.
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of AveniBench results - a permissively licensed benchmark that tests a group of six key "
        "finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, "
        "summarisation and dialogue.", elem_classes="markdown-text",
    )
    gr.Markdown("Open an issue or contact the Authors to include your model into the leaderboard.", elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("π AveniBench Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            # Task filter row: checkbox group plus select/deselect buttons.
            with gr.Row():
                filter_task = gr.CheckboxGroup(
                    label="Select Tasks",
                    choices=filter_task_choices,
                    interactive=True,
                    value=filter_task_choices,  # all tasks selected by default
                    elem_id="filter_task",
                    scale=6
                )
                with gr.Column():
                    select_all_tasks = gr.Button(
                        value="Select all tasks",
                        elem_id="select-all-tasks",
                        size="sm",
                        scale=1
                    )
                    # ClearButton resets the checkbox group to no selection,
                    # which also fires its change event and empties the table.
                    deselect_all_tasks = gr.ClearButton(
                        filter_task,
                        value="Deselect all tasks",
                        elem_id="deselect-all-tasks",
                        size="sm",
                        scale=1
                    )
            # Skill filter row, mirroring the task filter row above.
            with gr.Row():
                filter_skills = gr.CheckboxGroup(
                    label="Select Skills",
                    choices=filter_skill_choices,
                    value=filter_skill_choices,  # all skills selected by default
                    interactive=True,
                    # NOTE(review): elem_id says "language" — looks copied from
                    # another leaderboard; confirm before relying on it in CSS.
                    elem_id="filter-language",
                    scale=6
                )
                with gr.Column():
                    select_all_skills = gr.Button(
                        value="Select all skills",
                        elem_id="select-all-skills",
                        size="sm",
                        scale=1
                    )
                    deselect_all_skills = gr.ClearButton(
                        filter_skills,
                        value="Deselect all skills",
                        elem_id="deselect-all-skills",
                        size="sm",
                        scale=1
                    )
            # The leaderboard table itself; read-only, refreshed by the
            # filter callbacks wired up further below.
            with gr.Column():
                leaderboard_table = gr.Dataframe(
                    value=LEADERBOARD_DF,
                    interactive=False,
                    type="pandas",
                    visible=True,
                    label="Leaderboard",
                    elem_id="leaderboard-title",
                )
    # Collapsible citation block with a copy-to-clipboard button.
    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
def update_leaderboard(filter_task_items, filter_skills_items, df=None):
    """Recompute the leaderboard table for the selected tasks and skills.

    Args:
        filter_task_items: task names ticked in the "Select Tasks" filter.
        filter_skills_items: skill names ticked in the "Select Skills" filter.
        df: optional source DataFrame with "model", "task", "skill" and
            "score" columns; defaults to the module-level LEADERBOARD_DF.

    Returns:
        A DataFrame with one row per model, a "Borda Count" ranking column
        (1 = best) and one rounded score column per selected task, sorted
        best-first.
    """
    source = LEADERBOARD_DF if df is None else df

    # Nothing selected: show an empty table instead of pivoting nothing.
    if not filter_task_items or not filter_skills_items:
        return pd.DataFrame([], columns=["model", "Borda Count"])

    filtered = source.copy()
    filtered = filtered[filtered["task"].isin(filter_task_items)]
    # A row matches when it covers at least one of the selected skills
    # (substring match against the row's "skill" field).
    filtered = filtered[filtered["skill"].apply(
        lambda row_skills: any(skill in row_skills for skill in filter_skills_items)
    )]
    # Non-empty filters can still be mutually exclusive (a task whose rows
    # match none of the selected skills); pivot() would raise on empty input.
    if filtered.empty:
        return pd.DataFrame([], columns=["model", "Borda Count"])

    filtered = filtered[["model", "task", "score"]].copy()

    # Borda score: within each task a model earns one point per model it
    # outscores (1-based rank with ties taking the max rank, minus one).
    filtered["borda-score"] = 0.0
    for task in filtered["task"].unique():
        task_scores = filtered["score"].where(filtered["task"] == task)
        filtered["borda-score"] += (task_scores.rank(ascending=True, method="max") - 1).fillna(0)

    # One row per model, one column pair (borda-score, score) per task.
    wide = filtered.pivot(index="model", columns="task", values=["borda-score", "score"]).reset_index()
    wide["borda-score-sum"] = wide["borda-score"].sum(axis=1)
    # Final ranking: highest total Borda score gets rank 1.
    wide["borda-count"] = wide["borda-score-sum"].rank(ascending=False, method="min")

    # Keep model, the rank and the raw per-task scores, then flatten the
    # MultiIndex columns. sorted() stays aligned with the data because
    # pivot() emits its task columns in sorted order.
    wide = wide[["model", "borda-count", "score"]]
    task_labels = sorted(wide.columns.droplevel(level=0)[2:].tolist())
    wide.columns = ["model", "borda-count"] + task_labels

    # Sort best-first by Borda count.
    wide = wide.sort_values(by="borda-count", ascending=True)
    # Display names: pretty rank header plus abbreviated task names.
    wide = wide.rename(columns={
        "borda-count": "Borda Count",
        "MultiHiertt EASY": "MHiertt EASY",
        "MultiHiertt HARD": "MHiertt HARD",
    })
    # Round per-task score columns for display.
    for col in wide.columns:
        if col not in ["model", "Borda Count"]:
            wide[col] = wide[col].round(2)
    return wide
# Re-enter the Blocks context so event listeners attach to `demo`.
with demo:
    inputs = [filter_task, filter_skills]
    outputs = [leaderboard_table]
    # Re-filter the table whenever either checkbox group changes
    # (including the change fired by the ClearButtons).
    for component in inputs:
        component.change(
            fn=update_leaderboard,
            inputs=inputs,
            outputs=outputs
        )
    # "Select all" buttons restore the full choice lists; the resulting
    # change events refresh the table via the listeners above.
    select_all_tasks.click(lambda: filter_task_choices, inputs=[], outputs=[filter_task])
    select_all_skills.click(lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])
    # Populate the table on initial page load. Uses the supported instance
    # method — the gr.Blocks.load(block=...) classmethod form is deprecated.
    demo.load(
        fn=update_leaderboard,
        inputs=inputs,
        outputs=outputs
    )

demo.queue().launch()