import gradio as gr
import pandas as pd

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    TITLE,
)
from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO
from src.populate import get_leaderboard_df
from src.utils import initialize_file

# Initialize the results file
initialize_file(project_repo=RESULTS_REPO, file_path=EVAL_RESULTS_PATH)

# Get leaderboard
LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")
columns = LEADERBOARD_DF.columns.tolist()

demo = gr.Blocks(theme=gr.themes.Monochrome())

# Choices for the filters
unselectable_columns = ["model"]
select_column_choices = list(columns)
for unselectable_column in unselectable_columns:
    select_column_choices.remove(unselectable_column)

# Option for the filters
filter_model_choices = LEADERBOARD_DF["model"].unique().tolist()
filter_task_choices = LEADERBOARD_DF["task"].unique().tolist()
filter_skill_choices = [
    "Dialogue",
    "Long Context",
    "Numerical Reasoning",
    "Question Answering",
    "Summarisation",
    "Tabular Reasoning",
]


def _empty_leaderboard() -> pd.DataFrame:
    """Return the placeholder table shown when no rows match the filters."""
    return pd.DataFrame([], columns=["model", "Borda Count"])


def update_leaderboard(filter_task_items, filter_skills_items) -> pd.DataFrame:
    """Rebuild the leaderboard table for the selected tasks and skills.

    Rows of ``LEADERBOARD_DF`` are kept when their ``task`` is one of
    ``filter_task_items`` and at least one entry of ``filter_skills_items``
    is contained in their ``skill`` value. Within each remaining task every
    row receives ``rank - 1`` points, where ``rank`` is the ascending rank of
    its ``score`` (ties take the highest rank of the group; missing scores
    receive 0 points). Points are summed per model and the models are ranked
    by that sum in descending order to obtain the Borda count (1 = best).

    Args:
        filter_task_items: Task names selected in the UI.
        filter_skills_items: Skill names selected in the UI.

    Returns:
        A DataFrame with a ``model`` column, a ``Borda Count`` column and one
        score column per remaining task (rounded to 2 decimals), sorted by
        Borda count. An empty ``model`` / ``Borda Count`` frame is returned
        when either selection is empty or no row matches both filters.
    """
    # Empty tasks/skills set:
    if not filter_task_items or not filter_skills_items:
        return _empty_leaderboard()

    filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["task"].isin(filter_task_items)]
    if filtered_df.empty:
        return _empty_leaderboard()

    # NOTE(review): `skill in x` assumes every "skill" cell is a string or a
    # container of skill names - confirm against the results file.
    skill_mask = filtered_df["skill"].apply(
        lambda x: any(skill in x for skill in filter_skills_items)
    )
    # Explicit copy so the column assignment below never targets a view.
    filtered_df = filtered_df.loc[skill_mask, ["model", "task", "score"]].copy()
    if filtered_df.empty:
        # The chosen tasks and skills do not overlap; pivoting an empty frame
        # would fail further down, so return the placeholder instead.
        return _empty_leaderboard()

    # Calculate borda count: per-task points are the within-task rank minus 1.
    filtered_df["borda-score"] = (
        filtered_df.groupby("task")["score"].rank(ascending=True, method="max") - 1
    ).fillna(0)

    filtered_df = filtered_df.pivot(
        index="model", columns="task", values=["borda-score", "score"]
    ).reset_index()
    filtered_df["borda-score-sum"] = filtered_df["borda-score"].sum(axis=1)
    filtered_df["borda-count"] = filtered_df["borda-score-sum"].rank(
        ascending=False, method="min"
    )

    # Reorder columns
    filtered_df = filtered_df[["model", "borda-count", "score"]]
    # Flatten the two-level header. Task labels are read in the order of the
    # data columns so that a label can never be attached to the wrong column.
    task_labels = filtered_df.columns.get_level_values(1)[2:].tolist()
    filtered_df.columns = ["model", "borda-count"] + task_labels

    # Sort by borda count
    filtered_df = filtered_df.sort_values(by="borda-count", ascending=True)

    # Rename borda count and shorten the long task names
    filtered_df = filtered_df.rename(columns={
        "borda-count": "Borda Count",
        "MultiHiertt EASY": "MHiertt EASY",
        "MultiHiertt HARD": "MHiertt HARD",
    })

    # Round values
    score_columns = [
        col for col in filtered_df.columns if col not in ["model", "Borda Count"]
    ]
    filtered_df[score_columns] = filtered_df[score_columns].round(2)

    return filtered_df


with demo:
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of AveniBench results - a permissively licensed benchmark that tests a group of six key "
        "finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, "
        "summarisation and dialogue.",
        elem_classes="markdown-text",
    )
    gr.Markdown(
        "Open an issue or contact the Authors to include your model into the leaderboard.",
        elem_classes="markdown-text",
    )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 AveniBench Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                filter_task = gr.CheckboxGroup(
                    label="Select Tasks",
                    choices=filter_task_choices,
                    interactive=True,
                    value=filter_task_choices,
                    elem_id="filter_task",
                    scale=6,
                )
                with gr.Column():
                    select_all_tasks = gr.Button(
                        value="Select all tasks",
                        elem_id="select-all-tasks",
                        size="sm",
                        scale=1,
                    )
                    deselect_all_tasks = gr.ClearButton(
                        filter_task,
                        value="Deselect all tasks",
                        elem_id="deselect-all-tasks",
                        size="sm",
                        scale=1,
                    )

            with gr.Row():
                filter_skills = gr.CheckboxGroup(
                    label="Select Skills",
                    choices=filter_skill_choices,
                    value=filter_skill_choices,
                    interactive=True,
                    elem_id="filter-language",
                    scale=6,
                )
                with gr.Column():
                    select_all_skills = gr.Button(
                        value="Select all skills",
                        elem_id="select-all-skills",
                        size="sm",
                        scale=1,
                    )
                    deselect_all_skills = gr.ClearButton(
                        filter_skills,
                        value="Deselect all skills",
                        elem_id="deselect-all-skills",
                        size="sm",
                        scale=1,
                    )

            with gr.Column():
                leaderboard_table = gr.Dataframe(
                    value=LEADERBOARD_DF,
                    interactive=False,
                    type="pandas",
                    visible=True,
                    label="Leaderboard",
                    elem_id="leaderboard-title",
                )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Recompute the table whenever either filter changes.
    inputs = [filter_task, filter_skills]
    outputs = [leaderboard_table]
    for component in inputs:
        component.change(fn=update_leaderboard, inputs=inputs, outputs=outputs)

    select_all_tasks.click(lambda: filter_task_choices, inputs=[], outputs=[filter_task])
    select_all_skills.click(lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])

    # Replace the raw results shown initially with the ranked table on page load.
    demo.load(fn=update_leaderboard, inputs=inputs, outputs=outputs)

demo.queue().launch()