Spaces:
Running
Running
import os | |
import gradio as gr | |
import pandas as pd | |
from apscheduler.schedulers.background import BackgroundScheduler | |
from huggingface_hub import HfApi | |
from src.about import CITATION_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE | |
from src.populate import ( | |
MU_MATH_COLUMNS_DICT, | |
U_MATH_COLUMNS_DICT, | |
U_MATH_AND_MU_MATH_COLUMNS_DICT, | |
Field, | |
get_mu_math_leaderboard_df, | |
get_u_math_leaderboard_df, | |
get_joined_leaderboard_df, | |
) | |
def restart_space() -> None:
    """Ask the Hugging Face Hub to restart the ``toloka/u-math-leaderboard`` Space.

    The access token is read from the ``HF_TOKEN`` environment variable (it
    should be a read/write token for the organisation). ``os.environ.get``
    yields ``None`` when the variable is unset, and that value is passed to
    ``HfApi`` unchanged.
    """
    hub_client = HfApi(token=os.environ.get("HF_TOKEN"))
    hub_client.restart_space(repo_id="toloka/u-math-leaderboard")
# Leaderboard tables, built once when the module is loaded. The three loaders
# come from ``src.populate``; each result feeds one tab of the UI.
LEADERBOARD_U_MATH_DF = get_u_math_leaderboard_df()  # U-MATH tab
LEADERBOARD_MU_MATH_DF = get_mu_math_leaderboard_df()  # mu-MATH tab
LEADERBOARD_U_MATH_MU_MATH_JOINED_DF = get_joined_leaderboard_df()  # combined tab
def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) -> gr.components.Component:
    """Build the interactive leaderboard widget for one benchmark table.

    Lays out a column picker, one button per column tag, a model-name search
    box and three row filters (type, size, family) above a table, then wires
    their events. Every callback derives its result from a hidden copy of the
    full table, so a filter replaces the previous one rather than stacking on
    top of it.

    Event listeners are attached here, so call this while a ``gr.Blocks``
    context is open.

    Args:
        dataframe: Full leaderboard table. It is indexed by the ``pretty_name``
            of the fields in ``columns_dict``, so its column labels must match.
        columns_dict: Column descriptors keyed by an internal column id. The
            ids ``"model_name"``, ``"model_family"``, ``"model_type_symbol"``
            and ``"model_size_symbol"`` are looked up by the callbacks; each
            field is read for ``pretty_name``, ``never_hidden``,
            ``fully_hidden``, ``displayed_by_default``, ``tags`` and
            ``gradio_column_type``.

    Returns:
        The outer ``gr.Column`` that contains the controls and the table.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows/columns.
    """
    import re  # function-scope import: only the search fallback needs it

    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Column groups, derived once instead of inside every callback.
    fields = list(columns_dict.values())
    always_here_cols = [f.pretty_name for f in fields if f.never_hidden]
    toggleable_cols = [f.pretty_name for f in fields if not f.fully_hidden and not f.never_hidden]

    def keep_columns(full_df: pd.DataFrame, wanted: list[str]) -> pd.DataFrame:
        """Return ``full_df`` limited to the always-visible and ``wanted`` columns, in table order."""
        allowed = set(always_here_cols) | set(wanted)
        return full_df[[c for c in full_df.columns if c in allowed]]

    def is_unfiltered(filter_name: str) -> bool:
        """Tell whether a dropdown value means "no row filter" (empty or ``"All"``)."""
        return not filter_name or filter_name == "All"

    def filter_rows_by_value(
        full_df: pd.DataFrame, current_df: pd.DataFrame, column_key: str, wanted: str
    ) -> pd.DataFrame:
        """Keep rows whose ``column_key`` column equals ``wanted``, shown with ``current_df``'s columns."""
        column = columns_dict[column_key].pretty_name
        return full_df[full_df[column] == wanted][current_df.columns]

    def filter_dataframe_by_selected_columns(full_df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
        """Show the ticked columns plus the never-hidden ones."""
        return keep_columns(full_df, columns)

    def filter_dataframe_by_selected_tag_columns(
        full_df: pd.DataFrame, current_tag: str
    ) -> tuple[pd.DataFrame, list[str], str, str]:
        """Show the columns carrying ``current_tag`` and reset the type and size filters.

        Returns the narrowed table, the matching checkbox selection, and
        ``"All"`` for the model-type and model-size dropdowns.
        """
        tagged = [f.pretty_name for f in fields if current_tag in f.tags]
        filtered_df = keep_columns(full_df, tagged)
        ticked = [c for c in toggleable_cols if c in filtered_df.columns]
        return filtered_df, ticked, "All", "All"

    def filter_dataframe_by_search(full_df: pd.DataFrame, current_df: pd.DataFrame, search: str) -> pd.DataFrame:
        """Keep rows whose model name contains ``search`` (case-insensitive)."""
        names = full_df[columns_dict["model_name"].pretty_name]
        pattern = search or ""
        try:
            mask = names.str.contains(pattern, case=False, na=False)
        except re.error:
            # The text is not a valid regular expression (e.g. "c++" or "("):
            # match it literally instead of failing the callback.
            mask = names.str.contains(pattern, case=False, na=False, regex=False)
        return full_df[mask][current_df.columns]

    def filter_dataframe_by_model_type(
        full_df: pd.DataFrame, current_df: pd.DataFrame, filter_name: str
    ) -> pd.DataFrame:
        """Keep rows whose type symbol equals the first character of the chosen label."""
        if is_unfiltered(filter_name):
            return full_df[current_df.columns]
        return filter_rows_by_value(full_df, current_df, "model_type_symbol", filter_name[0])

    def filter_dataframe_by_model_size(
        full_df: pd.DataFrame, current_df: pd.DataFrame, filter_name: str
    ) -> pd.DataFrame:
        """Keep rows whose size symbol equals the first character of the chosen label."""
        if is_unfiltered(filter_name):
            return full_df[current_df.columns]
        return filter_rows_by_value(full_df, current_df, "model_size_symbol", filter_name[0])

    def filter_dataframe_by_model_family(
        full_df: pd.DataFrame, current_df: pd.DataFrame, filter_name: str,
    ) -> pd.DataFrame:
        """Keep rows whose family equals the chosen dropdown value."""
        if is_unfiltered(filter_name):
            return full_df[current_df.columns]
        return filter_rows_by_value(full_df, current_df, "model_family", filter_name)

    # NOTE(review): the original indentation was lost in this copy of the file;
    # the nesting of the layout containers below is reconstructed - confirm it
    # against the deployed layout.
    with gr.Column(scale=1) as col:
        with gr.Row():
            with gr.Column(scale=8):
                with gr.Accordion("β‘οΈ See All Columns", open=False):
                    all_columns_selector = gr.CheckboxGroup(
                        choices=toggleable_cols,
                        value=[
                            f.pretty_name
                            for f in fields
                            if f.pretty_name in toggleable_cols and f.displayed_by_default
                        ],
                        label="Select Columns to Display:",
                        interactive=True,
                        container=False,
                    )
                with gr.Column(variant="panel"):
                    gr.Markdown("Visible Columns:", elem_id="visible-columns-label")
                    # One button per distinct tag, in order of first appearance.
                    all_tags = {}
                    with gr.Row():
                        for f in fields:
                            for tag in f.tags:
                                if tag not in all_tags:
                                    all_tags[tag] = gr.Button(
                                        tag, interactive=True, size="sm", variant="secondary", min_width=50
                                    )
            with gr.Column(scale=8):
                with gr.Row():
                    search_bar = gr.Textbox(
                        placeholder="π Search for your model and press ENTER...",
                        show_label=False,
                        elem_id="search-bar",
                    )
                with gr.Row():
                    # NOTE(review): the emoji prefixes of these labels look
                    # mis-encoded in this copy of the file. The type and size
                    # filters compare the label's first character with the
                    # symbol column, so restore the original glyphs before
                    # relying on them.
                    model_type_filter_selector = gr.Dropdown(
                        label="Filter model types:",
                        choices=["All", "π Open-Weights", "π₯ Proprietary"],
                        value="All",
                        elem_id="model-type-filter",
                        interactive=True,
                        multiselect=False,
                        min_width=120,
                    )
                    model_size_filter_selector = gr.Dropdown(
                        label="Filter model sizes:",
                        choices=["All", "π΄ Tiny (<5B)", "π Small (5-50B)", "π Medium (50-100B)", "π Large (>100B)"],
                        value="All",
                        elem_id="model-size-filter",
                        interactive=True,
                        multiselect=False,
                        min_width=120,
                    )
                    # Missing family values are dropped: they could never match
                    # a row through the equality filter.
                    family_column = columns_dict["model_family"].pretty_name
                    model_family_filter_selector = gr.Dropdown(
                        label="Filter model families:",
                        choices=["All"] + list(dataframe[family_column].dropna().unique()),
                        value="All",
                        elem_id="model-family-filter",
                        interactive=True,
                        multiselect=False,
                        min_width=120,
                    )

        # Hidden copy of the full table: every callback filters from this one.
        hidden_leaderboard_df = gr.components.Dataframe(
            value=dataframe,
            datatype=[f.gradio_column_type for f in fields],
            visible=False,
            interactive=False,
        )
        # Visible table, initially limited to the default columns.
        # NOTE(review): datatype has one entry per field in columns_dict while
        # value holds only the default-visible columns, so the two can differ
        # in length - confirm Gradio maps the types as intended.
        leaderboard_df = gr.components.Dataframe(
            value=dataframe[[f.pretty_name for f in fields if f.displayed_by_default]],
            datatype=[f.gradio_column_type for f in fields],
            elem_id="leaderboard-df",
            interactive=False,
        )

        # Event wiring.
        all_columns_selector.change(
            fn=filter_dataframe_by_selected_columns,
            inputs=[hidden_leaderboard_df, all_columns_selector],
            outputs=[leaderboard_df],
        )
        search_bar.submit(
            fn=filter_dataframe_by_search,
            inputs=[hidden_leaderboard_df, leaderboard_df, search_bar],
            outputs=[leaderboard_df],
        )
        model_type_filter_selector.change(
            fn=filter_dataframe_by_model_type,
            inputs=[hidden_leaderboard_df, leaderboard_df, model_type_filter_selector],
            outputs=[leaderboard_df],
        )
        model_size_filter_selector.change(
            fn=filter_dataframe_by_model_size,
            inputs=[hidden_leaderboard_df, leaderboard_df, model_size_filter_selector],
            outputs=[leaderboard_df],
        )
        model_family_filter_selector.change(
            fn=filter_dataframe_by_model_family,
            inputs=[hidden_leaderboard_df, leaderboard_df, model_family_filter_selector],
            outputs=[leaderboard_df],
        )
        # A tag button is passed as its own input, so the callback receives
        # the button's label, i.e. the tag.
        for button in all_tags.values():
            button.click(
                fn=filter_dataframe_by_selected_tag_columns,
                inputs=[hidden_leaderboard_df, button],
                outputs=[leaderboard_df, all_columns_selector, model_type_filter_selector, model_size_filter_selector],
            )

    return col
# Assemble the app: title and introduction, then one tab per leaderboard and an
# About tab. The CSS rule sizes the scatter plots that are currently disabled.
# NOTE(review): the original indentation was lost in this copy of the file; the
# nesting below (in particular where the citation box sits) is reconstructed -
# confirm it against the deployed layout.
# NOTE(review): several labels contain mis-encoded characters (e.g. "ΞΌ",
# presumably the Greek letter mu); they are kept exactly as found.
with gr.Blocks(css=".scatter-plot {height: 500px;}") as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Dump the two single-benchmark tables to stdout at startup.
    print(LEADERBOARD_U_MATH_DF)
    print(LEADERBOARD_MU_MATH_DF)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("π U-MATH", elem_id="u-math-benchmark-tab-table", id=0):
            leaderboard_umath = init_leaderboard(LEADERBOARD_U_MATH_DF, U_MATH_COLUMNS_DICT)
            # Disabled scatter plot, kept for reference:
            # gr.ScatterPlot(
            #     value=LEADERBOARD_U_MATH_DF,
            #     title="U-MATH: Text vs Visual Accuracy",
            #     x=U_MATH_COLUMNS_DICT["u_math_text_acc"].pretty_name,
            #     y=U_MATH_COLUMNS_DICT["u_math_visual_acc"].pretty_name,
            #     color=U_MATH_COLUMNS_DICT["model_family"].pretty_name,
            #     tooltip=[U_MATH_COLUMNS_DICT["full_model_name"].pretty_name, U_MATH_COLUMNS_DICT["u_math_acc"].pretty_name],
            #     elem_classes="scatter-plot",
            #     height=500,
            # )

        with gr.TabItem("π ΞΌ-MATH (Meta-Benchmark)", elem_id="mu-math-benchmark-tab-table", id=1):
            leaderboard_mumath = init_leaderboard(LEADERBOARD_MU_MATH_DF, MU_MATH_COLUMNS_DICT)
            # Disabled scatter plot, kept for reference:
            # gr.ScatterPlot(
            #     value=LEADERBOARD_MU_MATH_DF,
            #     title="ΞΌ-MATH: True Positive Rate (Recall) vs True Negative Rate (Specificity)",
            #     x=MU_MATH_COLUMNS_DICT["mu_math_tpr"].pretty_name,
            #     y=MU_MATH_COLUMNS_DICT["mu_math_tnr"].pretty_name,
            #     color=MU_MATH_COLUMNS_DICT["model_family"].pretty_name,
            #     tooltip=[MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name, MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name],
            #     elem_classes="scatter-plot",
            #     height=500,
            # )

        with gr.TabItem("π U-MATH vs ΞΌ-MATH", elem_id="u-math-vs-mu-math-tab-table", id=2):
            leaderboard_aggregated = init_leaderboard(
                LEADERBOARD_U_MATH_MU_MATH_JOINED_DF, U_MATH_AND_MU_MATH_COLUMNS_DICT
            )
            # Disabled scatter plot, kept for reference:
            # gr.ScatterPlot(
            #     value=LEADERBOARD_U_MATH_MU_MATH_JOINED_DF,
            #     title="U-MATH Accuracy (Solving) vs ΞΌ-MATH F1 Score (Judging)",
            #     x=U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_acc"].pretty_name,
            #     y=U_MATH_AND_MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name,
            #     color=U_MATH_AND_MU_MATH_COLUMNS_DICT["model_family"].pretty_name,
            #     tooltip=[
            #         U_MATH_AND_MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name,
            #         U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_text_acc"].pretty_name,
            #         U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_visual_acc"].pretty_name,
            #     ],
            #     elem_classes="scatter-plot",
            #     height=500,
            # )

        with gr.TabItem("π About", elem_id="about-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
            citation_button = gr.Textbox(
                value=CITATION_TEXT,
                label="π Citation",
                lines=9,
                elem_id="citation-button",
                show_copy_button=True,
                container=True,
            )

# Call restart_space once every hour (60 * 60 seconds) from a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=60 * 60)
scheduler.start()

# Alternative launch, kept for reference:
# demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
demo.queue(default_concurrency_limit=40).launch(allowed_paths=[".cache"])