# MEDIC-Benchmark / app.py
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import time
import functools
import gc
import os
from src.about import (
    CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT_1, LLM_BENCHMARKS_TEXT_2, CROSS_EVALUATION_METRICS,
    NOTE_GENERATION_METRICS, HEALTHBENCH_METRICS, TITLE, LOGO, FIVE_PILLAR_DIAGRAM
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    DATASET_BENCHMARK_COLS, OPEN_ENDED_BENCHMARK_COLS, MED_SAFETY_BENCHMARK_COLS,
    MEDICAL_SUMMARIZATION_BENCHMARK_COLS, ACI_BENCHMARK_COLS, SOAP_BENCHMARK_COLS,
    HEALTHBENCH_BENCHMARK_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, DATASET_COLS,
    OPEN_ENDED_COLS, MED_SAFETY_COLS, MEDICAL_SUMMARIZATION_COLS, ACI_COLS, SOAP_COLS,
    HEALTHBENCH_COLS, HEALTHBENCH_HARD_COLS, EVAL_COLS, EVAL_TYPES, NUMERIC_INTERVALS,
    TYPES, AutoEvalColumn, ModelType, Precision, WeightType, fields, render_generation_templates,
    OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, OpenEndedFrench_COLS,
    OpenEndedFrench_BENCHMARK_COLS, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS,
    OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, OpenEndedGreek_COLS,
    OpenEndedGreek_BENCHMARK_COLS, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS,
    ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
# =====================================================================================
# 1. SETUP AND DATA LOADING
# =====================================================================================
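# restart_space() is reused in two places: as the fallback when the snapshot download below
# fails, and by the daily BackgroundScheduler job at the bottom of this file.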
def restart_space():
    API.restart_space(repo_id=REPO_ID)

print("Downloading evaluation data...")
try:
    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
    print("Downloads complete.")
except Exception as e:
    print(f"An error occurred during download: {e}")
    restart_space()
print("Loading all dataframes into a central dictionary...")
start_time = time.time()
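# Only the second element of each get_leaderboard_df() result is kept; the first return value
# is discarded immediately so no extra per-subset copy stays in memory.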
_, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
_, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
_, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
_, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
_, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
_, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
_, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
_, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
_, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
_, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
_, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
_, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
_, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
_, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
_, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
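# Central registry of every leaderboard table, keyed by subset name. get_filtered_table()
# looks subsets up here instead of each tab holding its own copy of the dataframe.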
ALL_DATASETS = {
    "datasets": harness_datasets_original_df,
    "open_ended": open_ended_original_df,
    "med_safety": med_safety_original_df,
    "medical_summarization": medical_summarization_original_df,
    "aci": aci_original_df,
    "soap": soap_original_df,
    "healthbench": healthbench_original_df,
    "healthbench_hard": healthbench_hard_original_df,
    "open_ended_arabic": open_ended_arabic_df,
    "open_ended_french": open_ended_french_df,
    "open_ended_portuguese": open_ended_portuguese_df,
    "open_ended_romanian": open_ended_romanian_df,
    "open_ended_greek": open_ended_greek_df,
    "open_ended_spanish": open_ended_spanish_df,
    "closed_ended_multilingual": closed_ended_multilingual_df,
}
end_time = time.time()
print(f"Dataframes loaded in {end_time - start_time:.2f} seconds.")
# Evaluation Queue DataFrames
(finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
# =====================================================================================
# 2. EFFICIENT FILTERING LOGIC
# =====================================================================================
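# The helpers below narrow a subset's dataframe by model-name search, domain specificity,
# parameter-count bucket, and (optionally) precision, before the visible columns are selected.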
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]

def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    final_df = []
    if query != "":
        queries = [q.strip() for q in query.split(";")]
        for _q in queries:
            _q = _q.strip()
            if _q != "":
                temp_filtered_df = search_table(filtered_df, _q)
                if len(temp_filtered_df) > 0:
                    final_df.append(temp_filtered_df)
        if len(final_df) > 0:
            filtered_df = pd.concat(final_df)
            filtered_df = filtered_df.drop_duplicates(
                subset=[
                    AutoEvalColumn.model.name,
                ]
            )
    return filtered_df
def filter_models(
    df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
) -> pd.DataFrame:
    filtered_df = df
    if type_query is not None:
        type_name = [t.split(" ")[1] for t in type_query]
        filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type.name].isin(type_name)]
    if domain_specific_query is not None:
        domain_specifics = []
        if "🏥 Clinical models" in domain_specific_query:
            domain_specifics.append(True)
        if "Generic models" in domain_specific_query:
            domain_specifics.append(False)
        filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
    if precision_query is not None:
        if AutoEvalColumn.precision.name in df.columns:
            filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
    if size_query is not None:
        numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
        params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
        mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
        filtered_df = filtered_df.loc[mask]
    return filtered_df
def get_filtered_table(
    shown_columns: list,
    query: str,
    domain_specific_query: list,
    size_query: list,
    *,  # force subset_name to be a keyword-only argument
    subset_name: str
):
    original_df = ALL_DATASETS[subset_name]
    type_query = None
    filtered_df = filter_models(original_df, type_query, domain_specific_query, size_query, None, False)
    filtered_df = filter_queries(query, filtered_df)
    always_here_cols = [AutoEvalColumn.model.name]
    available_cols = [c for c in shown_columns if c in filtered_df.columns]
    final_df = filtered_df[always_here_cols + available_cols]
    del filtered_df
    gc.collect()
    return final_df
# =====================================================================================
# 3. REUSABLE UI CREATION FUNCTION
# =====================================================================================
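# One leaderboard block per tab: functools.partial binds subset_name so the same callback
# signature serves every tab's search box and filter listeners.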
def create_leaderboard_ui(subset_name: str, column_choices: list, default_columns: list):
    """Creates a full leaderboard UI block for a given subset."""
    with gr.Row():
        with gr.Column():
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder="🔍 Search for models...",
                    show_label=False,
                    elem_id=f"search-bar-{subset_name}",
                )
            with gr.Row():
                shown_columns = gr.CheckboxGroup(
                    choices=column_choices,
                    value=default_columns,
                    label="Select columns to show",
                    elem_id=f"column-select-{subset_name}",
                    interactive=True,
                )
        with gr.Column(min_width=320):
            filter_domain_specific = gr.CheckboxGroup(
                label="Domain Specificity",
                choices=["🏥 Clinical models", "Generic models"],
                value=["🏥 Clinical models", "Generic models"],
                interactive=True,
                elem_id=f"filter-domain-{subset_name}",
            )
            filter_columns_size = gr.CheckboxGroup(
                label="Model sizes (in billions of parameters)",
                choices=list(NUMERIC_INTERVALS.keys()),
                value=list(NUMERIC_INTERVALS.keys()),
                interactive=True,
                elem_id=f"filter-size-{subset_name}",
            )

    update_fn = functools.partial(get_filtered_table, subset_name=subset_name)

    initial_df = update_fn(
        shown_columns=default_columns,
        query="",
        domain_specific_query=["🏥 Clinical models", "Generic models"],
        size_query=list(NUMERIC_INTERVALS.keys()),
    )

    leaderboard_table = gr.Dataframe(
        value=initial_df,
        headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + default_columns,
        datatype=TYPES,
        elem_id=f"leaderboard-table-{subset_name}",
        interactive=False,
    )

    inputs = [shown_columns, search_bar, filter_domain_specific, filter_columns_size]

    # Attach listeners to all input components
    for component in inputs:
        if isinstance(component, gr.Textbox):
            component.submit(update_fn, inputs, leaderboard_table)
        else:
            component.change(update_fn, inputs, leaderboard_table)

    return leaderboard_table
# =====================================================================================
# 4. GRADIO DEMO UI (Main application layout)
# =====================================================================================
demo = gr.Blocks(css=custom_css)
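# Page layout: logo and intro text, one Tabs container holding the benchmark leaderboards,
# the About and submission tabs, then a citation accordion at the bottom.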
with demo:
    gr.HTML(LOGO)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
            with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
                LANGUAGES = {
                    "🇺🇸 English": "open_ended", "🇦🇪 Arabic": "open_ended_arabic",
                    "🇫🇷 French": "open_ended_french", "🇪🇸 Spanish": "open_ended_spanish",
                    "🇵🇹 Portuguese": "open_ended_portuguese", "🇷🇴 Romanian": "open_ended_romanian",
                    "🇬🇷 Greek": "open_ended_greek",
                }
                for idx, (label, subset) in enumerate(LANGUAGES.items()):
                    with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
                        judge_text = (
                            "**Note:** Llama 3.1 70B Instruct has been used as judge for English."
                            if label == "🇺🇸 English"
                            else "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
                        )
                        gr.Markdown(judge_text, elem_classes="markdown-text")
                        create_leaderboard_ui(
                            subset_name=subset,
                            column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
                            default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
                        )
                        with gr.Accordion("💬 Generation templates", open=False):
                            with gr.Accordion("Response generation", open=False):
                                render_generation_templates(task="open_ended", generation_type="response_generation")
                            with gr.Accordion("Scoring Rubric", open=False):
                                render_generation_templates(task="open_ended", generation_type="scoring_rubric")

        with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
            create_leaderboard_ui(
                subset_name="medical_summarization",
                column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
                default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
            )
            with gr.Accordion("💬 Generation templates", open=False):
                with gr.Accordion("Response generation", open=False):
                    render_generation_templates(task="medical_summarization", generation_type="response_generation")
                with gr.Accordion("Question generation", open=False):
                    render_generation_templates(task="ce", generation_type="question_generation")
                with gr.Accordion("Cross Examination", open=False):
                    render_generation_templates(task="ce", generation_type="cross_examination")

        with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("ACI Bench", id=0):
                    create_leaderboard_ui(
                        subset_name="aci",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
                    )
                with gr.TabItem("SOAP Notes", id=1):
                    create_leaderboard_ui(
                        subset_name="soap",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
                    )
            # Add accordions for this section if needed, similar to other tabs

        with gr.TabItem("🏅 HealthBench", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("HealthBench", id=0):
                    create_leaderboard_ui(
                        subset_name="healthbench",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
                    )
                with gr.TabItem("HealthBench-Hard", id=1):
                    create_leaderboard_ui(
                        subset_name="healthbench_hard",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
                    )

        with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=5):
            create_leaderboard_ui(
                subset_name="med_safety",
                column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
                default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
            )
            with gr.Accordion("💬 Generation templates", open=False):
                with gr.Accordion("Response generation", open=False):
                    render_generation_templates(task="med_safety", generation_type="response_generation")
                with gr.Accordion("Scoring Rubric", open=False):
                    render_generation_templates(task="med_safety", generation_type="scoring_rubric")

        with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("English", id=0):
                    create_leaderboard_ui(
                        subset_name="datasets",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
                    )
                with gr.TabItem("🌍 Multilingual", id=1):
                    gr.Markdown("📊 **Dataset Information:** This tab uses the Global MMLU dataset, filtered to the medical subcategory only (10.7% of the dataset).")
                    create_leaderboard_ui(
                        subset_name="closed_ended_multilingual",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
                    )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=7):
            gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
            gr.HTML(FIVE_PILLAR_DIAGRAM)
            gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=8):
            with gr.Column():
                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
                    gr.Dataframe(value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
                with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
                    gr.Dataframe(value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
                with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
                    gr.Dataframe(value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="auto",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value=WeightType.Original.value.name,
                        interactive=False,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)", interactive=False)
            with gr.Row():
                domain_specific_toggle = gr.Checkbox(
                    label="Domain specific",
                    value=False,
                    info="Is your model medically oriented?",
                )
                chat_template_toggle = gr.Checkbox(
                    label="Use chat template",
                    value=False,
                    info="Is your model a chat model?",
                )

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    model_type,
                    domain_specific_toggle,
                    chat_template_toggle,
                    precision,
                    weight_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
# Restart the Space once a day (86400 seconds) so newly published results are picked up.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=86400)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch(allowed_paths=["./assets/"], share=True, ssr_mode=False)