Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
a2e0e8f
1
Parent(s):
2ba536b
Update causal graph page to display separate views for average and highest values
Browse files
- app.py +35 -7
- src/leaderboard/read_evals.py +1 -1
app.py
CHANGED
|
@@ -681,12 +681,12 @@ with demo:
|
|
| 681 |
# Then modify the Causal Graph tab section
|
| 682 |
with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
|
| 683 |
with gr.Tabs() as causalgraph_tabs:
|
| 684 |
-
with gr.TabItem("Detailed View", id=0):
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
with gr.TabItem("
|
| 690 |
gr.Markdown("""
|
| 691 |
### Filtering Options
|
| 692 |
Use the dropdown menus below to filter results by specific tasks or models.
|
|
@@ -718,11 +718,39 @@ with demo:
|
|
| 718 |
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
| 719 |
outputs=leaderboard_aggregated
|
| 720 |
)
|
| 721 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
|
| 723 |
LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
|
| 724 |
"Causal Graph"
|
| 725 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
|
| 727 |
with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
|
| 728 |
# Track selection
|
|
|
|
| 681 |
# Then modify the Causal Graph tab section
|
| 682 |
with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
|
| 683 |
with gr.Tabs() as causalgraph_tabs:
|
| 684 |
+
# with gr.TabItem("Detailed View", id=0):
|
| 685 |
+
# leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
|
| 686 |
+
# LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
|
| 687 |
+
# "Causal Graph"
|
| 688 |
+
# )
|
| 689 |
+
with gr.TabItem("Highest View", id=0):
|
| 690 |
gr.Markdown("""
|
| 691 |
### Filtering Options
|
| 692 |
Use the dropdown menus below to filter results by specific tasks or models.
|
|
|
|
| 718 |
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
| 719 |
outputs=leaderboard_aggregated
|
| 720 |
)
|
| 721 |
+
with gr.TabItem("Averaged View", id=1):
|
| 722 |
+
|
| 723 |
+
task_substring_checkbox = gr.CheckboxGroup(
|
| 724 |
+
choices=TASK_SUBSTRINGS,
|
| 725 |
+
label="View tasks:",
|
| 726 |
+
value=TASK_SUBSTRINGS, # Default to all substrings selected
|
| 727 |
+
)
|
| 728 |
+
model_substring_checkbox = gr.CheckboxGroup(
|
| 729 |
+
choices = MODEL_SUBSTRINGS,
|
| 730 |
+
label = "View models:",
|
| 731 |
+
value = MODEL_SUBSTRINGS
|
| 732 |
+
)
|
| 733 |
+
|
| 734 |
leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
|
| 735 |
LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
|
| 736 |
"Causal Graph"
|
| 737 |
)
|
| 738 |
+
original_leaderboard = gr.State(value=data)
|
| 739 |
+
task_substring_checkbox.change(
|
| 740 |
+
fn=update_leaderboard,
|
| 741 |
+
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
| 742 |
+
outputs=leaderboard_averaged
|
| 743 |
+
)
|
| 744 |
+
model_substring_checkbox.change(
|
| 745 |
+
fn=update_leaderboard,
|
| 746 |
+
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
| 747 |
+
outputs=leaderboard_averaged
|
| 748 |
+
)
|
| 749 |
+
|
| 750 |
+
# leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
|
| 751 |
+
# LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
|
| 752 |
+
# "Causal Graph"
|
| 753 |
+
# )
|
| 754 |
|
| 755 |
with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
|
| 756 |
# Track selection
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -508,7 +508,7 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
|
|
| 508 |
# intervention_averaged_mean_df = create_intervention_averaged_df(detailed_df_mean)
|
| 509 |
|
| 510 |
# return detailed_df_highest, detailed_df_mean, intervention_averaged_highest_df
|
| 511 |
-
return detailed_df_highest,
|
| 512 |
|
| 513 |
|
| 514 |
|
|
|
|
| 508 |
# intervention_averaged_mean_df = create_intervention_averaged_df(detailed_df_mean)
|
| 509 |
|
| 510 |
# return detailed_df_highest, detailed_df_mean, intervention_averaged_highest_df
|
| 511 |
+
return detailed_df_highest, detailed_df_highest, detailed_df_mean
|
| 512 |
|
| 513 |
|
| 514 |
|