jasonshaoshun committed · 475701c
1 Parent(s): a2e0e8f
add target variable to causal graph
Browse files
- app.py +23 -12
- src/about.py +5 -4
- src/leaderboard/read_evals.py +4 -1
app.py
CHANGED
@@ -443,23 +443,32 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
         "4_answer_MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
-        "
-        "
+        "ARC_easy": "ARC (Easy)",
+        "RAVEL_task": "RAVEL"
+    }
+
+    target_variables_mapping = {
+        "output_token": "Output Token",
+        "output_position": "Output Position",
+        "answer_pointer": "Answer Pointer",
+        "answer": "Answer",
+        "Continent": "Continent",
+        "Language": "Language",
+        "Country": "Country",
+        "Language": "Language"
     }

     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
-
-
-
-
+            for target_variables in task.value.target_variables:
+                field_name = f"{model}_{task.value.col_name}_{target_variables}"
+                display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]} - {target_variables_mapping[target_variables]}"
+                display_mapping[field_name] = display_name

-    # print("\nDebugging display_mapping:", display_mapping)

     renamed_df = dataframe.rename(columns=display_mapping)

-    # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())

     # Create only necessary columns
     return Leaderboard(
@@ -521,8 +530,10 @@ def get_hf_username(hf_repo):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
+TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
 MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]

+
 def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
                                  selected_model_substrings: List[str]) -> pd.DataFrame:
     """
@@ -693,9 +704,9 @@ with demo:
                 You can combine filters to see specific task-model combinations.
                 """)
                 task_substring_checkbox = gr.CheckboxGroup(
-                    choices=
+                    choices=TASK_CAUSAL_SUBSTRINGS,
                     label="View tasks:",
-                    value=
+                    value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
                 )
                 model_substring_checkbox = gr.CheckboxGroup(
                     choices = MODEL_SUBSTRINGS,
@@ -721,9 +732,9 @@ with demo:
             with gr.TabItem("Averaged View", id=1):

                 task_substring_checkbox = gr.CheckboxGroup(
-                    choices=
+                    choices=TASK_CAUSAL_SUBSTRINGS,
                     label="View tasks:",
-                    value=
+                    value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
                 )
                 model_substring_checkbox = gr.CheckboxGroup(
                     choices = MODEL_SUBSTRINGS,
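To make the new column-naming logic in init_leaderboard_mib_causalgraph concrete, here is a minimal, self-contained Python sketch. The enum members and the benchmark_mapping / model_name_mapping stubs below are illustrative placeholders (reduced to two tasks and one model each, with assumed display values), not the app's full definitions; only the triple-nested loop mirrors the committed code.

from dataclasses import dataclass
from enum import Enum


@dataclass
class TaskMIB_Causalgraph:
    benchmark: str
    models: list[str]
    col_name: str
    metrics: list[str]
    target_variables: list[str]


class TasksMib_Causalgraph(Enum):
    # Reduced to two tasks and one model each for this sketch.
    task0 = TaskMIB_Causalgraph("ioi", ["GPT2LMHeadModel"], "ioi_task",
                                ["average_score"], ["output_token", "output_position"])
    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM"], "4_answer_MCQA",
                                ["average_score"], ["answer_pointer", "answer"])


# Stand-ins for the mappings defined inside init_leaderboard_mib_causalgraph;
# only the entries needed here, with assumed display values.
benchmark_mapping = {"ioi_task": "IOI", "4_answer_MCQA": "MCQA"}
model_name_mapping = {"GPT2LMHeadModel": "GPT-2", "Qwen2ForCausalLM": "Qwen-2.5"}
target_variables_mapping = {
    "output_token": "Output Token",
    "output_position": "Output Position",
    "answer_pointer": "Answer Pointer",
    "answer": "Answer",
}

# Same triple-nested loop as the committed code: one leaderboard column per
# (model, task, target variable) combination.
display_mapping = {}
for task in TasksMib_Causalgraph:
    for model in task.value.models:
        for target_variables in task.value.target_variables:
            field_name = f"{model}_{task.value.col_name}_{target_variables}"
            display_name = (f"{benchmark_mapping[task.value.col_name]} - "
                            f"{model_name_mapping[model]} - "
                            f"{target_variables_mapping[target_variables]}")
            display_mapping[field_name] = display_name

print(display_mapping["GPT2LMHeadModel_ioi_task_output_token"])  # IOI - GPT-2 - Output Token

Note that the committed target_variables_mapping repeats the "Language" key; Python dict literals keep only the last occurrence, and the duplicated "Language" in the RAVEL task list likewise just rewrites the same field_name, so neither duplicate adds an extra column.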
src/about.py
CHANGED
@@ -78,14 +78,15 @@ class TaskMIB_Causalgraph:
     models: list[str] # list of models to show as sub-columns
     col_name: str # display name in leaderboard
     metrics: list[str] # metrics to store (average_score)
+    target_variables: list[str]



 class TasksMib_Causalgraph(Enum):
-    task0 =
-    task1 =
-    task2 =
-    task3 =
+    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
+    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
+    task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL_task", ["average_score"], ["Continent", "Language", "Country", "Language"])
+    task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])

     @classmethod
     def get_all_tasks(cls):
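Since the hunk shows only the tail of the dataclass, the following sketch spells out how the updated TaskMIB_Causalgraph and one enum member plausibly read in full. The name of the first field ("benchmark") is an assumption inferred from the positional arguments ("ioi", "mcqa", ...); the remaining fields are copied from the diff, and only task0 is written out.

from dataclasses import dataclass
from enum import Enum


@dataclass
class TaskMIB_Causalgraph:
    benchmark: str               # internal task id such as "ioi" (field name assumed; not shown in the hunk)
    models: list[str]            # list of models to show as sub-columns
    col_name: str                # display name in leaderboard
    metrics: list[str]           # metrics to store (average_score)
    target_variables: list[str]  # new in this commit: causal variables targeted by the task


class TasksMib_Causalgraph(Enum):
    # task1 through task3 follow the same pattern as in the hunk above.
    task0 = TaskMIB_Causalgraph(
        "ioi",
        ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel",
         "Gemma2ForCausalLM", "LlamaForCausalLM"],
        "ioi_task",
        ["average_score"],
        ["output_token", "output_position"],
    )


for task in TasksMib_Causalgraph:
    print(task.value.col_name, task.value.target_variables)
# ioi_task ['output_token', 'output_position']

Iterating the enum exposes the new field directly as task.value.target_variables, which is what the display-mapping loop in app.py relies on.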
src/leaderboard/read_evals.py
CHANGED
@@ -298,6 +298,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     return averaged_df


+
 @dataclass
 class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method across all models for causal variable localization."""
@@ -370,7 +371,7 @@ class EvalResult_MIB_CAUSALGRAPH:
         """
         # Create column name in the exact format requested
         # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
-        col_name = f"{self.model_name}_{self.task_name}"
+        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
         print(f"col_name is {col_name}")

         # Select the appropriate accuracy metric based on metric_type
@@ -526,6 +527,8 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram



+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
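For reference, here is a small sketch of what the widened col_name produces. _CausalGraphResult is a hypothetical stand-in exposing only the three attributes the f-string reads, not the real EvalResult_MIB_CAUSALGRAPH.

from dataclasses import dataclass


@dataclass
class _CausalGraphResult:
    # Hypothetical stand-in for EvalResult_MIB_CAUSALGRAPH with just the
    # attributes referenced by the new col_name f-string.
    model_name: str
    task_name: str
    target_variables: str


result = _CausalGraphResult(
    model_name="GPT2LMHeadModel",
    task_name="ioi_task",
    target_variables="output_token",
)

# Same format as the added line: <model>_<task>_<target variable>.
col_name = f"{result.model_name}_{result.task_name}_{result.target_variables}"
assert col_name == "GPT2LMHeadModel_ioi_task_output_token"

The resulting string matches the field_name keys that init_leaderboard_mib_causalgraph now builds, which is what lets dataframe.rename(columns=display_mapping) turn each per-target-variable result column into its "IOI - GPT-2 - Output Token"-style header.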