jasonshaoshun committed
Commit · e46e945 · 1 Parent(s): 61542b8
debug

Files changed:
- app.py +3 -3
- src/about.py +38 -11
- src/display/utils.py +72 -24
- src/leaderboard/read_evals.py +97 -0
- src/populate.py +15 -3
app.py
CHANGED
@@ -74,7 +74,7 @@ except Exception:
 
 LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 
-
+LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
@@ -203,8 +203,8 @@ with demo:
     with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
         leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
 
-
-
+    with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
+        leaderboard = init_leaderboard_mib_causalgraph(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
     # with gr.Row():
     #     with gr.Accordion("📙 Citation", open=False):
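For context on the Gradio side of this change: the two tabs above follow the usual gr.Blocks / gr.Tabs / gr.TabItem pattern. The sketch below is illustrative only; the gr.Tabs wrapper, the stand-in dataframes, and the plain gr.Dataframe components are assumptions, since the real app builds its tables through init_leaderboard_mib_subgraph and init_leaderboard_mib_causalgraph.

import gradio as gr
import pandas as pd

# Stand-in dataframes; the real app builds these via get_leaderboard_df_mib_*.
df_subgraph = pd.DataFrame({"Method": ["example"], "ioi": [0.50]})
df_causalgraph = pd.DataFrame({"Method": ["example"], "layer0_output_token_symbol_counterfactual": [0.70]})

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
            gr.Dataframe(df_subgraph)        # real app: init_leaderboard_mib_subgraph(...)
        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            gr.Dataframe(df_causalgraph)     # real app: init_leaderboard_mib_causalgraph(...)

if __name__ == "__main__":
    demo.launch()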
src/about.py
CHANGED
@@ -8,13 +8,6 @@ class Task:
     col_name: str
 
 
-@dataclass
-class TaskMIB:
-    benchmark: str # task name in json (ioi/arithmetic)
-    models: list[str] # list of models to show as sub-columns
-    col_name: str # display name in leaderboard
-    metrics: list[str] # metrics to store (edge_counts, faithfulness)
-
 
 
 # Select your tasks here
@@ -27,10 +20,6 @@ class Tasks(Enum):
     task3 = Task("ewok", "acc", "EWoK")
 
 
-class TasksMib_Subgraph(Enum):
-    task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
-    task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
-
 
 class TasksMultimodal(Enum):
     task0 = Task("blimp", "acc", "BLiMP")
@@ -41,6 +30,44 @@ class TasksMultimodal(Enum):
     task5 = Task("winoground", "acc", "Winoground")
     task6 = Task("devbench", "acc", "DevBench")
 
+
+
+@dataclass
+class TaskMIB_Subgraph:
+    benchmark: str # task name in json (ioi/arithmetic)
+    models: list[str] # list of models to show as sub-columns
+    col_name: str # display name in leaderboard
+    metrics: list[str] # metrics to store (edge_counts, faithfulness)
+
+class TasksMib_Subgraph(Enum):
+    task0 = TaskMIB_Subgraph("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
+    task1 = TaskMIB_Subgraph("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
+
+
+@dataclass
+class TaskMIB_Causalgraph:
+    benchmark: str # MCQA
+    models: list[str] # LlamaForCausalLM
+    layers: list[str] # 0-31
+    col_name: str # display name in leaderboard
+    interventions: list[str] # output_token, output_location
+    counterfactuals: list[str] # symbol_counterfactual, randomLetter_counterfactual, etc.
+    metrics: list[str] # score <- Added this field
+
+class TasksMib_Causalgraph(Enum):
+    task0 = TaskMIB_Causalgraph(
+        "MCQA",
+        ["LlamaForCausalLM"],
+        [str(i) for i in range(32)], # 0-31 layers
+        "mcqa",
+        ["output_token", "output_location"],
+        ["symbol_counterfactual", "randomLetter_counterfactual",
+         "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"],
+        ["score"] # Added this
+    )
+
+
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
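A quick sanity check on the new task definitions (a sketch, not part of the commit): TasksMib_Causalgraph spans 32 layers, 2 interventions, and 4 counterfactuals, so the display code that expands it (see src/display/utils.py below) should end up with 256 per-combination score columns in addition to the Method column.

# Sketch: counting the column combinations implied by TasksMib_Causalgraph above.
layers = [str(i) for i in range(32)]
interventions = ["output_token", "output_location"]
counterfactuals = ["symbol_counterfactual", "randomLetter_counterfactual",
                   "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"]

cols = [f"layer{layer}_{intervention}_{counterfactual}"
        for layer in layers
        for intervention in interventions
        for counterfactual in counterfactuals]

print(len(cols))  # 256
print(cols[0])    # layer0_output_token_symbol_counterfactual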
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph
+from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph, TasksMib_Causalgraph
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -21,10 +21,41 @@ class ColumnContent:
     never_hidden: bool = False
 
 ## Leaderboard columns
-auto_eval_column_dict_mib_subgraph = []
 auto_eval_column_dict = []
 auto_eval_column_dict_multimodal = []
 
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
+auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
+#Scores
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
+auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
+for task in TasksMultimodal:
+    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    if task.value.col_name in ("ewok", "EWoK"): # make sure this appears in the right order
+        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+
+
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True)
+
+
+
+
+
+
 
 
 
@@ -67,6 +98,45 @@ COLS_MIB_CAUSALGRAPH = []
 BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
+# Initialize the MIB causal graph columns
+auto_eval_column_dict_mib_causalgraph = []
+
+# Method name column
+auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# For each layer-intervention-counterfactual combination
+for task in TasksMib_Causalgraph:
+    for model in task.value.models:
+        for layer in task.value.layers:
+            for intervention in task.value.interventions:
+                for counterfactual in task.value.counterfactuals:
+                    # Create column name like "layer0_output_token_symbol_counterfactual"
+                    col_name = f"layer{layer}_{intervention}_{counterfactual}"
+                    field_name = col_name.lower()
+                    auto_eval_column_dict_mib_causalgraph.append([
+                        field_name,
+                        ColumnContent,
+                        ColumnContent(col_name, "number", True)
+                    ])
+
+# Create the dataclass for MIB causal graph columns
+AutoEvalColumn_mib_causalgraph = make_dataclass("AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True)
+
+# Column selection for display
+COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
+BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"layer{layer}_{intervention}_{counterfactual}"
+                                  for task in TasksMib_Causalgraph
+                                  for model in task.value.models
+                                  for layer in task.value.layers
+                                  for intervention in task.value.interventions
+                                  for counterfactual in task.value.counterfactuals]
+
+
+
+
+
+
+
 
 
 
@@ -76,32 +146,10 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
-auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-#Scores
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
-auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
-auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-for task in TasksMultimodal:
-    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-    if task.value.col_name in ("ewok", "EWoK"): # make sure this appears in the right order
-        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
-auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
-auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
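For reference, make_dataclass turns each [field_name, ColumnContent, ColumnContent(...)] triple into a dataclass field whose default value is the ColumnContent instance, and the module's fields() helper then reads those defaults back. The sketch below shows that mechanism with a stripped-down ColumnContent; it is declared frozen here purely so the sketch runs on any recent Python, which may differ from the real class in the repo.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# Two example columns, mirroring the (field_name, type, default) triples built above.
column_dict = [
    ["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)],
    ["layer0_output_token_symbol_counterfactual", ColumnContent,
     ColumnContent("layer0_output_token_symbol_counterfactual", "number", True)],
]

AutoEvalColumnSketch = make_dataclass("AutoEvalColumnSketch", column_dict, frozen=True)

def fields(raw_class):
    # Same helper as in src/display/utils.py: collect the ColumnContent defaults.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

print([c.name for c in fields(AutoEvalColumnSketch) if not c.hidden])
# ['Method', 'layer0_output_token_symbol_counterfactual']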
src/leaderboard/read_evals.py
CHANGED
@@ -182,7 +182,104 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 
 
 
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method in MIB causalgraph."""
+    eval_name: str        # method name as identifier
+    method_name: str      # name of the interpretation method (e.g., "baseline_patching")
+    results: Dict         # nested dict of results {model_id: {task_scores: [{layer, scores}]}}
+
+    def init_from_json_file(self, json_filepath):
+        """Inits results from the method result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        method_name = data.get("method_name")
+        results = {}
+
+        # Get results for each model
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")
+            task_scores = model_result.get("task_scores", {})
+
+            # Process MCQA task scores
+            mcqa_scores = {}
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                layer_scores = layer_data.get("layer_scores", [])
+
+                # Store scores for each intervention and counterfactual
+                for intervention_data in layer_scores:
+                    intervention = intervention_data["intervention"][0]  # e.g., "output_token"
+                    counterfactual_scores = intervention_data["counterfactual_scores"]
+
+                    for cf_score in counterfactual_scores:
+                        counterfactual = cf_score["counterfactual"][0]  # e.g., "symbol_counterfactual"
+                        score = cf_score["score"]
+
+                        # Create key for this combination
+                        key = f"layer{layer}_{intervention}_{counterfactual}"
+                        mcqa_scores[key] = score
+
+            results[model_id] = mcqa_scores
+
+        return EvalResult_MIB_CAUSALGRAPH(
+            eval_name=method_name,
+            method_name=method_name,
+            results=results
+        )
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict for dataframe display"""
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+        }
+
+        # For each model, add all layer/intervention/counterfactual combinations
+        for model_id, scores in self.results.items():
+            for score_key, score_value in scores.items():
+                data_dict[score_key] = score_value
+
+        return data_dict
+
+def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB causalgraph results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+    for root, dirnames, files in os.walk(results_path):
+        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date - keeping original sorting logic
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
 
+    print(f"model_result_filepaths is {model_result_filepaths}")
+
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
+        try:
+            eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
+        except Exception as e:
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
+
+    return eval_results
 
 
 
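The JSON layout that init_from_json_file expects can be read off from the parsing code above. The snippet below is a hypothetical example with made-up method and score values, shown only to document that shape:

# Hypothetical result file content, shaped the way init_from_json_file reads it.
example_result = {
    "method_name": "example_method",          # becomes eval_name and the Method column
    "results": [
        {
            "model_id": "LlamaForCausalLM",
            "task_scores": {
                "MCQA": [
                    {
                        "layer": 0,
                        "layer_scores": [
                            {
                                "intervention": ["output_token"],
                                "counterfactual_scores": [
                                    {"counterfactual": ["symbol_counterfactual"], "score": 0.42}
                                ]
                            }
                        ]
                    }
                ]
            }
        }
    ]
}

# The parser flattens this into results["LlamaForCausalLM"] =
#   {"layer0_output_token_symbol_counterfactual": 0.42}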
src/populate.py
CHANGED
@@ -66,11 +66,23 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
     return df
 
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the MIB experiment results"""
+    """Creates a dataframe from all the MIB causal graph experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
-
-
+    print(f"raw_data is {raw_data}")
+
+    # Convert each result to dict format
+    all_data_json = [v.to_dict() for v in raw_data]
+    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
+
+    # Convert to dataframe
+    df = pd.DataFrame.from_records(all_data_json)
+
+    # Round numeric columns to 2 decimal places
+    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
+    df[numeric_cols] = df[numeric_cols].round(2)
+
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
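A small usage sketch of the new dataframe path in get_leaderboard_df_mib_causalgraph, fed with made-up records in the shape produced by EvalResult_MIB_CAUSALGRAPH.to_dict() (the method names and scores below are hypothetical):

import pandas as pd

# Hypothetical records, as returned by EvalResult_MIB_CAUSALGRAPH.to_dict().
all_data_json = [
    {"eval_name": "method_a", "Method": "method_a",
     "layer0_output_token_symbol_counterfactual": 0.4321},
    {"eval_name": "method_b", "Method": "method_b",
     "layer0_output_token_symbol_counterfactual": 0.8765},
]

df = pd.DataFrame.from_records(all_data_json)

# Same post-processing as the new code: round numeric columns to 2 decimals.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
df[numeric_cols] = df[numeric_cols].round(2)

print(df)  # the two scores display as 0.43 and 0.88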