Aaron Mueller committed · fdcf92b
1 Parent(s): a0adce1

add one-line description field for submissions

Files changed:
- app.py (+10 -4)
- src/display/utils.py (+7 -0)
- src/leaderboard/read_evals.py (+30 -5)
- src/submission/submit.py (+3 -1)
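In short, the commit threads a one-line description through the whole submission pipeline: a new textbox in the Gradio form (app.py), hidden-by-default "Description" leaderboard columns (src/display/utils.py), description-aware parsing of result files and tuple keys (src/leaderboard/read_evals.py), and persistence of the field in queued submission entries (src/submission/submit.py).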
app.py CHANGED

@@ -691,6 +691,7 @@ with demo:
         with gr.Group():
             gr.Markdown("### Submission Information")
             method_name = gr.Textbox(label="Method Name")
+            description = gr.Textbox(label="One-line Description")
             contact_email = gr.Textbox(label="Contact Email")
 
         # Dynamic UI logic
@@ -707,7 +708,7 @@ with demo:
         # Submission handling
         status = gr.Textbox(label="Submission Status", visible=False)
 
-        def handle_submission(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email):
+        def handle_submission(track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email):
             errors = []
             warnings = []
 
@@ -722,6 +723,11 @@ with demo:
                 errors.append("Valid email address is required")
             if "Circuit" in track and not level:
                 errors.append("Level of granularity is required")
+            if len(description.strip()) > 150:
+                warnings.append("Description longer than 150 characters and will be truncated.")
+                description = description.strip()[:150]
+            if not description.strip():
+                errors.append("Description is required")
 
             if not hf_repo.startswith("https://huggingface.co/") and not hf_repo.startswith("http://huggingface.co/"):
                 errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
@@ -761,11 +767,11 @@ with demo:
                 return [
                     gr.Textbox("Warnings:", visible=True),
                     gr.Markdown("\n\n".join(f"• {w}" for w in warnings)),
-                    (track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id),
+                    (track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email, _id),
                     gr.Column(visible=True)
                 ]
             else:
-                return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id)
+                return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email, _id)
 
         # New warning confirmation dialog
         warning_modal = gr.Column(visible=False, variant="panel")
@@ -781,7 +787,7 @@ with demo:
         submit_btn = gr.Button("Submit Entry", variant="primary")
         submit_btn.click(
             handle_submission,
-            inputs=[track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email],
+            inputs=[track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email],
             outputs=[status, warning_display, pending_submission, warning_modal]
         )
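The new checks in handle_submission warn and truncate overly long descriptions before rejecting empty ones. A minimal standalone sketch of that logic (the validate_description helper is hypothetical, not part of this commit):

# Sketch of the description validation added above; validate_description
# is a hypothetical helper, not a function from this repo.
def validate_description(description: str, limit: int = 150):
    errors, warnings = [], []
    if len(description.strip()) > limit:
        warnings.append(f"Description longer than {limit} characters and will be truncated.")
        description = description.strip()[:limit]
    if not description.strip():
        errors.append("Description is required")
    return description, errors, warnings

# Example: a 200-character description is truncated to 150 with a warning.
desc, errs, warns = validate_description("x" * 200)
assert len(desc) == 150 and warns and not errs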
src/display/utils.py CHANGED

@@ -62,6 +62,11 @@ auto_eval_column_dict_mib_subgraph.append(
     ["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]
 )
 
+# Description support
+auto_eval_column_dict_mib_subgraph.append(
+    ["description", ColumnContent, ColumnContent("Description", "markdown", False)]
+)
+
 # Add columns for each task-model combination
 for task in TasksMib_Subgraph:
     for model in task.value.models:
@@ -113,6 +118,8 @@ auto_eval_column_dict_mib_causalgraph = []
 # Only include Method column as required
 auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
 
+auto_eval_column_dict_mib_causalgraph.append(["description", ColumnContent, ColumnContent("Description", "markdown", False)])
+
 # For each model-task-intervention-counterfactual combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:  # model will be lowercase
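Positionally, each appended entry declares a markdown column named "Description" that is not displayed by default, mirroring the existing "Method" entry minus never_hidden. A sketch under the assumption that ColumnContent has the usual HF leaderboard-template shape (the field names below are assumed, not taken from this repo):

from dataclasses import dataclass

# Assumed ColumnContent shape (typical HF leaderboard template);
# the real field names in src/display/utils.py may differ.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# The new entry: a markdown column, hidden by default, hideable by users.
desc_col = ColumnContent("Description", "markdown", False)
assert desc_col.displayed_by_default is False and not desc_col.never_hidden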
src/leaderboard/read_evals.py CHANGED

@@ -52,6 +52,7 @@ class EvalResult_MIB_SUBGRAPH:
     """Represents one full evaluation for a method across all models in MIB."""
     eval_name: str      # method name as identifier
     method_name: str    # name of the interpretation method
+    description: str    # one-line description of the method
     results: Dict       # nested dict of results {task: {model: {metric: scores}}}
 
 
@@ -61,6 +62,10 @@ class EvalResult_MIB_SUBGRAPH:
             data = json.load(fp)
 
         method_name = data.get("method_name")
+        if "description" in data:
+            description = data.get("description")
+        else:
+            description = ""
 
         # Initialize results dictionary with the exact structure from JSON
         results = {}
@@ -97,6 +102,7 @@ class EvalResult_MIB_SUBGRAPH:
         return EvalResult_MIB_SUBGRAPH(
             eval_name=method_name,
             method_name=method_name,
+            description=description,
             results=results
         )
 
@@ -119,6 +125,7 @@ class EvalResult_MIB_SUBGRAPH:
         data_dict = {
            "eval_name": self.eval_name,
            "Method": self.method_name,
+            # "Description": self.description,
        }
 
        # Initialize all possible columns with '-'
@@ -183,7 +190,7 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
    eval_results = []
    for model_result_filepath in model_result_filepaths:
        try:
-            eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
+            eval_result = EvalResult_MIB_SUBGRAPH("", "", "", {})  # Create empty instance
            result = eval_result.init_from_json_file(model_result_filepath)
            # Verify the result can be converted to dict format
            result.to_dict()
@@ -284,6 +291,7 @@ class EvalResult_MIB_CAUSALGRAPH:
    """Represents one full evaluation for a method across all models for causal variable localization."""
    eval_name: str         # method name as identifier
    method_name: str       # name of the interpretation method
+    description: str       # one-line description of the method
    model_name: str        # name of the model
    task_name: str         # name of the task
    target_variables: str  # target variables (e.g., "answer", "answer_pointer")
@@ -308,16 +316,26 @@
            # Parse tuple key: "('method', 'model', 'task', 'variable')"
            try:
                key_tuple = ast.literal_eval(key)
-                method_name, model_name, task_name, target_variable = key_tuple
+                if len(key_tuple) == 4:
+                    method_name, model_name, task_name, target_variable = key_tuple
+                    description = ""
+                elif len(key_tuple) == 5:
+                    method_name, description, model_name, task_name, target_variable = key_tuple
            except:
                # Alternative parsing with regex
                pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
                match = re.match(pattern, key)
                if match:
                    method_name, model_name, task_name, target_variable = match.groups()
+                    description = ""
                else:
-                    print(f"Couldn't parse key: {key}")
-                    continue
+                    pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
+                    match = re.match(pattern, key)
+                    if match:
+                        method_name, description, model_name, task_name, target_variable = match.groups()
+                    else:
+                        print(f"Couldn't parse key: {key}")
+                        continue
 
            # Get average and highest accuracy
            average_accuracy = entry.get("average_accuracy", 0.0)
@@ -327,6 +345,7 @@
            result = EvalResult_MIB_CAUSALGRAPH(
                eval_name=f"{method_name}_{model_name}_{task_name}_{target_variable}",
                method_name=method_name,
+                description=description,
                model_name=model_name,
                task_name=task_name,
                target_variables=target_variable,
@@ -352,7 +371,6 @@
        # Create column name in the exact format requested
        # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
-        print(f"col_name is {col_name}")
 
        # Select the appropriate accuracy metric based on metric_type
        score = self.average_accuracy if metric_type == "Mean" else self.highest_accuracy
@@ -361,6 +379,7 @@
        data_dict = {
            "eval_name": self.eval_name,
            "Method": self.method_name,
+            # "Description": self.description,
            col_name: score
        }
 
@@ -542,6 +561,7 @@ class EvalResult:
    """
    eval_name: str    # org_model_track (uid)
    full_model: str   # org/model (name of model)
+    description: str  # description of model
    repo_id: str      # org/model (path to model on HF)
    track: str
    org: str
@@ -559,6 +579,10 @@
 
        config = data.get("config")
        track = data.get("track")
+        if "description" in data:
+            description = data.get("description")
+        else:
+            description = ""
 
        # Get model and org
        org_and_model = config.get("model_name", config.get("model_args", None))
@@ -605,6 +629,7 @@
        return self(
            eval_name=eval_name,
            full_model=full_model,
+            description=description,
            repo_id=repo_id,
            track=track,
            org=org,
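The parser now accepts both the legacy 4-tuple keys and the new 5-tuple keys that carry the description in second position. A quick illustration with ast.literal_eval (the key values below are made up):

import ast

# Legacy key: no description. New key: description in second position.
keys = [
    "('my_method', 'model_a', 'task_1', 'answer')",
    "('my_method', 'a one-line description', 'model_a', 'task_1', 'answer')",
]

for key in keys:
    key_tuple = ast.literal_eval(key)
    if len(key_tuple) == 4:
        method_name, model_name, task_name, target_variable = key_tuple
        description = ""
    else:  # len(key_tuple) == 5
        method_name, description, model_name, task_name, target_variable = key_tuple
    print(method_name, repr(description), model_name, task_name, target_variable)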
src/submission/submit.py CHANGED

@@ -16,7 +16,7 @@ import gradio as gr
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
-def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id):
+def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email, _id):
     errors = []
     hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
     repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
@@ -40,6 +40,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id):
         "revision": commit_hash,
         "circuit_level": level.lower(),
         "method_name": method_name,
+        "description": description,
         "contact_email": contact_email.lower(),
         "submit_time": current_time,
         "status": "PREVALIDATION",
@@ -53,6 +54,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id):
         "user_name": user_name,
         "revision": commit_hash,
         "method_name": method_name,
+        "description": description,
         "contact_email": contact_email.lower(),
         "submit_time": current_time,
         "status": "PREVALIDATION",
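With the field persisted, a queued circuit-track entry would look roughly like the dict below (every value is illustrative, and only keys visible in this diff are shown):

# Illustrative queue entry after this commit; all values are made up.
eval_entry = {
    "revision": "abc1234",            # commit_hash of the submitted repo
    "circuit_level": "edge",          # level.lower()
    "method_name": "my_method",
    "description": "A one-line description of the method",  # new field
    "contact_email": "user@example.com",
    "submit_time": "2025-05-01T12:00:00",
    "status": "PREVALIDATION",
}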