Aaron Mueller committed · fdcf92b
1 Parent(s): a0adce1

add one-line description field for submissions

Files changed:
- app.py (+10 -4)
- src/display/utils.py (+7 -0)
- src/leaderboard/read_evals.py (+30 -5)
- src/submission/submit.py (+3 -1)
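In short, the commit threads a one-line description through the whole submission pipeline: a new textbox in the Gradio form (app.py), hidden-by-default "Description" leaderboard columns (src/display/utils.py), description-aware parsing of result files and tuple keys (src/leaderboard/read_evals.py), and persistence of the field in queued submission entries (src/submission/submit.py).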
app.py CHANGED

@@ -691,6 +691,7 @@ with demo:
         with gr.Group():
             gr.Markdown("### Submission Information")
             method_name = gr.Textbox(label="Method Name")
+            description = gr.Textbox(label="One-line Description")
             contact_email = gr.Textbox(label="Contact Email")
 
         # Dynamic UI logic
@@ -707,7 +708,7 @@ with demo:
         # Submission handling
         status = gr.Textbox(label="Submission Status", visible=False)
 
-        def handle_submission(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email):
+        def handle_submission(track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email):
             errors = []
             warnings = []
 
@@ -722,6 +723,11 @@ with demo:
                 errors.append("Valid email address is required")
             if "Circuit" in track and not level:
                 errors.append("Level of granularity is required")
+            if len(description.strip()) > 150:
+                warnings.append("Description longer than 150 characters and will be truncated.")
+                description = description.strip()[:150]
+            if not description.strip():
+                errors.append("Description is required")
 
             if not hf_repo.startswith("https://huggingface.co/") and not hf_repo.startswith("http://huggingface.co/"):
                 errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
@@ -761,11 +767,11 @@ with demo:
                 return [
                     gr.Textbox("Warnings:", visible=True),
                     gr.Markdown("\n\n".join(f"• {w}" for w in warnings)),
-                    (track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id),
+                    (track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email, _id),
                     gr.Column(visible=True)
                 ]
             else:
-                return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id)
+                return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email, _id)
 
         # New warning confirmation dialog
         warning_modal = gr.Column(visible=False, variant="panel")
@@ -781,7 +787,7 @@ with demo:
         submit_btn = gr.Button("Submit Entry", variant="primary")
         submit_btn.click(
             handle_submission,
-            inputs=[track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email],
+            inputs=[track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email],
             outputs=[status, warning_display, pending_submission, warning_modal]
         )
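The new checks in handle_submission warn and truncate overly long descriptions before rejecting empty ones. A minimal standalone sketch of that logic (the validate_description helper is hypothetical, not part of this commit):

# Sketch of the description validation added above; validate_description
# is a hypothetical helper, not a function from this repo.
def validate_description(description: str, limit: int = 150):
    errors, warnings = [], []
    if len(description.strip()) > limit:
        warnings.append(f"Description longer than {limit} characters and will be truncated.")
        description = description.strip()[:limit]
    if not description.strip():
        errors.append("Description is required")
    return description, errors, warnings

# Example: a 200-character description is truncated to 150 with a warning.
desc, errs, warns = validate_description("x" * 200)
assert len(desc) == 150 and warns and not errs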
src/display/utils.py CHANGED

@@ -62,6 +62,11 @@ auto_eval_column_dict_mib_subgraph.append(
     ["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]
 )
 
+# Description support
+auto_eval_column_dict_mib_subgraph.append(
+    ["description", ColumnContent, ColumnContent("Description", "markdown", False)]
+)
+
 # Add columns for each task-model combination
 for task in TasksMib_Subgraph:
     for model in task.value.models:
@@ -113,6 +118,8 @@ auto_eval_column_dict_mib_causalgraph = []
 # Only include Method column as required
 auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
 
+auto_eval_column_dict_mib_causalgraph.append(["description", ColumnContent, ColumnContent("Description", "markdown", False)])
+
 # For each model-task-intervention-counterfactual combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:  # model will be lowercase
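Positionally, each appended entry declares a markdown column named "Description" that is not displayed by default, mirroring the existing "Method" entry minus never_hidden. A sketch under the assumption that ColumnContent has the usual HF leaderboard-template shape (the field names below are assumed, not taken from this repo):

from dataclasses import dataclass

# Assumed ColumnContent shape (typical HF leaderboard template);
# the real field names in src/display/utils.py may differ.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# The new entry: a markdown column, hidden by default, hideable by users.
desc_col = ColumnContent("Description", "markdown", False)
assert desc_col.displayed_by_default is False and not desc_col.never_hidden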
src/leaderboard/read_evals.py CHANGED

@@ -52,6 +52,7 @@ class EvalResult_MIB_SUBGRAPH:
     """Represents one full evaluation for a method across all models in MIB."""
     eval_name: str      # method name as identifier
     method_name: str    # name of the interpretation method
+    description: str    # one-line description of the method
     results: Dict       # nested dict of results {task: {model: {metric: scores}}}
 
 
@@ -61,6 +62,10 @@ class EvalResult_MIB_SUBGRAPH:
             data = json.load(fp)
 
         method_name = data.get("method_name")
+        if "description" in data:
+            description = data.get("description")
+        else:
+            description = ""
 
         # Initialize results dictionary with the exact structure from JSON
         results = {}
@@ -97,6 +102,7 @@ class EvalResult_MIB_SUBGRAPH:
         return EvalResult_MIB_SUBGRAPH(
             eval_name=method_name,
             method_name=method_name,
+            description=description,
             results=results
         )
 
@@ -119,6 +125,7 @@ class EvalResult_MIB_SUBGRAPH:
         data_dict = {
            "eval_name": self.eval_name,
            "Method": self.method_name,
+            # "Description": self.description,
        }
 
        # Initialize all possible columns with '-'
@@ -183,7 +190,7 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
    eval_results = []
    for model_result_filepath in model_result_filepaths:
        try:
-            eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
+            eval_result = EvalResult_MIB_SUBGRAPH("", "", "", {})  # Create empty instance
            result = eval_result.init_from_json_file(model_result_filepath)
            # Verify the result can be converted to dict format
            result.to_dict()
@@ -284,6 +291,7 @@ class EvalResult_MIB_CAUSALGRAPH:
    """Represents one full evaluation for a method across all models for causal variable localization."""
    eval_name: str         # method name as identifier
    method_name: str       # name of the interpretation method
+    description: str       # one-line description of the method
    model_name: str        # name of the model
    task_name: str         # name of the task
    target_variables: str  # target variables (e.g., "answer", "answer_pointer")
@@ -308,16 +316,26 @@
            # Parse tuple key: "('method', 'model', 'task', 'variable')"
            try:
                key_tuple = ast.literal_eval(key)
-                method_name, model_name, task_name, target_variable = key_tuple
+                if len(key_tuple) == 4:
+                    method_name, model_name, task_name, target_variable = key_tuple
+                    description = ""
+                elif len(key_tuple) == 5:
+                    method_name, description, model_name, task_name, target_variable = key_tuple
            except:
                # Alternative parsing with regex
                pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
                match = re.match(pattern, key)
                if match:
                    method_name, model_name, task_name, target_variable = match.groups()
+                    description = ""
                else:
-                    print(f"Couldn't parse key: {key}")
-                    continue
+                    pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
+                    match = re.match(pattern, key)
+                    if match:
+                        method_name, description, model_name, task_name, target_variable = match.groups()
+                    else:
+                        print(f"Couldn't parse key: {key}")
+                        continue
 
            # Get average and highest accuracy
            average_accuracy = entry.get("average_accuracy", 0.0)
@@ -327,6 +345,7 @@
            result = EvalResult_MIB_CAUSALGRAPH(
                eval_name=f"{method_name}_{model_name}_{task_name}_{target_variable}",
                method_name=method_name,
+                description=description,
                model_name=model_name,
                task_name=task_name,
                target_variables=target_variable,
@@ -352,7 +371,6 @@
        # Create column name in the exact format requested
        # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
-        print(f"col_name is {col_name}")
 
        # Select the appropriate accuracy metric based on metric_type
        score = self.average_accuracy if metric_type == "Mean" else self.highest_accuracy
@@ -361,6 +379,7 @@
        data_dict = {
            "eval_name": self.eval_name,
            "Method": self.method_name,
+            # "Description": self.description,
            col_name: score
        }
 
@@ -542,6 +561,7 @@ class EvalResult:
    """
    eval_name: str    # org_model_track (uid)
    full_model: str   # org/model (name of model)
+    description: str  # description of model
    repo_id: str      # org/model (path to model on HF)
    track: str
    org: str
@@ -559,6 +579,10 @@
 
        config = data.get("config")
        track = data.get("track")
+        if "description" in data:
+            description = data.get("description")
+        else:
+            description = ""
 
        # Get model and org
        org_and_model = config.get("model_name", config.get("model_args", None))
@@ -605,6 +629,7 @@
        return self(
            eval_name=eval_name,
            full_model=full_model,
+            description=description,
            repo_id=repo_id,
            track=track,
            org=org,
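The parser now accepts both the legacy 4-tuple keys and the new 5-tuple keys that carry the description in second position. A quick illustration with ast.literal_eval (the key values below are made up):

import ast

# Legacy key: no description. New key: description in second position.
keys = [
    "('my_method', 'model_a', 'task_1', 'answer')",
    "('my_method', 'a one-line description', 'model_a', 'task_1', 'answer')",
]

for key in keys:
    key_tuple = ast.literal_eval(key)
    if len(key_tuple) == 4:
        method_name, model_name, task_name, target_variable = key_tuple
        description = ""
    else:  # len(key_tuple) == 5
        method_name, description, model_name, task_name, target_variable = key_tuple
    print(method_name, repr(description), model_name, task_name, target_variable)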
src/submission/submit.py CHANGED

@@ -16,7 +16,7 @@ import gradio as gr
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
-def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id):
+def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, description, contact_email, _id):
     errors = []
     hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
     repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
@@ -40,6 +40,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id):
         "revision": commit_hash,
         "circuit_level": level.lower(),
         "method_name": method_name,
+        "description": description,
         "contact_email": contact_email.lower(),
         "submit_time": current_time,
         "status": "PREVALIDATION",
@@ -53,6 +54,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id):
         "user_name": user_name,
         "revision": commit_hash,
         "method_name": method_name,
+        "description": description,
         "contact_email": contact_email.lower(),
         "submit_time": current_time,
         "status": "PREVALIDATION",
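With the field persisted, a queued circuit-track entry would look roughly like the dict below (every value is illustrative, and only keys visible in this diff are shown):

# Illustrative queue entry after this commit; all values are made up.
eval_entry = {
    "revision": "abc1234",            # commit_hash of the submitted repo
    "circuit_level": "edge",          # level.lower()
    "method_name": "my_method",
    "description": "A one-line description of the method",  # new field
    "contact_email": "user@example.com",
    "submit_time": "2025-05-01T12:00:00",
    "status": "PREVALIDATION",
}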