Amber Tanaka committed: Change name of LLM Base and adjust hover behavior (#85)

Files changed:
- leaderboard_transformer.py +16 -22
- ui_components.py +14 -14
leaderboard_transformer.py
CHANGED
@@ -112,7 +112,7 @@ def _pretty_column_name(raw_col: str) -> str:
         'Logs': 'Logs',
         'Openness': 'Openness',
         'Agent tooling': 'Agent Tooling',
-        'LLM base': 'LLM Base',
+        'LLM base': 'Models Used',
         'Source': 'Source',
     }
 
@@ -255,7 +255,7 @@ class DataTransformer:
         df_view = df_sorted.copy()
 
         # --- 3. Add Columns for Agent Openness and Tooling ---
-        base_cols = ["id","Agent","Submitter","LLM Base","Source"]
+        base_cols = ["id","Agent","Submitter","Models Used","Source"]
         new_cols = ["Openness", "Agent Tooling"]
         ending_cols = ["Date", "Logs"]
 
@@ -361,7 +361,7 @@ def _plot_scatter_plotly(
 
     x_col_to_use = x
     y_col_to_use = y
-    llm_base = data["LLM Base"] if "LLM Base" in data.columns else "LLM Base"
+    llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
 
     # --- Section 2: Data Preparation---
     required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
@@ -443,33 +443,37 @@ def _plot_scatter_plotly(
     ))
 
     # --- Section 5: Prepare for Marker Plotting ---
-    def format_hover_text(row, agent_col, x_axis_label, x_col, y_col):
+    def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
         """
         Builds the complete HTML string for the plot's hover tooltip.
-        Formats the 'LLM Base' column as a bulleted list if multiple.
+        Formats the 'Models Used' column as a bulleted list if multiple.
         """
         h_pad = " "
         parts = ["<br>"]
         parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
         parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
-        parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
+        if divider_line_x > 0 and row[x_col] >= divider_line_x:
+            # If no cost, display "Missing" for the cost.
+            parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
+        else:
+            parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
         parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
         parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
 
         # Add extra vertical space (line spacing) before the next section
         parts.append("<br>")
-        # Clean and format LLM Base column
-        llm_base_value = row['LLM Base']
+        # Clean and format Models Used column
+        llm_base_value = row['Models Used']
         llm_base_value = clean_llm_base_list(llm_base_value)
         if isinstance(llm_base_value, list) and llm_base_value:
-            parts.append(f"{h_pad}LLM Base:{h_pad}<br>")
+            parts.append(f"{h_pad}Models Used:{h_pad}<br>")
            # Create a list of padded bullet points
            list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
            # Join them with line breaks
            parts.append('<br>'.join(list_items))
        else:
            # Handle the non-list case with padding
-            parts.append(f"{h_pad}LLM Base: <b>{llm_base_value}</b>{h_pad}")
+            parts.append(f"{h_pad}Models Used: <b>{llm_base_value}</b>{h_pad}")
        # Add a final line break for bottom "padding"
        parts.append("<br>")
        # Join all the parts together into the final HTML string
@@ -481,7 +485,8 @@ def _plot_scatter_plotly(
             agent_col=agent_col,
             x_axis_label=x_axis_label,
             x_col=x_col_to_use,
-            y_col=y_col_to_use
+            y_col=y_col_to_use,
+            divider_line_x=divider_line_x
         ),
         axis=1
     )
@@ -542,17 +547,6 @@ def _plot_scatter_plotly(
         font_color="#d3dedc",
         ),
     )
-    # fig.add_layout_image(
-    #     dict(
-    #         source=logo_data_uri,
-    #         xref="x domain", yref="y domain",
-    #         x=1.1, y=1.1,
-    #         sizex=0.2, sizey=0.2,
-    #         xanchor="left",
-    #         yanchor="bottom",
-    #         layer="above",
-    #     ),
-    # )
 
     return fig
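Note on the hover-behavior change: only the tooltip's cost line depends on the new divider_line_x parameter. A minimal, self-contained sketch of that branch with hypothetical sample values (the row contents and divider position are illustrative, not the app's real data):

    # Sketch of the new cost-line logic in format_hover_text (assumed inputs).
    row = {"Agent": "Example Agent", "Overall Cost": 1.25}  # hypothetical row
    x_axis_label, x_col = "Cost", "Overall Cost"
    h_pad = " "

    # divider_line_x appears to mark the x-position where agents without a
    # reported cost are plotted; points at or beyond it now read "Missing".
    for divider_line_x in (0, 1.0):
        if divider_line_x > 0 and row[x_col] >= divider_line_x:
            line = f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>"
        else:
            line = f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>"
        print(line)
    # -> " Cost: <b>$1.25</b> <br>"    (divider disabled)
    # -> " Cost: <b>Missing</b> <br>"  (cost sits at/past the divider)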
ui_components.py
CHANGED
@@ -246,7 +246,7 @@ def build_descriptions_tooltip_content(table) -> str:
     return """
     <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
     <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-    <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+    <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
     <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
     <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
     <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
@@ -264,7 +264,7 @@ def build_descriptions_tooltip_content(table) -> str:
     return f"""
     <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
     <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-    <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+    <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
     <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
     <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
     <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
@@ -277,7 +277,7 @@ def build_descriptions_tooltip_content(table) -> str:
     return f"""
     <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
     <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-    <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+    <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
     <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
     <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
     <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
@@ -543,9 +543,9 @@ def create_leaderboard_display(
         if "Score" in col:
             df_view = format_score_column(df_view, col)
     scatter_plot = plots_dict.get('scatter_plot', go.Figure())
-    #Make pretty and format the LLM Base column
-    df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
-    df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
+    #Make pretty and format the Models Used column
+    df_view['Models Used'] = df_view['Models Used'].apply(clean_llm_base_list)
+    df_view['Models Used'] = df_view['Models Used'].apply(format_llm_base_with_html)
     # append the repro url to the end of the agent name
     if 'Source' in df_view.columns:
         df_view['Agent'] = df_view.apply(
@@ -567,7 +567,7 @@ def create_leaderboard_display(
     for col in df_headers:
         if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["Agent","Icon","LLM Base", "Pareto"]:
+        elif col in ["Agent","Icon","Models Used", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
@@ -655,7 +655,7 @@ def create_benchmark_details_display(
     benchmark_cost_col = f"{benchmark_name} Cost"
 
     # Define the columns needed for the detailed table
-    table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
+    table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
 
     # Filter to only columns that actually exist in the full dataframe
     existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -684,9 +684,9 @@ def create_benchmark_details_display(
         axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
     )
 
-    #Make pretty and format the LLM Base column
-    benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
-    benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
+    #Make pretty and format the Models Used column
+    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
+    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(format_llm_base_with_html)
     # append the repro url to the end of the agent name
     if 'Source' in benchmark_table_df.columns:
         benchmark_table_df['Agent'] = benchmark_table_df.apply(
@@ -719,7 +719,7 @@ def create_benchmark_details_display(
         'Icon',
         'Agent',
         'Submitter',
-        'LLM Base',
+        'Models Used',
         'Attempted Benchmark',
         benchmark_score_col,
         benchmark_cost_col,
@@ -741,7 +741,7 @@ def create_benchmark_details_display(
     for col in df_headers:
         if "Logs" in col or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["Agent", "Icon", "LLM Base", "Pareto"]:
+        elif col in ["Agent", "Icon", "Models Used", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
@@ -857,7 +857,7 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool
 
 def format_llm_base_with_html(value):
     """
-    Formats the 'LLM Base' cell value.
+    Formats the 'Models Used' cell value.
     If the value is a list with more than 1 element, it returns an
     HTML <span> with the full list in a hover-over tooltip.
     If it's a single-element list, it returns just that element.