core_leaderboard

Running

App Files Community

Zachary Siegel committed on Sep 29, 2024

Commit

8de3f0a

1 Parent(s): 797d23f

add results to leaderboard

Browse files

Files changed (2) hide show

app.py +36 -0
utils/viz.py +1 -1

app.py CHANGED Viewed

@@ -397,6 +397,42 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                 gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
     # Will trigger autoscaling of plots when tabs are switched
     tabs.select(fn=None, inputs=None, outputs=None, js="""

                 gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+        with gr.Tab("CORE-Bench-Medium"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    Leaderboard(
+                        value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
+                        select_columns=SelectColumns(
+                            default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
+                            cant_deselect=["Agent Name"],
+                            label="Select Columns to Display:",
+                        ),
+                        hide_columns=config.COREBENCH_HIDE_COLUMNS,
+                        search_columns=config.COREBENCH_SEARCH_COLUMNS,
+                    )
+                    # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
+            with gr.Row():
+                gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
+            with gr.Row():
+                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+        with gr.Tab("CORE-Bench-Easy"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    Leaderboard(
+                        value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
+                        select_columns=SelectColumns(
+                            default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
+                            cant_deselect=["Agent Name"],
+                            label="Select Columns to Display:",
+                        ),
+                        hide_columns=config.COREBENCH_HIDE_COLUMNS,
+                        search_columns=config.COREBENCH_SEARCH_COLUMNS,
+                    )
+                    # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
+            with gr.Row():
+                gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
+            with gr.Row():
+                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
     # Will trigger autoscaling of plots when tabs are switched
     tabs.select(fn=None, inputs=None, outputs=None, js="""

utils/viz.py CHANGED Viewed

@@ -283,7 +283,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
             showlegend=False,
             hovertemplate="<br>".join([
                 "<b>Agent</b>: %{customdata[0]}",
-                "<b>Total Cost</b>: $%{x:.1f}",
                 "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
             ]),
             hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),

             showlegend=False,
             hovertemplate="<br>".join([
                 "<b>Agent</b>: %{customdata[0]}",
+                "<b>Total Cost</b>: $%{x:.3f}",
                 "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
             ]),
             hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),