Spaces:
Running
Running
Zachary Siegel
committed on
Commit
·
8de3f0a
1
Parent(s):
797d23f
add results to leaderboard
Browse files
- app.py +36 -0
- utils/viz.py +1 -1
app.py
CHANGED
@@ -397,6 +397,42 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
397 |
gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
|
398 |
with gr.Row():
|
399 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
|
401 |
# Will trigger autoscaling of plots when tabs are switched
|
402 |
tabs.select(fn=None, inputs=None, outputs=None, js="""
|
|
|
397 |
gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
|
398 |
with gr.Row():
|
399 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
400 |
+
with gr.Tab("CORE-Bench-Medium"):
|
401 |
+
with gr.Row():
|
402 |
+
with gr.Column(scale=2):
|
403 |
+
Leaderboard(
|
404 |
+
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
|
405 |
+
select_columns=SelectColumns(
|
406 |
+
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
407 |
+
cant_deselect=["Agent Name"],
|
408 |
+
label="Select Columns to Display:",
|
409 |
+
),
|
410 |
+
hide_columns=config.COREBENCH_HIDE_COLUMNS,
|
411 |
+
search_columns=config.COREBENCH_SEARCH_COLUMNS,
|
412 |
+
)
|
413 |
+
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
414 |
+
with gr.Row():
|
415 |
+
gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
|
416 |
+
with gr.Row():
|
417 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
418 |
+
with gr.Tab("CORE-Bench-Easy"):
|
419 |
+
with gr.Row():
|
420 |
+
with gr.Column(scale=2):
|
421 |
+
Leaderboard(
|
422 |
+
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
|
423 |
+
select_columns=SelectColumns(
|
424 |
+
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
425 |
+
cant_deselect=["Agent Name"],
|
426 |
+
label="Select Columns to Display:",
|
427 |
+
),
|
428 |
+
hide_columns=config.COREBENCH_HIDE_COLUMNS,
|
429 |
+
search_columns=config.COREBENCH_SEARCH_COLUMNS,
|
430 |
+
)
|
431 |
+
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
432 |
+
with gr.Row():
|
433 |
+
gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
|
434 |
+
with gr.Row():
|
435 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
436 |
|
437 |
# Will trigger autoscaling of plots when tabs are switched
|
438 |
tabs.select(fn=None, inputs=None, outputs=None, js="""
|
utils/viz.py
CHANGED
@@ -283,7 +283,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
|
|
283 |
showlegend=False,
|
284 |
hovertemplate="<br>".join([
|
285 |
"<b>Agent</b>: %{customdata[0]}",
|
286 |
-
"<b>Total Cost</b>: $%{x:.
|
287 |
"<b>Accuracy</b>: %{y:.1%}<extra></extra>",
|
288 |
]),
|
289 |
hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
|
|
|
283 |
showlegend=False,
|
284 |
hovertemplate="<br>".join([
|
285 |
"<b>Agent</b>: %{customdata[0]}",
|
286 |
+
"<b>Total Cost</b>: $%{x:.3f}",
|
287 |
"<b>Accuracy</b>: %{y:.1%}<extra></extra>",
|
288 |
]),
|
289 |
hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
|