Zachary Siegel committed on
Commit
8de3f0a
·
1 Parent(s): 797d23f

add results to leaderboard

Browse files
Files changed (2) hide show
  1. app.py +36 -0
  2. utils/viz.py +1 -1
app.py CHANGED
@@ -397,6 +397,42 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
397
  gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
398
  with gr.Row():
399
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
  # Will trigger autoscaling of plots when tabs are switched
402
  tabs.select(fn=None, inputs=None, outputs=None, js="""
 
397
  gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
398
  with gr.Row():
399
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
400
+ with gr.Tab("CORE-Bench-Medium"):
401
+ with gr.Row():
402
+ with gr.Column(scale=2):
403
+ Leaderboard(
404
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
405
+ select_columns=SelectColumns(
406
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
407
+ cant_deselect=["Agent Name"],
408
+ label="Select Columns to Display:",
409
+ ),
410
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
411
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
412
+ )
413
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
414
+ with gr.Row():
415
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
416
+ with gr.Row():
417
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
418
+ with gr.Tab("CORE-Bench-Easy"):
419
+ with gr.Row():
420
+ with gr.Column(scale=2):
421
+ Leaderboard(
422
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
423
+ select_columns=SelectColumns(
424
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
425
+ cant_deselect=["Agent Name"],
426
+ label="Select Columns to Display:",
427
+ ),
428
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
429
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
430
+ )
431
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
432
+ with gr.Row():
433
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
434
+ with gr.Row():
435
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
436
 
437
  # Will trigger autoscaling of plots when tabs are switched
438
  tabs.select(fn=None, inputs=None, outputs=None, js="""
utils/viz.py CHANGED
@@ -283,7 +283,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
283
  showlegend=False,
284
  hovertemplate="<br>".join([
285
  "<b>Agent</b>: %{customdata[0]}",
286
- "<b>Total Cost</b>: $%{x:.1f}",
287
  "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
288
  ]),
289
  hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
 
283
  showlegend=False,
284
  hovertemplate="<br>".join([
285
  "<b>Agent</b>: %{customdata[0]}",
286
+ "<b>Total Cost</b>: $%{x:.3f}",
287
  "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
288
  ]),
289
  hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),