Spaces:

McGill-NLP/agent-reward-bench-demo

Running

App Files Community

xhluca committed on Apr 14

Commit

99e2870

1 Parent(s): 331ed33

Simplify table

Browse files

Files changed (1) hide show

demo.py +79 -80

demo.py CHANGED Viewed

@@ -484,95 +484,94 @@ base_screenshot_dir = Path(base_screenshot_dir)
 hl_action_parser = _build_highlevel_action_parser()
-with gr.Blocks(title="AgentRewardBench Demo") as demo, gr.Row():
     gr.Markdown(
         """
-        # AgentRewardBench Demo
-        | [**🤗Dataset**](https://huggingface.co/datasets/McGill-NLP/agent-reward-bench) | **📄Paper (TBA)** | [**🌐Website**](https://agent-reward-bench.github.io) | [**🏆Leaderboard**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-leaderboard) | [**💻Demo**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-demo)
-        | :--: | :--: | :--: | :--: | :--: |
         """
     )
-    with gr.Column(scale=4):
-        benchmark_default = "WebArena"
-        benchmark_dd = gr.Dropdown(
-            label="Benchmark", choices=list_benchmarks(base_traj_dir), value=benchmark_default
-        )
-        agents = list_agents(base_traj_dir, benchmark_default)
-        model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
-        task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
-        task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
-    @benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
-    def update_agents(benchmark):
-        agents = list_agents(base_traj_dir, benchmark)
-        return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
-    @model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
-    def update_task_ids(benchmark, agent):
-        task_ids = list_task_ids(base_traj_dir, benchmark, agent)
-        return gr.Dropdown(choices=task_ids, value=task_ids[0])
-    with gr.Column(scale=8):
-        @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
-        def render_trajectory(benchmark, agent, task_id):
-            traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
-            with open(traj_path, "rb") as f:
-                traj = orjson.loads(f.read())
-            goal = replace_string_content(traj["goal"])
-            gr.Textbox(label="Goal", value=goal, visible=True)
-            for step in traj["steps"]:
-                num = step["num"]
-                action = step["action"]
-                reasoning = step["reasoning"]
-                screenshot_path = step["screenshot_path"]
-                gr.Markdown(f"# Step {num}")
-                with gr.Group():
-                    im = Image.open(screenshot_path)
-                    im = apply_overlay_to_image(
-                        im, step, highlevel_action_parser=hl_action_parser
                     )
-                    format_ = "webp" if im.format is None else im.format
-                    gr.Image(im, label="Screenshot", format=format_)
-                    if reasoning is not None:
-                        gr.Textbox(reasoning, label="Reasoning", lines=4)
-                    if action is not None:
-                        gr.Textbox(action, label="Action", lines=2)
-        # multi-choices dropdown for judges
-        judge_dd = gr.Dropdown(
-            label="Judges",
-            choices=list(judges_dict.values()),
-            multiselect=True,
-            value=default_judges,
-        )
-        @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
-        def render_judge(benchmark, agent, task_id, judge_choices):
-            # load judgments
-            for judge in judges_dict.values():
-                if judge not in judge_choices:
-                    continue
-                judgment_path = get_judgment_path(
-                    base_judgments_dir, benchmark, agent, judge, task_id
-                )
-                if not judgment_path.exists():
-                    continue
-                with open(judgment_path, "rb") as f:
-                    judgment = orjson.loads(f.read())
-                if judge == "Rule-based":
-                    msg = get_message_from_rule_based(judgment)
-                else:
-                    msg = get_message_from_judgment(judgment)
-                gr.Textbox(label=judge, value=msg, lines=4)
 demo.launch()

 hl_action_parser = _build_highlevel_action_parser()
+with gr.Blocks(title="AgentRewardBench Demo") as demo:
     gr.Markdown(
         """
+        # AgentRewardBench Demo ([Website](https://agent-reward-bench.github.io))
         """
     )
+    with gr.Row():
+        with gr.Column(scale=4):
+            benchmark_default = "WebArena"
+            benchmark_dd = gr.Dropdown(
+                label="Benchmark", choices=list_benchmarks(base_traj_dir), value=benchmark_default
+            )
+            agents = list_agents(base_traj_dir, benchmark_default)
+            model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
+            task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
+            task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
+        @benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
+        def update_agents(benchmark):
+            agents = list_agents(base_traj_dir, benchmark)
+            return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
+        @model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
+        def update_task_ids(benchmark, agent):
+            task_ids = list_task_ids(base_traj_dir, benchmark, agent)
+            return gr.Dropdown(choices=task_ids, value=task_ids[0])
+        with gr.Column(scale=8):
+            @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
+            def render_trajectory(benchmark, agent, task_id):
+                traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
+                with open(traj_path, "rb") as f:
+                    traj = orjson.loads(f.read())
+                goal = replace_string_content(traj["goal"])
+                gr.Textbox(label="Goal", value=goal, visible=True)
+                for step in traj["steps"]:
+                    num = step["num"]
+                    action = step["action"]
+                    reasoning = step["reasoning"]
+                    screenshot_path = step["screenshot_path"]
+                    gr.Markdown(f"# Step {num}")
+                    with gr.Group():
+                        im = Image.open(screenshot_path)
+                        im = apply_overlay_to_image(
+                            im, step, highlevel_action_parser=hl_action_parser
+                        )
+                        format_ = "webp" if im.format is None else im.format
+                        gr.Image(im, label="Screenshot", format=format_)
+                        if reasoning is not None:
+                            gr.Textbox(reasoning, label="Reasoning", lines=4)
+                        if action is not None:
+                            gr.Textbox(action, label="Action", lines=2)
+            # multi-choices dropdown for judges
+            judge_dd = gr.Dropdown(
+                label="Judges",
+                choices=list(judges_dict.values()),
+                multiselect=True,
+                value=default_judges,
+            )
+            @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
+            def render_judge(benchmark, agent, task_id, judge_choices):
+                # load judgments
+                for judge in judges_dict.values():
+                    if judge not in judge_choices:
+                        continue
+                    judgment_path = get_judgment_path(
+                        base_judgments_dir, benchmark, agent, judge, task_id
                     )
+                    if not judgment_path.exists():
+                        continue
+                    with open(judgment_path, "rb") as f:
+                        judgment = orjson.loads(f.read())
+                    if judge == "Rule-based":
+                        msg = get_message_from_rule_based(judgment)
+                    else:
+                        msg = get_message_from_judgment(judgment)
+                    gr.Textbox(label=judge, value=msg, lines=4)
 demo.launch()