new-results (#10)
- add new results (f6deee3a68bae677a45241391a990d2993d93616)
- move results to results directory (6218e4b3d7964f5903e36a6ebcd653959e99e2ac)
- fix names (a437028ee192e55ceaf9326b4b6cea8cf1820881)
- fix decimals (1eeaad645ed122796dced99d555403761ad10e45)
Co-authored-by: Aman Jaiswal <[email protected]>
- results/GenericAgent-Claude-3.7-Sonnet/README.md +44 -0
- results/GenericAgent-Claude-3.7-Sonnet/webarena.json +16 -0
- results/GenericAgent-Claude-4-Sonnet/README.md +44 -0
- results/GenericAgent-Claude-4-Sonnet/miniwob.json +17 -0
- results/GenericAgent-Claude-4-Sonnet/workarena-l1.json +16 -0
- results/GenericAgent-Claude-4-Sonnet/workarena-l2.json +16 -0
- results/GenericAgent-GPT-4.1-Mini/README.md +44 -0
- results/GenericAgent-GPT-4.1-Mini/webarena.json +16 -0
- results/GenericAgent-GPT-5-mini/README.md +44 -0
- results/GenericAgent-GPT-5-mini/miniwob.json +16 -0
- results/GenericAgent-GPT-5-mini/workarena-l1.json +16 -0
- results/GenericAgent-GPT-5-mini/workarena-l2.json +16 -0
- results/GenericAgent-GPT-5-nano/README.md +44 -0
- results/GenericAgent-GPT-5-nano/miniwob.json +16 -0
- results/GenericAgent-GPT-5-nano/workarena-l1.json +16 -0
- results/GenericAgent-GPT-5-nano/workarena-l2.json +16 -0
- results/GenericAgent-GPT-5/README.md +44 -0
- results/GenericAgent-GPT-5/miniwob.json +16 -0
- results/GenericAgent-GPT-5/workarena-l1.json +16 -0
- results/GenericAgent-GPT-5/workarena-l2.json +16 -0
- results/GenericAgent-GPT-5/workarena-l3.json +16 -0
- results/GenericAgent-GPT-oss-120b/README.md +44 -0
- results/GenericAgent-GPT-oss-120b/miniwob.json +16 -0
- results/GenericAgent-GPT-oss-120b/workarena-l1.json +16 -0
- results/GenericAgent-GPT-oss-120b/workarena-l2.json +16 -0
- results/GenericAgent-GPT-oss-20b/README.md +44 -0
- results/GenericAgent-GPT-oss-20b/miniwob.json +16 -0
- results/GenericAgent-GPT-oss-20b/workarena-l1.json +16 -0
- results/GenericAgent-GPT-oss-20b/workarena-l2.json +16 -0
- results/OrbyAgent-Claude-3.5-Sonnet/README.md +1 -0
results/GenericAgent-Claude-3.7-Sonnet/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-Claude-3.7-Sonnet
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses Claude-3.7-Sonnet (claude-3-7-sonnet-20250219) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-Claude-3.7-Sonnet/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.7-Sonnet",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WebArena",
|
6 |
+
"score": 44.6,
|
7 |
+
"std_err": 2.5,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-Claude-4-Sonnet/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-Claude-4-Sonnet
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses Claude-4-Sonnet (claude-sonnet-4-20250514) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-Claude-4-Sonnet/miniwob.json
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 70.7,
|
7 |
+
"std_err": 1.8,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
|
17 |
+
]
|
results/GenericAgent-Claude-4-Sonnet/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 63.3,
|
7 |
+
"std_err": 2.7,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-Claude-4-Sonnet/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
+
"score": 40.4,
|
7 |
+
"std_err": 3.2,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-4.1-Mini/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-4.1-Mini
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses gpt-4.1-mini (gpt-4.1-mini-2025-04-14) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-GPT-4.1-Mini/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4.1-Mini",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WebArena",
|
6 |
+
"score": 30.7,
|
7 |
+
"std_err": 2.4,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5-mini/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-5-mini
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses gpt-5-mini (gpt-5-mini-2025-08-07) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-GPT-5-mini/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-mini",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 71,
|
7 |
+
"std_err": 1.8,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5-mini/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-mini",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 60.6,
|
7 |
+
"std_err": 2.7,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5-mini/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-mini",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
+
"score": 47.7,
|
7 |
+
"std_err": 3.3,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5-nano/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-5-nano
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses gpt-5-nano (gpt-5-nano-2025-08-07) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-GPT-5-nano/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-nano",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 64.8,
|
7 |
+
"std_err": 1.9,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5-nano/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-nano",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 40.6,
|
7 |
+
"std_err": 2.7,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5-nano/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-nano",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
+
"score": 3.4,
|
7 |
+
"std_err": 1.2,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-5
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses gpt-5 (gpt-5-2025-08-07) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-GPT-5/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 71.5,
|
7 |
+
"std_err": 1.8,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 79.1,
|
7 |
+
"std_err": 2.2,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "No",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "Increased max_steps from 15 to 30",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
+
"score": 69.4,
|
7 |
+
"std_err": 3.0,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-5/workarena-l3.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L3",
|
6 |
+
"score": 11.5,
|
7 |
+
"std_err": 2.1,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "No",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "Increased max_steps from 50 to 100",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-oss-120b/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-oss-120b
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses gpt-oss-120b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-GPT-oss-120b/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-120b",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 66.4,
|
7 |
+
"std_err": 1.9,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-oss-120b/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-120b",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 50.9,
|
7 |
+
"std_err": 2.8,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-oss-120b/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-120b",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
+
"score": 11.5,
|
7 |
+
"std_err": 2.1,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-oss-20b/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-oss-20b
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses gpt-oss-20b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
results/GenericAgent-GPT-oss-20b/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-20b",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 64,
|
7 |
+
"std_err": 1.9,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-oss-20b/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-20b",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 38.5,
|
7 |
+
"std_err": 2.7,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-oss-20b/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-20b",
|
4 |
+
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
+
"score": 2.6,
|
7 |
+
"std_err": 1.0,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
results/OrbyAgent-Claude-3.5-Sonnet/README.md
CHANGED
@@ -5,3 +5,4 @@ This agent is developed by [Orby AI](https://www.orby.ai/).
|
|
5 |
The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison.
|
6 |
|
7 |
It uses Claude-3.5-sonnet-20241022 as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
|
|
|
|
5 |
The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison.
|
6 |
|
7 |
It uses Claude-3.5-sonnet-20241022 as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
|
8 |
+
|