fix names
Browse files
- results/GenericAgent-Claude-3.7-Sonnet/webarena.json +2 -2
- results/GenericAgent-Claude-4-Sonnet/miniwob.json +2 -2
- results/GenericAgent-Claude-4-Sonnet/workarena-l1.json +2 -2
- results/GenericAgent-Claude-4-Sonnet/workarena-l2.json +2 -2
- results/{GenericAgent-GPT-4_1-Mini → GenericAgent-GPT-4.1-Mini}/README.md +0 -0
- results/{GenericAgent-GPT-4_1-Mini → GenericAgent-GPT-4.1-Mini}/webarena.json +2 -2
- results/GenericAgent-GPT-5-mini/miniwob.json +1 -1
- results/GenericAgent-GPT-5-mini/workarena-l1.json +2 -2
- results/GenericAgent-GPT-5-mini/workarena-l2.json +2 -2
- results/GenericAgent-GPT-5-nano/miniwob.json +1 -1
- results/GenericAgent-GPT-5-nano/workarena-l1.json +2 -2
- results/GenericAgent-GPT-5-nano/workarena-l2.json +2 -2
- results/GenericAgent-GPT-5/miniwob.json +1 -1
- results/GenericAgent-GPT-5/workarena-l1.json +4 -19
- results/GenericAgent-GPT-5/workarena-l2.json +2 -2
- results/GenericAgent-GPT-5/workarena-l3.json +2 -2
- results/GenericAgent-GPT-oss-120b/miniwob.json +1 -1
- results/GenericAgent-GPT-oss-120b/workarena-l1.json +2 -2
- results/GenericAgent-GPT-oss-120b/workarena-l2.json +2 -2
- results/GenericAgent-GPT-oss-20b/miniwob.json +1 -1
- results/GenericAgent-GPT-oss-20b/workarena-l1.json +2 -2
- results/GenericAgent-GPT-oss-20b/workarena-l2.json +2 -2
results/GenericAgent-Claude-3.7-Sonnet/webarena.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.446,
|
7 |
"std_err": 0.025,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.7-Sonnet",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WebArena",
|
6 |
"score": 0.446,
|
7 |
"std_err": 0.025,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-Claude-4-Sonnet/miniwob.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.707,
|
7 |
"std_err": 0.018,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
"score": 0.707,
|
7 |
"std_err": 0.018,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-Claude-4-Sonnet/workarena-l1.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.633,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
"score": 0.633,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-Claude-4-Sonnet/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.404,
|
7 |
"std_err": 0.032,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
"score": 0.404,
|
7 |
"std_err": 0.032,
|
8 |
"benchmark_specific": "No",
|
results/{GenericAgent-GPT-4_1-Mini → GenericAgent-GPT-4.1-Mini}/README.md
RENAMED
File without changes
|
results/{GenericAgent-GPT-4_1-Mini → GenericAgent-GPT-4.1-Mini}/webarena.json
RENAMED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.307,
|
7 |
"std_err": 0.024,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4.1-Mini",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WebArena",
|
6 |
"score": 0.307,
|
7 |
"std_err": 0.024,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-5-mini/miniwob.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.71,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-mini",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.71,
|
results/GenericAgent-GPT-5-mini/workarena-l1.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.606,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-mini",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
"score": 0.606,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-5-mini/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.477,
|
7 |
"std_err": 0.033,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-mini",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
"score": 0.477,
|
7 |
"std_err": 0.033,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-5-nano/miniwob.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.648,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-nano",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.648,
|
results/GenericAgent-GPT-5-nano/workarena-l1.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.406,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-nano",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
"score": 0.406,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-5-nano/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.034,
|
7 |
"std_err": 0.012,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5-nano",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
"score": 0.034,
|
7 |
"std_err": 0.012,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-5/miniwob.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.715,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.715,
|
results/GenericAgent-GPT-5/workarena-l1.json
CHANGED
@@ -1,22 +1,8 @@
|
|
1 |
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-gpt-5-2025-08-07",
|
4 |
-
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "Workarena-L1",
|
6 |
-
"score": 0.661,
|
7 |
-
"std_err": 0.026,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2025-08-07 21:09:16"
|
15 |
-
},
|
16 |
{
|
17 |
-
"agent_name": "GenericAgent-
|
18 |
"study_id": "2025-08-07_21-09-16",
|
19 |
-
"benchmark": "
|
20 |
"score": 0.791,
|
21 |
"std_err": 0.022,
|
22 |
"benchmark_specific": "No",
|
@@ -26,6 +12,5 @@
|
|
26 |
"comments": "Increased max_steps from 15 to 30",
|
27 |
"original_or_reproduced": "Original",
|
28 |
"date_time": "2025-08-07 21:09:16"
|
29 |
-
}
|
30 |
-
|
31 |
-
]
|
|
|
1 |
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
"score": 0.791,
|
7 |
"std_err": 0.022,
|
8 |
"benchmark_specific": "No",
|
|
|
12 |
"comments": "Increased max_steps from 15 to 30",
|
13 |
"original_or_reproduced": "Original",
|
14 |
"date_time": "2025-08-07 21:09:16"
|
15 |
+
}
|
16 |
+
]
|
|
results/GenericAgent-GPT-5/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.694,
|
7 |
"std_err": 0.03,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
"score": 0.694,
|
7 |
"std_err": 0.03,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-5/workarena-l3.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.115,
|
7 |
"std_err": 0.021,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-5",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L3",
|
6 |
"score": 0.115,
|
7 |
"std_err": 0.021,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-oss-120b/miniwob.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.664,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-120b",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.664,
|
results/GenericAgent-GPT-oss-120b/workarena-l1.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.509,
|
7 |
"std_err": 0.028,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-120b",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
"score": 0.509,
|
7 |
"std_err": 0.028,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-oss-120b/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.115,
|
7 |
"std_err": 0.021,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-120b",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
"score": 0.115,
|
7 |
"std_err": 0.021,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-oss-20b/miniwob.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.64,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-20b",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 0.64,
|
results/GenericAgent-GPT-oss-20b/workarena-l1.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.385,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-20b",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
"score": 0.385,
|
7 |
"std_err": 0.027,
|
8 |
"benchmark_specific": "No",
|
results/GenericAgent-GPT-oss-20b/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "GenericAgent-
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
-
"benchmark": "
|
6 |
"score": 0.026,
|
7 |
"std_err": 0.01,
|
8 |
"benchmark_specific": "No",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-oss-20b",
|
4 |
"study_id": "2025-08-07_21-09-16",
|
5 |
+
"benchmark": "WorkArena-L2",
|
6 |
"score": 0.026,
|
7 |
"std_err": 0.01,
|
8 |
"benchmark_specific": "No",
|