fix decimals
Browse files- results/GenericAgent-Claude-3.7-Sonnet/webarena.json +2 -2
- results/GenericAgent-Claude-4-Sonnet/miniwob.json +2 -2
- results/GenericAgent-Claude-4-Sonnet/workarena-l1.json +2 -2
- results/GenericAgent-Claude-4-Sonnet/workarena-l2.json +2 -2
- results/GenericAgent-GPT-4.1-Mini/webarena.json +2 -2
- results/GenericAgent-GPT-5-mini/miniwob.json +2 -2
- results/GenericAgent-GPT-5-mini/workarena-l1.json +2 -2
- results/GenericAgent-GPT-5-mini/workarena-l2.json +2 -2
- results/GenericAgent-GPT-5-nano/miniwob.json +2 -2
- results/GenericAgent-GPT-5-nano/workarena-l1.json +2 -2
- results/GenericAgent-GPT-5-nano/workarena-l2.json +2 -2
- results/GenericAgent-GPT-5/miniwob.json +2 -2
- results/GenericAgent-GPT-5/workarena-l1.json +2 -2
- results/GenericAgent-GPT-5/workarena-l2.json +2 -2
- results/GenericAgent-GPT-5/workarena-l3.json +2 -2
- results/GenericAgent-GPT-oss-120b/miniwob.json +2 -2
- results/GenericAgent-GPT-oss-120b/workarena-l1.json +2 -2
- results/GenericAgent-GPT-oss-120b/workarena-l2.json +2 -2
- results/GenericAgent-GPT-oss-20b/miniwob.json +2 -2
- results/GenericAgent-GPT-oss-20b/workarena-l1.json +2 -2
- results/GenericAgent-GPT-oss-20b/workarena-l2.json +2 -2
results/GenericAgent-Claude-3.7-Sonnet/webarena.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-Claude-3.7-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WebArena",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-Claude-3.7-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WebArena",
|
| 6 |
+
"score": 44.6,
|
| 7 |
+
"std_err": 2.5,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-Claude-4-Sonnet/miniwob.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 70.7,
|
| 7 |
+
"std_err": 1.8,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-Claude-4-Sonnet/workarena-l1.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 63.3,
|
| 7 |
+
"std_err": 2.7,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-Claude-4-Sonnet/workarena-l2.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-Claude-4-Sonnet",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
+
"score": 40.4,
|
| 7 |
+
"std_err": 3.2,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-4.1-Mini/webarena.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-4.1-Mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WebArena",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-4.1-Mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WebArena",
|
| 6 |
+
"score": 30.7,
|
| 7 |
+
"std_err": 2.4,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5-mini/miniwob.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 71,
|
| 7 |
+
"std_err": 1.8,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5-mini/workarena-l1.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 60.6,
|
| 7 |
+
"std_err": 2.7,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5-mini/workarena-l2.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-mini",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
+
"score": 47.7,
|
| 7 |
+
"std_err": 3.3,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5-nano/miniwob.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-nano",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-nano",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 64.8,
|
| 7 |
+
"std_err": 1.9,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5-nano/workarena-l1.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-nano",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-nano",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 40.6,
|
| 7 |
+
"std_err": 2.7,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5-nano/workarena-l2.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-nano",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5-nano",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
+
"score": 3.4,
|
| 7 |
+
"std_err": 1.2,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5/miniwob.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 71.5,
|
| 7 |
+
"std_err": 1.8,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5/workarena-l1.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "No",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 79.1,
|
| 7 |
+
"std_err": 2.2,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "No",
|
results/GenericAgent-GPT-5/workarena-l2.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err": 0
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
+
"score": 69.4,
|
| 7 |
+
"std_err": 3.0,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-5/workarena-l3.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L3",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "No",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-5",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L3",
|
| 6 |
+
"score": 11.5,
|
| 7 |
+
"std_err": 2.1,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "No",
|
results/GenericAgent-GPT-oss-120b/miniwob.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-120b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-120b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 66.4,
|
| 7 |
+
"std_err": 1.9,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-oss-120b/workarena-l1.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-120b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-120b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 50.9,
|
| 7 |
+
"std_err": 2.8,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-oss-120b/workarena-l2.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-120b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-120b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
+
"score": 11.5,
|
| 7 |
+
"std_err": 2.1,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-oss-20b/miniwob.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-20b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-20b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 64,
|
| 7 |
+
"std_err": 1.9,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-oss-20b/workarena-l1.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-20b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err":
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-20b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 38.5,
|
| 7 |
+
"std_err": 2.7,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
results/GenericAgent-GPT-oss-20b/workarena-l2.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-20b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
-
"score":
|
| 7 |
-
"std_err": 0
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 3 |
"agent_name": "GenericAgent-GPT-oss-20b",
|
| 4 |
"study_id": "2025-08-07_21-09-16",
|
| 5 |
"benchmark": "WorkArena-L2",
|
| 6 |
+
"score": 2.6,
|
| 7 |
+
"std_err": 1.0,
|
| 8 |
"benchmark_specific": "No",
|
| 9 |
"benchmark_tuned": "No",
|
| 10 |
"followed_evaluation_protocol": "Yes",
|