Aman-J committed on
Commit
1eeaad6
·
1 Parent(s): a437028

fix decimals

Browse files
results/GenericAgent-Claude-3.7-Sonnet/webarena.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-Claude-3.7-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WebArena",
6
- "score": 0.446,
7
- "std_err": 0.025,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-Claude-3.7-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WebArena",
6
+ "score": 44.6,
7
+ "std_err": 2.5,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-Claude-4-Sonnet/miniwob.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
- "score": 0.707,
7
- "std_err": 0.018,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
+ "score": 70.7,
7
+ "std_err": 1.8,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-Claude-4-Sonnet/workarena-l1.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
- "score": 0.633,
7
- "std_err": 0.027,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
+ "score": 63.3,
7
+ "std_err": 2.7,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-Claude-4-Sonnet/workarena-l2.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
- "score": 0.404,
7
- "std_err": 0.032,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
+ "score": 40.4,
7
+ "std_err": 3.2,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-4.1-Mini/webarena.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-4.1-Mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WebArena",
6
- "score": 0.307,
7
- "std_err": 0.024,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-4.1-Mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WebArena",
6
+ "score": 30.7,
7
+ "std_err": 2.4,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5-mini/miniwob.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
- "score": 0.71,
7
- "std_err": 0.018,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
+ "score": 71,
7
+ "std_err": 1.8,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5-mini/workarena-l1.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
- "score": 0.606,
7
- "std_err": 0.027,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
+ "score": 60.6,
7
+ "std_err": 2.7,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5-mini/workarena-l2.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
- "score": 0.477,
7
- "std_err": 0.033,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
+ "score": 47.7,
7
+ "std_err": 3.3,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5-nano/miniwob.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
- "score": 0.648,
7
- "std_err": 0.019,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
+ "score": 64.8,
7
+ "std_err": 1.9,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5-nano/workarena-l1.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
- "score": 0.406,
7
- "std_err": 0.027,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
+ "score": 40.6,
7
+ "std_err": 2.7,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5-nano/workarena-l2.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
- "score": 0.034,
7
- "std_err": 0.012,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
+ "score": 3.4,
7
+ "std_err": 1.2,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5/miniwob.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
- "score": 0.715,
7
- "std_err": 0.018,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
+ "score": 71.5,
7
+ "std_err": 1.8,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5/workarena-l1.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
- "score": 0.791,
7
- "std_err": 0.022,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "No",
 
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
+ "score": 79.1,
7
+ "std_err": 2.2,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "No",
results/GenericAgent-GPT-5/workarena-l2.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
- "score": 0.694,
7
- "std_err": 0.03,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
+ "score": 69.4,
7
+ "std_err": 3.0,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-5/workarena-l3.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L3",
6
- "score": 0.115,
7
- "std_err": 0.021,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "No",
 
3
  "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L3",
6
+ "score": 11.5,
7
+ "std_err": 2.1,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "No",
results/GenericAgent-GPT-oss-120b/miniwob.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
- "score": 0.664,
7
- "std_err": 0.019,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
+ "score": 66.4,
7
+ "std_err": 1.9,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-oss-120b/workarena-l1.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
- "score": 0.509,
7
- "std_err": 0.028,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
+ "score": 50.9,
7
+ "std_err": 2.8,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-oss-120b/workarena-l2.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
- "score": 0.115,
7
- "std_err": 0.021,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
+ "score": 11.5,
7
+ "std_err": 2.1,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-oss-20b/miniwob.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
- "score": 0.64,
7
- "std_err": 0.019,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
+ "score": 64,
7
+ "std_err": 1.9,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-oss-20b/workarena-l1.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
- "score": 0.385,
7
- "std_err": 0.027,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L1",
6
+ "score": 38.5,
7
+ "std_err": 2.7,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
results/GenericAgent-GPT-oss-20b/workarena-l2.json CHANGED
@@ -3,8 +3,8 @@
3
  "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
- "score": 0.026,
7
- "std_err": 0.01,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
 
3
  "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "WorkArena-L2",
6
+ "score": 2.6,
7
+ "std_err": 1.0,
8
  "benchmark_specific": "No",
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",