Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Community

Aman-J committed on 9 days ago

Commit

1eeaad6

1 Parent(s): a437028

fix decimals

Browse files

Files changed (21) hide show

results/GenericAgent-Claude-3.7-Sonnet/webarena.json +2 -2
results/GenericAgent-Claude-4-Sonnet/miniwob.json +2 -2
results/GenericAgent-Claude-4-Sonnet/workarena-l1.json +2 -2
results/GenericAgent-Claude-4-Sonnet/workarena-l2.json +2 -2
results/GenericAgent-GPT-4.1-Mini/webarena.json +2 -2
results/GenericAgent-GPT-5-mini/miniwob.json +2 -2
results/GenericAgent-GPT-5-mini/workarena-l1.json +2 -2
results/GenericAgent-GPT-5-mini/workarena-l2.json +2 -2
results/GenericAgent-GPT-5-nano/miniwob.json +2 -2
results/GenericAgent-GPT-5-nano/workarena-l1.json +2 -2
results/GenericAgent-GPT-5-nano/workarena-l2.json +2 -2
results/GenericAgent-GPT-5/miniwob.json +2 -2
results/GenericAgent-GPT-5/workarena-l1.json +2 -2
results/GenericAgent-GPT-5/workarena-l2.json +2 -2
results/GenericAgent-GPT-5/workarena-l3.json +2 -2
results/GenericAgent-GPT-oss-120b/miniwob.json +2 -2
results/GenericAgent-GPT-oss-120b/workarena-l1.json +2 -2
results/GenericAgent-GPT-oss-120b/workarena-l2.json +2 -2
results/GenericAgent-GPT-oss-20b/miniwob.json +2 -2
results/GenericAgent-GPT-oss-20b/workarena-l1.json +2 -2
results/GenericAgent-GPT-oss-20b/workarena-l2.json +2 -2

results/GenericAgent-Claude-3.7-Sonnet/webarena.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-Claude-3.7-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WebArena",
-    "score": 0.446,
-    "std_err": 0.025,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-Claude-3.7-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WebArena",
+    "score": 44.6,
+    "std_err": 2.5,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-Claude-4-Sonnet/miniwob.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-Claude-4-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
-    "score": 0.707,
-    "std_err": 0.018,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-Claude-4-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
+    "score": 70.7,
+    "std_err": 1.8,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-Claude-4-Sonnet/workarena-l1.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-Claude-4-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
-    "score": 0.633,
-    "std_err": 0.027,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-Claude-4-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
+    "score": 63.3,
+    "std_err": 2.7,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-Claude-4-Sonnet/workarena-l2.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-Claude-4-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
-    "score": 0.404,
-    "std_err": 0.032,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-Claude-4-Sonnet",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
+    "score": 40.4,
+    "std_err": 3.2,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-4.1-Mini/webarena.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-4.1-Mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WebArena",
-    "score": 0.307,
-    "std_err": 0.024,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-4.1-Mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WebArena",
+    "score": 30.7,
+    "std_err": 2.4,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5-mini/miniwob.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5-mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
-    "score": 0.71,
-    "std_err": 0.018,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5-mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
+    "score": 71,
+    "std_err": 1.8,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5-mini/workarena-l1.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5-mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
-    "score": 0.606,
-    "std_err": 0.027,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5-mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
+    "score": 60.6,
+    "std_err": 2.7,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5-mini/workarena-l2.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5-mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
-    "score": 0.477,
-    "std_err": 0.033,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5-mini",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
+    "score": 47.7,
+    "std_err": 3.3,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5-nano/miniwob.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5-nano",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
-    "score": 0.648,
-    "std_err": 0.019,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5-nano",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
+    "score": 64.8,
+    "std_err": 1.9,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5-nano/workarena-l1.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5-nano",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
-    "score": 0.406,
-    "std_err": 0.027,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5-nano",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
+    "score": 40.6,
+    "std_err": 2.7,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5-nano/workarena-l2.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5-nano",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
-    "score": 0.034,
-    "std_err": 0.012,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5-nano",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
+    "score": 3.4,
+    "std_err": 1.2,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5/miniwob.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
-    "score": 0.715,
-    "std_err": 0.018,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
+    "score": 71.5,
+    "std_err": 1.8,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5/workarena-l1.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
-    "score": 0.791,
-    "std_err": 0.022,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "No",

     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
+    "score": 79.1,
+    "std_err": 2.2,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "No",

results/GenericAgent-GPT-5/workarena-l2.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
-    "score": 0.694,
-    "std_err": 0.03,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
+    "score": 69.4,
+    "std_err": 3.0,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-5/workarena-l3.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L3",
-    "score": 0.115,
-    "std_err": 0.021,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "No",

     "agent_name": "GenericAgent-GPT-5",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L3",
+    "score": 11.5,
+    "std_err": 2.1,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "No",

results/GenericAgent-GPT-oss-120b/miniwob.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-oss-120b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
-    "score": 0.664,
-    "std_err": 0.019,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-oss-120b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
+    "score": 66.4,
+    "std_err": 1.9,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-oss-120b/workarena-l1.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-oss-120b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
-    "score": 0.509,
-    "std_err": 0.028,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-oss-120b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
+    "score": 50.9,
+    "std_err": 2.8,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-oss-120b/workarena-l2.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-oss-120b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
-    "score": 0.115,
-    "std_err": 0.021,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-oss-120b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
+    "score": 11.5,
+    "std_err": 2.1,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-oss-20b/miniwob.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-oss-20b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
-    "score": 0.64,
-    "std_err": 0.019,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-oss-20b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "MiniWoB",
+    "score": 64,
+    "std_err": 1.9,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-oss-20b/workarena-l1.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-oss-20b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
-    "score": 0.385,
-    "std_err": 0.027,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-oss-20b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L1",
+    "score": 38.5,
+    "std_err": 2.7,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

results/GenericAgent-GPT-oss-20b/workarena-l2.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "agent_name": "GenericAgent-GPT-oss-20b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
-    "score": 0.026,
-    "std_err": 0.01,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",

     "agent_name": "GenericAgent-GPT-oss-20b",
     "study_id": "2025-08-07_21-09-16",
     "benchmark": "WorkArena-L2",
+    "score": 2.6,
+    "std_err": 1.0,
     "benchmark_specific": "No",
     "benchmark_tuned": "No",
     "followed_evaluation_protocol": "Yes",