Aman-J committed on
Commit
a437028
·
1 Parent(s): 6218e4b
results/GenericAgent-Claude-3.7-Sonnet/webarena.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-claude-3-7-sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Webarena",
6
  "score": 0.446,
7
  "std_err": 0.025,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.7-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WebArena",
6
  "score": 0.446,
7
  "std_err": 0.025,
8
  "benchmark_specific": "No",
results/GenericAgent-Claude-4-Sonnet/miniwob.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-claude-sonnet-4",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Miniwob",
6
  "score": 0.707,
7
  "std_err": 0.018,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "MiniWoB",
6
  "score": 0.707,
7
  "std_err": 0.018,
8
  "benchmark_specific": "No",
results/GenericAgent-Claude-4-Sonnet/workarena-l1.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-claude-sonnet-4-20250514",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L1",
6
  "score": 0.633,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L1",
6
  "score": 0.633,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
results/GenericAgent-Claude-4-Sonnet/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-claude-sonnet-4-20250514",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L2",
6
  "score": 0.404,
7
  "std_err": 0.032,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-4-Sonnet",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L2",
6
  "score": 0.404,
7
  "std_err": 0.032,
8
  "benchmark_specific": "No",
results/{GenericAgent-GPT-4_1-Mini → GenericAgent-GPT-4.1-Mini}/README.md RENAMED
File without changes
results/{GenericAgent-GPT-4_1-Mini → GenericAgent-GPT-4.1-Mini}/webarena.json RENAMED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-4.1-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "webarena",
6
  "score": 0.307,
7
  "std_err": 0.024,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4.1-Mini",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WebArena",
6
  "score": 0.307,
7
  "std_err": 0.024,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-5-mini/miniwob.json CHANGED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.71,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.71,
results/GenericAgent-GPT-5-mini/workarena-l1.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L1",
6
  "score": 0.606,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L1",
6
  "score": 0.606,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-5-mini/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L2",
6
  "score": 0.477,
7
  "std_err": 0.033,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5-mini",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L2",
6
  "score": 0.477,
7
  "std_err": 0.033,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-5-nano/miniwob.json CHANGED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.648,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.648,
results/GenericAgent-GPT-5-nano/workarena-l1.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L1",
6
  "score": 0.406,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L1",
6
  "score": 0.406,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-5-nano/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L2",
6
  "score": 0.034,
7
  "std_err": 0.012,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5-nano",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L2",
6
  "score": 0.034,
7
  "std_err": 0.012,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-5/miniwob.json CHANGED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-2025-08-07",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.715,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.715,
results/GenericAgent-GPT-5/workarena-l1.json CHANGED
@@ -1,22 +1,8 @@
1
  [
2
- {
3
- "agent_name": "GenericAgent-gpt-5-2025-08-07",
4
- "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L1",
6
- "score": 0.661,
7
- "std_err": 0.026,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2025-08-07 21:09:16"
15
- },
16
  {
17
- "agent_name": "GenericAgent-gpt-5-2025-08-07",
18
  "study_id": "2025-08-07_21-09-16",
19
- "benchmark": "Workarena-L1",
20
  "score": 0.791,
21
  "std_err": 0.022,
22
  "benchmark_specific": "No",
@@ -26,6 +12,5 @@
26
  "comments": "Increased max_steps from 15 to 30",
27
  "original_or_reproduced": "Original",
28
  "date_time": "2025-08-07 21:09:16"
29
- }
30
-
31
- ]
 
1
  [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  {
3
+ "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L1",
6
  "score": 0.791,
7
  "std_err": 0.022,
8
  "benchmark_specific": "No",
 
12
  "comments": "Increased max_steps from 15 to 30",
13
  "original_or_reproduced": "Original",
14
  "date_time": "2025-08-07 21:09:16"
15
+ }
16
+ ]
 
results/GenericAgent-GPT-5/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-2025-08-07",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L2",
6
  "score": 0.694,
7
  "std_err": 0.03,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L2",
6
  "score": 0.694,
7
  "std_err": 0.03,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-5/workarena-l3.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-5-2025-08-07",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L3",
6
  "score": 0.115,
7
  "std_err": 0.021,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-5",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L3",
6
  "score": 0.115,
7
  "std_err": 0.021,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-oss-120b/miniwob.json CHANGED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-openai_gpt-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.664,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.664,
results/GenericAgent-GPT-oss-120b/workarena-l1.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-openai_gpt-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L1",
6
  "score": 0.509,
7
  "std_err": 0.028,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L1",
6
  "score": 0.509,
7
  "std_err": 0.028,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-oss-120b/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-openai_gpt-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L2",
6
  "score": 0.115,
7
  "std_err": 0.021,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-oss-120b",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L2",
6
  "score": 0.115,
7
  "std_err": 0.021,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-oss-20b/miniwob.json CHANGED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-openai_gpt-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.64,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
  "benchmark": "MiniWoB",
6
  "score": 0.64,
results/GenericAgent-GPT-oss-20b/workarena-l1.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L1",
6
  "score": 0.385,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L1",
6
  "score": 0.385,
7
  "std_err": 0.027,
8
  "benchmark_specific": "No",
results/GenericAgent-GPT-oss-20b/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-gpt-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
- "benchmark": "Workarena-L2",
6
  "score": 0.026,
7
  "std_err": 0.01,
8
  "benchmark_specific": "No",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-oss-20b",
4
  "study_id": "2025-08-07_21-09-16",
5
+ "benchmark": "WorkArena-L2",
6
  "score": 0.026,
7
  "std_err": 0.01,
8
  "benchmark_specific": "No",