SondosMB committed on
Commit a153b62 · verified · 1 Parent(s): 5d7d689

Update big.json

Files changed (1): big.json +65 -45
big.json CHANGED
@@ -1,68 +1,88 @@
  [
  {
  "model": "GPT-4",
- "Average": 65.94,
- "MMLU": 74.8,
- "WinoGrande": 66.2,
- "PiQA": 61.6,
- "CommonsenseQA": 63.0,
- "Race": 67.0,
- "MedMCQA": 51.8,
- "OpenkookQA": 60.3
  },
  {
  "model": "Claude-3 Opus",
- "Average": 62.64,
- "MMLU": 70.4,
- "WinoGrande": 63.5,
- "PiQA": 59.1,
- "CommonsenseQA": 63.7,
- "Race": 66.2,
- "MedMCQA": 49.1,
- "OpenkookQA": 54.0
  },
  {
  "model": "Mistral Large",
- "Average": 61.45,
- "MMLU": 67.8,
- "WinoGrande": 56.8,
- "PiQA": 61.2,
- "CommonsenseQA": 55.4,
- "Race": 70.1,
- "MedMCQA": 43.4,
- "OpenkookQA": 58.7
  },
  {
  "model": "GPT-3.5",
- "Average": 59.06,
- "MMLU": 65.4,
- "WinoGrande": 54.6,
- "PiQA": 54.9,
- "CommonsenseQA": 67.9,
- "Race": 60.1,
- "MedMCQA": 41.4,
- "OpenkookQA": 49.9
  },
  {
- "model": "Gemini Pro",
- "Average": 54.45,
- "MMLU": 57.7,
- "WinoGrande": 56.4,
- "PiQA": 47.7,
- "CommonsenseQA": 50.6,
- "Race": 61.0,
- "MedMCQA": 37.5,
- "OpenkookQA": 52.5
  },
  {
  "model": "Llama3-70b-instruct",
- "Average": 54.06,
- "MMLU": 64.67,
  "WinoGrande": 57.14,
- "PiQA": 43.1,
  "CommonsenseQA": 55.49,
  "Race": 58.21,
  "MedMCQA": 41.67,
- "OpenkookQA": 41.93
  }
  ]
 
  [
+
+
  {
+ "model": "GPT-4o",
+ "Average": 70.15,
+ "MMLU": 70.09,
+ "ARC": 86.33,
+ "WinoGrande": 72.22,
+ "PiQA": 60.34,
+ "CommonsenseQA": 70.28,
+ "Race": 67.87,
+ "MedMCQA": 57.85,
+ "OpenkookQA": 67.21
+ },
+ {
  "model": "GPT-4",
+ "Average": 65.93,
+ "MMLU": 74.77,
+ "ARC": 82.68,
+ "WinoGrande": 66.22,
+ "PiQA": 61.64,
+ "CommonsenseQA": 62.96,
+ "Race": 67.05,
+ "MedMCQA": 51.81,
+ "OpenkookQA": 60.29
  },
  {
  "model": "Claude-3 Opus",
+ "Average": 62.68,
+ "MMLU": 70.23,
+ "ARC": 76.62,
+ "WinoGrande": 63.54,
+ "PiQA": 59.05,
+ "CommonsenseQA": 63.66,
+ "Race": 66.22,
+ "MedMCQA": 49.14,
+ "OpenkookQA": 52.95
  },
  {
  "model": "Mistral Large",
+ "Average": 60.48,
+ "MMLU": 68.76,
+ "ARC": 72.32,
+ "WinoGrande": 56.83,
+ "PiQA": 61.21,
+ "CommonsenseQA": 55.35,
+ "Race": 70.17,
+ "MedMCQA": 43.44,
+ "OpenkookQA": 58.66
  },
  {
  "model": "GPT-3.5",
+ "Average": 60.30,
+ "MMLU": 65.38,
+ "ARC": 78.24,
+ "WinoGrande": 64.56,
+ "PiQA": 54.89,
+ "CommonsenseQA": 67.89,
+ "Race": 60.11,
+ "MedMCQA": 41.42,
+ "OpenkookQA": 49.90
  },
  {
+ "model": "Gemini 1.0 Pro",
+ "Average": 54.04,
+ "MMLU": 56.04,
+ "ARC": 72.23,
+ "WinoGrande": 56.35,
+ "PiQA": 47.70,
+ "CommonsenseQA": 50.56,
+ "Race": 61.02,
+ "MedMCQA": 35.89,
+ "OpenkookQA": 52.55
  },
  {
  "model": "Llama3-70b-instruct",
+ "Average": 52.92,
+ "MMLU": 59.67,
+ "ARC": 67.09,
  "WinoGrande": 57.14,
+ "PiQA": 43.10,
  "CommonsenseQA": 55.49,
  "Race": 58.21,
  "MedMCQA": 41.67,
+ "OpenkookQA": 40.94
  }
  ]
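Because the commit edits the raw JSON by hand, a quick parse-and-check script catches the slips this kind of diff makes easy (a missing comma, a colon caught inside a key). The sketch below is an illustration, not part of the commit: it assumes big.json is read from the current directory and that every entry is expected to carry the same set of fields ("model" plus the score columns).

```python
import json

# Parse the updated leaderboard file; json.load() raises on any syntax error,
# e.g. a missing comma or a malformed key. (Path is an assumption.)
with open("big.json", encoding="utf-8") as f:
    leaderboard = json.load(f)

expected_keys = set(leaderboard[0])
for entry in leaderboard:
    # Each entry should expose the same fields as the first one.
    assert set(entry) == expected_keys, f'unexpected keys in {entry.get("model")}'
    # Every field other than "model" should be a numeric score.
    for key, value in entry.items():
        if key != "model":
            assert isinstance(value, (int, float)), f"{key} is not numeric"

# Print the models ordered by the reported "Average" column, highest first.
for entry in sorted(leaderboard, key=lambda e: e["Average"], reverse=True):
    print(f'{entry["model"]:<22} {entry["Average"]:>6.2f}')
```

Running it against the new version of the file should list GPT-4o first (Average 70.15) and Llama3-70b-instruct last (52.92), matching the diff above.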