Spaces:
Running
Running
| [ | |
| { | |
| "model": "GPT-4o", | |
| "Average": 70.15, | |
| "MMLU": 70.09, | |
| "ARC":86.31, | |
| "WinoGrande":72.22, | |
| "PIQA":60.34, | |
| "CommonsenseQA":70.28, | |
| "Race":67.87 , | |
| "MedMCQA":57.85 , | |
| "OpenkookQA":67.21 | |
| }, | |
| { | |
| "model": "GPT-4-1106-preview", | |
| "Average": 65.93, | |
| "MMLU": 74.77, | |
| "ARC":82.68, | |
| "WinoGrande": 66.22, | |
| "PIQA": 61.64, | |
| "CommonsenseQA": 62.96, | |
| "Race": 67.05, | |
| "MedMCQA": 51.81, | |
| "OpenkookQA": 60.29 | |
| }, | |
| { | |
| "model": "Claude-3 Opus", | |
| "Average": 62.53, | |
| "MMLU": 70.23, | |
| "ARC":75.47, | |
| "WinoGrande": 63.54, | |
| "PIQA": 59.05, | |
| "CommonsenseQA": 63.66, | |
| "Race": 66.22, | |
| "MedMCQA": 49.14, | |
| "OpenkookQA": 52.95 | |
| }, | |
| { | |
| "model": "Mistral Large", | |
| "Average": 60.48, | |
| "MMLU": 68.76, | |
| "ARC":72.32, | |
| "WinoGrande": 56.83, | |
| "PIQA": 61.21, | |
| "CommonsenseQA": 55.35, | |
| "Race": 70.17, | |
| "MedMCQA": 43.44, | |
| "OpenkookQA": 58.66 | |
| }, | |
| { | |
| "model": "GPT-3.5", | |
| "Average": 60.32, | |
| "MMLU": 65.38, | |
| "ARC":78.24, | |
| "WinoGrande": 64.56, | |
| "PIQA": 54.89, | |
| "CommonsenseQA": 67.89, | |
| "Race": 60.11, | |
| "MedMCQA": 41.42, | |
| "OpenkookQA": 49.90 | |
| }, | |
| { | |
| "model": "Gemini 1.0 Pro", | |
| "Average": 54.06, | |
| "MMLU": 56.04, | |
| "ARC":72.35, | |
| "WinoGrande": 56.35, | |
| "PIQA": 47.70, | |
| "CommonsenseQA": 50.56, | |
| "Race": 61.02, | |
| "MedMCQA": 35.89, | |
| "OpenkookQA": 52.55 | |
| }, | |
| { | |
| "model": "Llama3-70b-instruct", | |
| "Average": 52.92, | |
| "MMLU": 59.67, | |
| "ARC":67.09, | |
| "WinoGrande": 57.14, | |
| "PIQA": 43.10, | |
| "CommonsenseQA": 55.49, | |
| "Race": 58.21, | |
| "MedMCQA": 41.67, | |
| "OpenkookQA": 40.94 | |
| } | |
| ] |