Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	
		Ruochen Zhao
		
	committed on
		
		
					Commit 
							
							·
						
						bcdb701
	
1
								Parent(s):
							
							bf46203
								
updated leaderboard
Browse files
    	
        app.py
    CHANGED
    
    | @@ -23,7 +23,7 @@ def restart_space(): | |
| 23 | 
             
                API.restart_space(repo_id="Auto-Arena/Leaderboard")
         | 
| 24 |  | 
| 25 |  | 
| 26 | 
            -
            csv_path = f"./src/results/auto-arena-llms-results- | 
| 27 | 
             
            csv_path_chinese = f"./src/results/auto-arena-llms-results-chinese-20240531.csv"
         | 
| 28 | 
             
            df_results = load_data(csv_path).sort_values(by="Rank")
         | 
| 29 | 
             
            df_results_chinese = load_data(csv_path_chinese)
         | 
|  | |
| 23 | 
             
                API.restart_space(repo_id="Auto-Arena/Leaderboard")
         | 
| 24 |  | 
| 25 |  | 
| 26 | 
            +
            csv_path = f"./src/results/auto-arena-llms-results-20241007.csv"
         | 
| 27 | 
             
            csv_path_chinese = f"./src/results/auto-arena-llms-results-chinese-20240531.csv"
         | 
| 28 | 
             
            df_results = load_data(csv_path).sort_values(by="Rank")
         | 
| 29 | 
             
            df_results_chinese = load_data(csv_path_chinese)
         | 
    	
        src/.DS_Store
    CHANGED
    
    | Binary files a/src/.DS_Store and b/src/.DS_Store differ | 
|  | 
    	
        src/results/auto-arena-llms-results-20240615.csv
    CHANGED
    
    | @@ -1,19 +1,19 @@ | |
| 1 | 
            -
            Model,Rank,MT-Bench Hard,MT-Bench,LC-AlpacaEval,openLLM,MMLU,From,Open?,Params(B),Cost,Score | 
| 2 | 
            -
            [gpt-4o-2024-05-13](https://openai.com/index/hello-gpt-4o/),1,,,57.5,,87.2,OpenAI,No,-,15,1196.881 | 
| 3 | 
            -
            [GPT-4-turbo-0409](https://platform.openai.com/docs/models/gpt-4o),2,82.6,,55,86.27,86.5,OpenAI,No,-,30,1138. | 
| 4 | 
            -
            [meta-llama/Llama-3-70b-chat-hf](https://ai.meta.com/blog/meta-llama-3/),3,41.1,,34.4,77.88,80.06,meta,Yes,70B,-,1079.705 | 
| 5 | 
            -
            [qwen2-72B-instruct](https://qwenlm.github.io/blog/qwen2/),4,48.1,9.12,,,84.2,Alibaba,Yes,72B,-,1077.809 | 
| 6 | 
            -
            [minimax-abab6.5-chat](https://platform.minimaxi.com/),5,,,,,78.7,minimax,No,-,4.2,1071.106 | 
| 7 | 
            -
            [glm-4](https://open.bigmodel.cn/trialcenter?modelCode=glm-4),6,,,,,81.5,Zhipu AI,No,-,13.8,1062.398 | 
| 8 | 
            -
            [command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus),7,33.1,,,74.62,75.7,Cohere,Yes,104B,15,1043.217 | 
| 9 | 
            -
            [claude-3-haiku-20240307](https://www.anthropic.com/api),8,41.5,9.1,,84.8,75.2,Anthropic,No,-,1.25,1028.622 | 
| 10 | 
            -
            [reka-core-20240501](https://www.reka.ai/news/reka-core-our-frontier-class-multimodal-language-model),9,,,,,83.2,Reka AI,No,-,25,1016.344 | 
| 11 | 
            -
            [Qwen1.5-72B-chat](https://huggingface.co/Qwen/Qwen1.5-72B),10,36.1,8.61,36.6,72.91,77.2,Alibaba,Yes,72B,-,1013.886 | 
| 12 | 
            -
            [SenseChat-5](https://console.sensecore.cn/nova/home),11,,,,,84.7,SenseTime,No,-,13.8,1001.696 | 
| 13 | 
            -
            [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1),12,23.4,8.3,23.7,72.71,71.4,Mistral AI,Yes,7B,-,950.769 | 
| 14 | 
            -
            [wenxin-4](https://yiyan.baidu.com/),13,,,,,,Baidu,No,-,16.6,945.233 | 
| 15 | 
            -
            [zero-one-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat),14,23.1,7,27.2,63.17,74.87,Zero One AI,Yes,34B,-,935. | 
| 16 | 
            -
            [mistral-large-2402](https://mistral.ai/news/mistral-large/),15,37.7,8.63,32.7,,81.2,Mistral AI,No,-,12,919.873 | 
| 17 | 
            -
            [GPT-3.5-Turbo-0125](https://openai.com/index/new-embedding-models-and-api-updates/),16,23.3,7.94,17.7,71.02,70,OpenAI,No,-,1.5,880.908 | 
| 18 | 
            -
            [deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat),17,,,17.8,,71.3,Deepseek AI,Yes,67B,-,832.252 | 
| 19 | 
            -
            [Llama-2-70b-chat](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf),18,11.6,6.86,14.7,62.4,63.91,Meta,Yes,70B,-,804.969 | 
|  | |
| 1 | 
            +
            Model,Rank,MT-Bench Hard,MT-Bench,LC-AlpacaEval,openLLM,MMLU,From,Open?,Params(B),Cost,Score
         | 
| 2 | 
            +
            [gpt-4o-2024-05-13](https://openai.com/index/hello-gpt-4o/),1,,,57.5,,87.2,OpenAI,No,-,15,1196.881
         | 
| 3 | 
            +
            [GPT-4-turbo-0409](https://platform.openai.com/docs/models/gpt-4o),2,82.6,,55,86.27,86.5,OpenAI,No,-,30,1138.81
         | 
| 4 | 
            +
            [meta-llama/Llama-3-70b-chat-hf](https://ai.meta.com/blog/meta-llama-3/),3,41.1,,34.4,77.88,80.06,meta,Yes,70B,-,1079.705
         | 
| 5 | 
            +
            [qwen2-72B-instruct](https://qwenlm.github.io/blog/qwen2/),4,48.1,9.12,,,84.2,Alibaba,Yes,72B,-,1077.809
         | 
| 6 | 
            +
            [minimax-abab6.5-chat](https://platform.minimaxi.com/),5,,,,,78.7,minimax,No,-,4.2,1071.106
         | 
| 7 | 
            +
            [glm-4](https://open.bigmodel.cn/trialcenter?modelCode=glm-4),6,,,,,81.5,Zhipu AI,No,-,13.8,1062.398
         | 
| 8 | 
            +
            [command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus),7,33.1,,,74.62,75.7,Cohere,Yes,104B,15,1043.217
         | 
| 9 | 
            +
            [claude-3-haiku-20240307](https://www.anthropic.com/api),8,41.5,9.1,,84.8,75.2,Anthropic,No,-,1.25,1028.622
         | 
| 10 | 
            +
            [reka-core-20240501](https://www.reka.ai/news/reka-core-our-frontier-class-multimodal-language-model),9,,,,,83.2,Reka AI,No,-,25,1016.344
         | 
| 11 | 
            +
            [Qwen1.5-72B-chat](https://huggingface.co/Qwen/Qwen1.5-72B),10,36.1,8.61,36.6,72.91,77.2,Alibaba,Yes,72B,-,1013.886
         | 
| 12 | 
            +
            [SenseChat-5](https://console.sensecore.cn/nova/home),11,,,,,84.7,SenseTime,No,-,13.8,1001.696
         | 
| 13 | 
            +
            [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1),12,23.4,8.3,23.7,72.71,71.4,Mistral AI,Yes,7B,-,950.769
         | 
| 14 | 
            +
            [wenxin-4](https://yiyan.baidu.com/),13,,,,,,Baidu,No,-,16.6,945.233
         | 
| 15 | 
            +
            [zero-one-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat),14,23.1,7,27.2,63.17,74.87,Zero One AI,Yes,34B,-,935.52
         | 
| 16 | 
            +
            [mistral-large-2402](https://mistral.ai/news/mistral-large/),15,37.7,8.63,32.7,,81.2,Mistral AI,No,-,12,919.873
         | 
| 17 | 
            +
            [GPT-3.5-Turbo-0125](https://openai.com/index/new-embedding-models-and-api-updates/),16,23.3,7.94,17.7,71.02,70,OpenAI,No,-,1.5,880.908
         | 
| 18 | 
            +
            [deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat),17,,,17.8,,71.3,Deepseek AI,Yes,67B,-,832.252
         | 
| 19 | 
            +
            [Llama-2-70b-chat](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf),18,11.6,6.86,14.7,62.4,63.91,Meta,Yes,70B,-,804.969
         | 
    	
        src/results/auto-arena-llms-results-20240624.csv
    CHANGED
    
    | @@ -1,20 +1,20 @@ | |
| 1 | 
            -
            Model,Rank,MT-Bench Hard,MT-Bench,LC-AlpacaEval,openLLM,MMLU,From,Open?,Params(B),Cost,Score | 
| 2 | 
            -
            [claude-3-5-sonnet-20240620](https://www.anthropic.com/news/claude-3-5-sonnet),1,,,57.5,,87.2,Anthropic,No,-,15,1282.192081 | 
| 3 | 
            -
            [gpt-4o-2024-05-13](https://openai.com/index/hello-gpt-4o/),2,,,57.5,,87.2,OpenAI,No,-,15,1194.520424 | 
| 4 | 
            -
            [GPT-4-turbo-0409](https://platform.openai.com/docs/models/gpt-4o),3,82.6,,55,86.27,86.5,OpenAI,No,-,30,1124.732733 | 
| 5 | 
            -
            [qwen2-72B-instruct](https://qwenlm.github.io/blog/qwen2/),4,48.1,9.12,,,84.2,Alibaba,Yes,72B,-,1109.810932 | 
| 6 | 
            -
            [meta-llama/Llama-3-70b-chat-hf](https://ai.meta.com/blog/meta-llama-3/),5,41.1,,34.4,77.88,80.06,meta,Yes,70B,-,1048.258949 | 
| 7 | 
            -
            [glm-4](https://open.bigmodel.cn/trialcenter?modelCode=glm-4),6,,,,,81.5,Zhipu AI,No,-,13.8,1038.939252 | 
| 8 | 
            -
            [minimax-abab6.5-chat](https://platform.minimaxi.com/),7,,,,,78.7,minimax,No,-,4.2,1037.480905 | 
| 9 | 
            -
            [command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus),8,33.1,,,74.62,75.7,Cohere,Yes,104B,15,1023.464325 | 
| 10 | 
            -
            [claude-3-haiku-20240307](https://www.anthropic.com/api),9,41.5,9.1,,84.8,75.2,Anthropic,No,-,1.25,1009.099768 | 
| 11 | 
            -
            [Qwen1.5-72B-chat](https://huggingface.co/Qwen/Qwen1.5-72B),10,36.1,8.61,36.6,72.91,77.2,Alibaba,Yes,72B,-,994.660656 | 
| 12 | 
            -
            [reka-core-20240501](https://www.reka.ai/news/reka-core-our-frontier-class-multimodal-language-model),11,,,,,83.2,Reka AI,No,-,25,994.535244 | 
| 13 | 
            -
            [SenseChat-5](https://console.sensecore.cn/nova/home),12,,,,,84.7,SenseTime,No,-,13.8,993.937723 | 
| 14 | 
            -
            [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1),13,23.4,8.3,23.7,72.71,71.4,Mistral AI,Yes,7B,-,935.679463 | 
| 15 | 
            -
            [wenxin-4](https://yiyan.baidu.com/),14,,,,,,Baidu,No,-,16.6,927. | 
| 16 | 
            -
            [zero-one-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat),15,23.1,7,27.2,63.17,74.87,Zero One AI,Yes,34B,-,917.300671 | 
| 17 | 
            -
            [mistral-large-2402](https://mistral.ai/news/mistral-large/),16,37.7,8.63,32.7,,81.2,Mistral AI,No,-,12,900.837414 | 
| 18 | 
            -
            [GPT-3.5-Turbo-0125](https://openai.com/index/new-embedding-models-and-api-updates/),17,23.3,7.94,17.7,71.02,70,OpenAI,No,-,1.5,863.193661 | 
| 19 | 
            -
            [deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat),18,,,17.8,,71.3,Deepseek AI,Yes,67B,-,814.974318 | 
| 20 | 
            -
            [Llama-2-70b-chat](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf),19,11.6,6.86,14.7,62.4,63.91,Meta,Yes,70B,-,788.694112 | 
|  | |
| 1 | 
            +
            Model,Rank,MT-Bench Hard,MT-Bench,LC-AlpacaEval,openLLM,MMLU,From,Open?,Params(B),Cost,Score
         | 
| 2 | 
            +
            [claude-3-5-sonnet-20240620](https://www.anthropic.com/news/claude-3-5-sonnet),1,,,57.5,,87.2,Anthropic,No,-,15,1282.192081
         | 
| 3 | 
            +
            [gpt-4o-2024-05-13](https://openai.com/index/hello-gpt-4o/),2,,,57.5,,87.2,OpenAI,No,-,15,1194.520424
         | 
| 4 | 
            +
            [GPT-4-turbo-0409](https://platform.openai.com/docs/models/gpt-4o),3,82.6,,55,86.27,86.5,OpenAI,No,-,30,1124.732733
         | 
| 5 | 
            +
            [qwen2-72B-instruct](https://qwenlm.github.io/blog/qwen2/),4,48.1,9.12,,,84.2,Alibaba,Yes,72B,-,1109.810932
         | 
| 6 | 
            +
            [meta-llama/Llama-3-70b-chat-hf](https://ai.meta.com/blog/meta-llama-3/),5,41.1,,34.4,77.88,80.06,meta,Yes,70B,-,1048.258949
         | 
| 7 | 
            +
            [glm-4](https://open.bigmodel.cn/trialcenter?modelCode=glm-4),6,,,,,81.5,Zhipu AI,No,-,13.8,1038.939252
         | 
| 8 | 
            +
            [minimax-abab6.5-chat](https://platform.minimaxi.com/),7,,,,,78.7,minimax,No,-,4.2,1037.480905
         | 
| 9 | 
            +
            [command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus),8,33.1,,,74.62,75.7,Cohere,Yes,104B,15,1023.464325
         | 
| 10 | 
            +
            [claude-3-haiku-20240307](https://www.anthropic.com/api),9,41.5,9.1,,84.8,75.2,Anthropic,No,-,1.25,1009.099768
         | 
| 11 | 
            +
            [Qwen1.5-72B-chat](https://huggingface.co/Qwen/Qwen1.5-72B),10,36.1,8.61,36.6,72.91,77.2,Alibaba,Yes,72B,-,994.660656
         | 
| 12 | 
            +
            [reka-core-20240501](https://www.reka.ai/news/reka-core-our-frontier-class-multimodal-language-model),11,,,,,83.2,Reka AI,No,-,25,994.535244
         | 
| 13 | 
            +
            [SenseChat-5](https://console.sensecore.cn/nova/home),12,,,,,84.7,SenseTime,No,-,13.8,993.937723
         | 
| 14 | 
            +
            [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1),13,23.4,8.3,23.7,72.71,71.4,Mistral AI,Yes,7B,-,935.679463
         | 
| 15 | 
            +
            [wenxin-4](https://yiyan.baidu.com/),14,,,,,,Baidu,No,-,16.6,927.68737
         | 
| 16 | 
            +
            [zero-one-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat),15,23.1,7,27.2,63.17,74.87,Zero One AI,Yes,34B,-,917.300671
         | 
| 17 | 
            +
            [mistral-large-2402](https://mistral.ai/news/mistral-large/),16,37.7,8.63,32.7,,81.2,Mistral AI,No,-,12,900.837414
         | 
| 18 | 
            +
            [GPT-3.5-Turbo-0125](https://openai.com/index/new-embedding-models-and-api-updates/),17,23.3,7.94,17.7,71.02,70,OpenAI,No,-,1.5,863.193661
         | 
| 19 | 
            +
            [deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat),18,,,17.8,,71.3,Deepseek AI,Yes,67B,-,814.974318
         | 
| 20 | 
            +
            [Llama-2-70b-chat](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf),19,11.6,6.86,14.7,62.4,63.91,Meta,Yes,70B,-,788.694112
         | 
    	
        src/results/auto-arena-llms-results-20241007.csv
    ADDED
    
    | @@ -0,0 +1,16 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            Model,Rank,MT-Bench Hard,MT-Bench,LC-AlpacaEval,openLLM,MMLU,From,Open?,Params(B),Cost,Score
         | 
| 2 | 
            +
            [claude-3-5-sonnet-20240620](https://www.anthropic.com/news/claude-3-5-sonnet),1,,,57.5,,87.2,Anthropic,No,-,15,1181.774515
         | 
| 3 | 
            +
            [gpt-4o-2024-05-13](https://openai.com/index/hello-gpt-4o/),2,,,57.5,,87.2,OpenAI,No,-,15,1130.486708
         | 
| 4 | 
            +
            [GPT-4-turbo-0409](https://platform.openai.com/docs/models/gpt-4o),3,82.6,,55,,86.5,OpenAI,No,-,30,1097.746895
         | 
| 5 | 
            +
            [command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus),4,33.1,,,,75.7,Cohere,Yes,104B,15,1042.974783
         | 
| 6 | 
            +
            [meta-llama/Llama-3-70b-chat-hf](https://ai.meta.com/blog/meta-llama-3/),5,41.1,,34.4,36.18,80.06,meta,Yes,70B,-,1033.389278
         | 
| 7 | 
            +
            [gemini-1.5-flash-exp-0827](https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash),6,,,,,77.9,Google,No,-,0.3,1028.810421
         | 
| 8 | 
            +
            [claude-3-haiku-20240307](https://www.anthropic.com/api),7,41.5,9.1,,,75.2,Anthropic,No,-,1.25,1021.014418
         | 
| 9 | 
            +
            [qwen2-72B-instruct](https://qwenlm.github.io/blog/qwen2/),8,48.1,9.12,,42.49,84.2,Alibaba,Yes,72B,-,1017.866425
         | 
| 10 | 
            +
            [google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it),9,57.51,,,32.31,,Google,Yes,27B,-,1015.595207
         | 
| 11 | 
            +
            [Qwen1.5-72B-chat](https://huggingface.co/Qwen/Qwen1.5-72B),10,36.1,8.61,36.6,,77.2,Alibaba,Yes,72B,-,1011.735171
         | 
| 12 | 
            +
            [zero-one-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat),11,23.1,7,27.2,23.9,74.87,Zero One AI,Yes,34B,-,949.9850023
         | 
| 13 | 
            +
            [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1),12,23.4,8.3,23.7,24.35,71.4,Mistral AI,Yes,7B,-,939.4805565
         | 
| 14 | 
            +
            [GPT-3.5-Turbo-0125](https://openai.com/index/new-embedding-models-and-api-updates/),13,23.3,7.94,17.7,,70,OpenAI,No,-,1.5,889.8482309
         | 
| 15 | 
            +
            [deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat),14,,,17.8,26.87,71.3,Deepseek AI,Yes,67B,-,846.4850997
         | 
| 16 | 
            +
            [Llama-2-70b-chat](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf),15,11.6,6.86,14.7,,63.91,Meta,Yes,70B,-,792.8072912
         | 
