Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	rename the github link
Browse files
- ZeroEval-main/result_dirs/zebra-grid.summary.json +44 -0
- _about_us.md +1 -1
- _header.md +1 -1
- app.py +2 -2
- constants.py +1 -1
- data_utils.py +1 -1
- update_data.sh +3 -3
    	
        ZeroEval-main/result_dirs/zebra-grid.summary.json
    CHANGED
    
    | @@ -175,6 +175,17 @@ | |
| 175 | 
             
                "Total Puzzles": 1000,
         | 
| 176 | 
             
                "Reason Lens": "855.72"
         | 
| 177 | 
             
              },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 178 | 
             
              {
         | 
| 179 | 
             
                "Model": "gpt-4-turbo-2024-04-09",
         | 
| 180 | 
             
                "Mode": "sampling",
         | 
| @@ -186,6 +197,17 @@ | |
| 186 | 
             
                "Total Puzzles": 1000,
         | 
| 187 | 
             
                "Reason Lens": "1165.90"
         | 
| 188 | 
             
              },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 189 | 
             
              {
         | 
| 190 | 
             
                "Model": "gemini-1.5-pro-exp-0801",
         | 
| 191 | 
             
                "Mode": "greedy",
         | 
| @@ -472,6 +494,17 @@ | |
| 472 | 
             
                "Total Puzzles": 1000,
         | 
| 473 | 
             
                "Reason Lens": "849.84"
         | 
| 474 | 
             
              },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 475 | 
             
              {
         | 
| 476 | 
             
                "Model": "Meta-Llama-3-8B-Instruct",
         | 
| 477 | 
             
                "Mode": "greedy",
         | 
| @@ -604,6 +637,17 @@ | |
| 604 | 
             
                "Total Puzzles": 1000,
         | 
| 605 | 
             
                "Reason Lens": "718.43"
         | 
| 606 | 
             
              },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 607 | 
             
              {
         | 
| 608 | 
             
                "Model": "gemma-2-2b-it",
         | 
| 609 | 
             
                "Mode": "greedy",
         | 
|  | |
| 175 | 
             
                "Total Puzzles": 1000,
         | 
| 176 | 
             
                "Reason Lens": "855.72"
         | 
| 177 | 
             
              },
         | 
| 178 | 
            +
              {
         | 
| 179 | 
            +
                "Model": "Qwen2.5-72B-Instruct",
         | 
| 180 | 
            +
                "Mode": "greedy",
         | 
| 181 | 
            +
                "Puzzle Acc": "26.60",
         | 
| 182 | 
            +
                "Cell Acc": "40.92",
         | 
| 183 | 
            +
                "No answer": "11.90",
         | 
| 184 | 
            +
                "Easy Puzzle Acc": "76.43",
         | 
| 185 | 
            +
                "Hard Puzzle Acc": "7.22",
         | 
| 186 | 
            +
                "Total Puzzles": 1000,
         | 
| 187 | 
            +
                "Reason Lens": "1795.90"
         | 
| 188 | 
            +
              },
         | 
| 189 | 
             
              {
         | 
| 190 | 
             
                "Model": "gpt-4-turbo-2024-04-09",
         | 
| 191 | 
             
                "Mode": "sampling",
         | 
|  | |
| 197 | 
             
                "Total Puzzles": 1000,
         | 
| 198 | 
             
                "Reason Lens": "1165.90"
         | 
| 199 | 
             
              },
         | 
| 200 | 
            +
              {
         | 
| 201 | 
            +
                "Model": "Qwen2.5-32B-Instruct",
         | 
| 202 | 
            +
                "Mode": "greedy",
         | 
| 203 | 
            +
                "Puzzle Acc": "26.10",
         | 
| 204 | 
            +
                "Cell Acc": "43.39",
         | 
| 205 | 
            +
                "No answer": "6.30",
         | 
| 206 | 
            +
                "Easy Puzzle Acc": "77.50",
         | 
| 207 | 
            +
                "Hard Puzzle Acc": "6.11",
         | 
| 208 | 
            +
                "Total Puzzles": 1000,
         | 
| 209 | 
            +
                "Reason Lens": "1333.07"
         | 
| 210 | 
            +
              },
         | 
| 211 | 
             
              {
         | 
| 212 | 
             
                "Model": "gemini-1.5-pro-exp-0801",
         | 
| 213 | 
             
                "Mode": "greedy",
         | 
|  | |
| 494 | 
             
                "Total Puzzles": 1000,
         | 
| 495 | 
             
                "Reason Lens": "849.84"
         | 
| 496 | 
             
              },
         | 
| 497 | 
            +
              {
         | 
| 498 | 
            +
                "Model": "Qwen2.5-7B-Instruct",
         | 
| 499 | 
            +
                "Mode": "greedy",
         | 
| 500 | 
            +
                "Puzzle Acc": "12.00",
         | 
| 501 | 
            +
                "Cell Acc": "30.67",
         | 
| 502 | 
            +
                "No answer": "9.50",
         | 
| 503 | 
            +
                "Easy Puzzle Acc": "38.93",
         | 
| 504 | 
            +
                "Hard Puzzle Acc": "1.53",
         | 
| 505 | 
            +
                "Total Puzzles": 1000,
         | 
| 506 | 
            +
                "Reason Lens": "850.93"
         | 
| 507 | 
            +
              },
         | 
| 508 | 
             
              {
         | 
| 509 | 
             
                "Model": "Meta-Llama-3-8B-Instruct",
         | 
| 510 | 
             
                "Mode": "greedy",
         | 
|  | |
| 637 | 
             
                "Total Puzzles": 1000,
         | 
| 638 | 
             
                "Reason Lens": "718.43"
         | 
| 639 | 
             
              },
         | 
| 640 | 
            +
              {
         | 
| 641 | 
            +
                "Model": "Qwen2.5-3B-Instruct",
         | 
| 642 | 
            +
                "Mode": "greedy",
         | 
| 643 | 
            +
                "Puzzle Acc": "4.80",
         | 
| 644 | 
            +
                "Cell Acc": "11.44",
         | 
| 645 | 
            +
                "No answer": "56.70",
         | 
| 646 | 
            +
                "Easy Puzzle Acc": "17.14",
         | 
| 647 | 
            +
                "Hard Puzzle Acc": "0.00",
         | 
| 648 | 
            +
                "Total Puzzles": 1000,
         | 
| 649 | 
            +
                "Reason Lens": "906.58"
         | 
| 650 | 
            +
              },
         | 
| 651 | 
             
              {
         | 
| 652 | 
             
                "Model": "gemma-2-2b-it",
         | 
| 653 | 
             
                "Mode": "greedy",
         | 
    	
        _about_us.md
    CHANGED
    
    | @@ -10,6 +10,6 @@ We are from [AllenAI](https://allenai.org/) (AI2), a non-profit research organiz | |
| 10 | 
             
            ### Contact
         | 
| 11 |  | 
| 12 | 
             
            Please contact us in the following ways:
         | 
| 13 | 
            -
            - Github Issues/PRs: [https://github.com/ | 
| 14 | 
             
            - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
         | 
| 15 |  | 
|  | |
| 10 | 
             
            ### Contact
         | 
| 11 |  | 
| 12 | 
             
            Please contact us in the following ways:
         | 
| 13 | 
            +
            - Github Issues/PRs: [https://github.com/WildEval/ZeroEval/](https://github.com/WildEval/ZeroEval/) 
         | 
| 14 | 
             
            - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
         | 
| 15 |  | 
    	
        _header.md
    CHANGED
    
    | @@ -2,5 +2,5 @@ | |
| 2 |  | 
| 3 | 
             
            # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
         | 
| 4 | 
             
            <!-- [📃 FnF Paper](https://arxiv.org/abs/2305.18654) |  -->
         | 
| 5 | 
            -
            [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/ | 
| 6 |  | 
|  | |
| 2 |  | 
| 3 | 
             
            # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
         | 
| 4 | 
             
            <!-- [📃 FnF Paper](https://arxiv.org/abs/2305.18654) |  -->
         | 
| 5 | 
            +
            [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
         | 
| 6 |  | 
    	
        app.py
    CHANGED
    
    | @@ -135,8 +135,8 @@ def _tab_explore(): | |
| 135 |  | 
| 136 | 
             
            def _tab_submit():
         | 
| 137 | 
             
                markdown_text = """
         | 
| 138 | 
            -
                Please create an issue on our [Github](https://github.com/ | 
| 139 | 
            -
                If you would like to do local testing, please read our code [here](https://github.com/ | 
| 140 | 
             
                and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
         | 
| 141 | 
             
                """
         | 
| 142 |  | 
|  | |
| 135 |  | 
| 136 | 
             
            def _tab_submit():
         | 
| 137 | 
             
                markdown_text = """
         | 
| 138 | 
            +
                Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
         | 
| 139 | 
            +
                If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py) 
         | 
| 140 | 
             
                and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
         | 
| 141 | 
             
                """
         | 
| 142 |  | 
    	
        constants.py
    CHANGED
    
    | @@ -4,7 +4,7 @@ from collections import OrderedDict | |
| 4 | 
             
            DEFAULT_K = "∞"
         | 
| 5 | 
             
            # DEFAULT_K = "1500"
         | 
| 6 |  | 
| 7 | 
            -
            banner_url = "https://github.com/ | 
| 8 | 
             
            BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
         | 
| 9 |  | 
| 10 | 
             
            # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
         | 
|  | |
| 4 | 
             
            DEFAULT_K = "∞"
         | 
| 5 | 
             
            # DEFAULT_K = "1500"
         | 
| 6 |  | 
| 7 | 
            +
            banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
         | 
| 8 | 
             
            BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
         | 
| 9 |  | 
| 10 | 
             
            # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
         | 
    	
        data_utils.py
    CHANGED
    
    | @@ -49,7 +49,7 @@ def load_all_data(): | |
| 49 | 
             
                    model_summary = json.load(f)
         | 
| 50 | 
             
                model_names = [model["Model"] for model in model_summary]
         | 
| 51 | 
             
                for model_name in model_names:
         | 
| 52 | 
            -
                    download_url = f"https://raw.githubusercontent.com/ | 
| 53 | 
             
                    output_file = os.path.join(result_dir, f"{model_name}.json")
         | 
| 54 | 
             
                    # mkdir -p result_dir if not exists 
         | 
| 55 | 
             
                    os.makedirs(result_dir, exist_ok=True)
         | 
|  | |
| 49 | 
             
                    model_summary = json.load(f)
         | 
| 50 | 
             
                model_names = [model["Model"] for model in model_summary]
         | 
| 51 | 
             
                for model_name in model_names:
         | 
| 52 | 
            +
                    download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
         | 
| 53 | 
             
                    output_file = os.path.join(result_dir, f"{model_name}.json")
         | 
| 54 | 
             
                    # mkdir -p result_dir if not exists 
         | 
| 55 | 
             
                    os.makedirs(result_dir, exist_ok=True)
         | 
    	
        update_data.sh
    CHANGED
    
    | @@ -1,5 +1,5 @@ | |
| 1 | 
            -
            # download the file from https://raw.githubusercontent.com/ | 
| 2 | 
             
            # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
         | 
| 3 | 
             
            mkdir -p ZeroEval-main/result_dirs/zebra-grid/
         | 
| 4 | 
            -
            wget https://raw.githubusercontent.com/ | 
| 5 | 
            -
            wget https://raw.githubusercontent.com/ | 
|  | |
| 1 | 
            +
            # download the file from https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json
         | 
| 2 | 
             
            # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
         | 
| 3 | 
             
            mkdir -p ZeroEval-main/result_dirs/zebra-grid/
         | 
| 4 | 
            +
            wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
         | 
| 5 | 
            +
            wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
         | 

