Spaces:
				
			
			
	
			
			
					
		Running
		
			on 
			
			CPU Upgrade
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
			on 
			
			CPU Upgrade
	Commit 
							
							·
						
						439afd4
	
1
								Parent(s):
							
							7a2430c
								
Fix model eval links and remove huggingface icon from Leaderboard name
Browse files
- src/display/about.py +8 -7
- src/display/formatting.py +18 -3
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +6 -1
- src/populate.py +3 -3
    	
        src/display/about.py
    CHANGED
    
    | @@ -4,21 +4,22 @@ from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEAD | |
| 4 |  | 
| 5 | 
             
            LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
         | 
| 6 |  | 
| 7 | 
            -
            TITLE = F"""<h1 align="center" id="space-title" | 
| 8 |  | 
| 9 | 
             
            INTRODUCTION_TEXT = f"""
         | 
| 10 | 
            -
            π The  | 
| 11 |  | 
| 12 | 
             
            This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
         | 
| 13 |  | 
| 14 | 
            -
             | 
| 15 | 
             
            The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
         | 
| 16 | 
             
            """
         | 
| 17 | 
            -
             | 
| 18 | 
             
            TASKS_LIST= ""
         | 
| 19 | 
             
            for task in Tasks:
         | 
| 20 | 
             
                  task = task.value
         | 
| 21 | 
             
                  TASKS_LIST += f'- <a href="{task.link}" target="_blank">  {task.col_name} </a> ({task.few_shot}-shot) - {task.description}\n'
         | 
|  | |
| 22 |  | 
| 23 | 
             
            TASKS_PARAMETERS = ""
         | 
| 24 | 
             
            for task in Tasks:
         | 
| @@ -33,7 +34,7 @@ With the plethora of large language models (LLMs) and chatbots being released we | |
| 33 |  | 
| 34 | 
             
            ## How it works
         | 
| 35 |  | 
| 36 | 
            -
            π We evaluate models on  | 
| 37 |  | 
| 38 | 
             
            {TASKS_LIST}
         | 
| 39 |  | 
| @@ -133,9 +134,9 @@ I have an issue about accessing the leaderboard through the Gradio API | |
| 133 |  | 
| 134 |  | 
| 135 | 
             
            EVALUATION_QUEUE_TEXT = f"""
         | 
| 136 | 
            -
            # Evaluation Queue for the  | 
| 137 |  | 
| 138 | 
            -
            Models added here will be automatically evaluated on  | 
| 139 |  | 
| 140 | 
             
            ## First steps before submitting a model
         | 
| 141 |  | 
|  | |
| 4 |  | 
| 5 | 
             
            LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
         | 
| 6 |  | 
| 7 | 
            +
            TITLE = F"""<h1 align="center" id="space-title">π {LEADERBOARD_NAME}</h1>"""
         | 
| 8 |  | 
| 9 | 
             
            INTRODUCTION_TEXT = f"""
         | 
| 10 | 
            +
            π The π {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots.  
         | 
| 11 |  | 
| 12 | 
             
            This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
         | 
| 13 |  | 
| 14 | 
            +
            Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
         | 
| 15 | 
             
            The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
         | 
| 16 | 
             
            """
         | 
| 17 | 
            +
            task_count = 0
         | 
| 18 | 
             
            TASKS_LIST= ""
         | 
| 19 | 
             
            for task in Tasks:
         | 
| 20 | 
             
                  task = task.value
         | 
| 21 | 
             
                  TASKS_LIST += f'- <a href="{task.link}" target="_blank">  {task.col_name} </a> ({task.few_shot}-shot) - {task.description}\n'
         | 
| 22 | 
            +
                  task_count += 1
         | 
| 23 |  | 
| 24 | 
             
            TASKS_PARAMETERS = ""
         | 
| 25 | 
             
            for task in Tasks:
         | 
|  | |
| 34 |  | 
| 35 | 
             
            ## How it works
         | 
| 36 |  | 
| 37 | 
            +
            π We evaluate models on {task_count} key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank">  Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
         | 
| 38 |  | 
| 39 | 
             
            {TASKS_LIST}
         | 
| 40 |  | 
|  | |
| 134 |  | 
| 135 |  | 
| 136 | 
             
            EVALUATION_QUEUE_TEXT = f"""
         | 
| 137 | 
            +
            # Evaluation Queue for the π {LEADERBOARD_NAME}
         | 
| 138 |  | 
| 139 | 
            +
            Models added here will be automatically evaluated on our evaluation cluster.
         | 
| 140 |  | 
| 141 | 
             
            ## First steps before submitting a model
         | 
| 142 |  | 
    	
        src/display/formatting.py
    CHANGED
    
    | @@ -4,6 +4,7 @@ from datetime import datetime, timezone | |
| 4 | 
             
            from huggingface_hub import HfApi
         | 
| 5 | 
             
            from huggingface_hub.hf_api import ModelInfo
         | 
| 6 |  | 
|  | |
| 7 |  | 
| 8 | 
             
            API = HfApi()
         | 
| 9 |  | 
| @@ -11,11 +12,25 @@ def model_hyperlink(link, model_name): | |
| 11 | 
             
                return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
         | 
| 12 |  | 
| 13 |  | 
| 14 | 
            -
            def  | 
| 15 | 
             
                link = f"https://huggingface.co/{model_name}"
         | 
| 16 |  | 
| 17 | 
            -
                details_model_name = model_name.replace("/", "__")
         | 
| 18 | 
            -
                details_link = f"https://huggingface.co/datasets/ | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 19 |  | 
| 20 | 
             
                return model_hyperlink(link, model_name) + "  " + model_hyperlink(details_link, "π")
         | 
| 21 |  | 
|  | |
| 4 | 
             
            from huggingface_hub import HfApi
         | 
| 5 | 
             
            from huggingface_hub.hf_api import ModelInfo
         | 
| 6 |  | 
| 7 | 
            +
            from src.envs import RESULTS_REPO, QUEUE_REPO
         | 
| 8 |  | 
| 9 | 
             
            API = HfApi()
         | 
| 10 |  | 
|  | |
| 12 | 
             
                return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
         | 
| 13 |  | 
| 14 |  | 
| 15 | 
            +
            def make_requests_clickable_model(model_name, json_path=None):
         | 
| 16 | 
             
                link = f"https://huggingface.co/{model_name}"
         | 
| 17 |  | 
| 18 | 
            +
                #details_model_name = model_name.replace("/", "__")
         | 
| 19 | 
            +
                details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/tree/main"
         | 
| 20 | 
            +
                if '/' in model_name:
         | 
| 21 | 
            +
                    details_link += f"/{model_name.split('/')[0]}"
         | 
| 22 | 
            +
                if json_path is not None:
         | 
| 23 | 
            +
                    details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                return model_hyperlink(link, model_name) + "  " + model_hyperlink(details_link, "π")
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            def make_clickable_model(model_name, json_path=None):
         | 
| 28 | 
            +
                link = f"https://huggingface.co/{model_name}"
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                #details_model_name = model_name.replace("/", "__")
         | 
| 31 | 
            +
                details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/tree/main/{model_name}"
         | 
| 32 | 
            +
                if json_path is not None:
         | 
| 33 | 
            +
                    details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/{model_name}/{json_path}"
         | 
| 34 |  | 
| 35 | 
             
                return model_hyperlink(link, model_name) + "  " + model_hyperlink(details_link, "π")
         | 
| 36 |  | 
    	
        src/display/utils.py
    CHANGED
    
    | @@ -31,7 +31,7 @@ class Tasks(Enum): | |
| 31 | 
             
                    limit=None,
         | 
| 32 | 
             
                    task_list=["oab_exams_generate"],
         | 
| 33 | 
             
                    link="https://huggingface.co/datasets/eduagarcia/oab_exams",
         | 
| 34 | 
            -
                    description="OAB Exams is a dataset of  | 
| 35 | 
             
                )
         | 
| 36 | 
             
                brazilian_court_decisions_judgment = Task(
         | 
| 37 | 
             
                    benchmark="brazilian_court_decisions_judgment",
         | 
|  | |
| 31 | 
             
                    limit=None,
         | 
| 32 | 
             
                    task_list=["oab_exams_generate"],
         | 
| 33 | 
             
                    link="https://huggingface.co/datasets/eduagarcia/oab_exams",
         | 
| 34 | 
            +
                    description="OAB Exams is a dataset of 2,000 questions from the Brazilian Bar Association's exams."
         | 
| 35 | 
             
                )
         | 
| 36 | 
             
                brazilian_court_decisions_judgment = Task(
         | 
| 37 | 
             
                    benchmark="brazilian_court_decisions_judgment",
         | 
    	
        src/leaderboard/read_evals.py
    CHANGED
    
    | @@ -35,6 +35,7 @@ class EvalResult: | |
| 35 | 
             
                flagged: bool = False
         | 
| 36 | 
             
                status: str = "FINISHED"
         | 
| 37 | 
             
                tags: list = None
         | 
|  | |
| 38 |  | 
| 39 | 
             
                @classmethod
         | 
| 40 | 
             
                def init_from_json_file(self, json_filepath):
         | 
| @@ -42,6 +43,8 @@ class EvalResult: | |
| 42 | 
             
                    with open(json_filepath) as fp:
         | 
| 43 | 
             
                        data = json.load(fp)
         | 
| 44 |  | 
|  | |
|  | |
| 45 | 
             
                    # We manage the legacy config format
         | 
| 46 | 
             
                    config = data.get("config_general")
         | 
| 47 |  | 
| @@ -100,6 +103,7 @@ class EvalResult: | |
| 100 | 
             
                        results=results,
         | 
| 101 | 
             
                        precision=precision,  
         | 
| 102 | 
             
                        revision= config.get("model_sha", ""),
         | 
|  | |
| 103 | 
             
                    )
         | 
| 104 |  | 
| 105 | 
             
                def update_with_request_file(self, requests_path):
         | 
| @@ -137,7 +141,7 @@ class EvalResult: | |
| 137 | 
             
                        AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
         | 
| 138 | 
             
                        AutoEvalColumn.weight_type.name: self.weight_type.value.name,
         | 
| 139 | 
             
                        AutoEvalColumn.architecture.name: self.architecture,
         | 
| 140 | 
            -
                        AutoEvalColumn.model.name: make_clickable_model(self.full_model),
         | 
| 141 | 
             
                        AutoEvalColumn.dummy.name: self.full_model,
         | 
| 142 | 
             
                        AutoEvalColumn.revision.name: self.revision,
         | 
| 143 | 
             
                        AutoEvalColumn.average.name: average,
         | 
| @@ -202,6 +206,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st | |
| 202 | 
             
                eval_results = {}
         | 
| 203 | 
             
                for model_result_filepath in model_result_filepaths:
         | 
| 204 | 
             
                    # Creation of result
         | 
|  | |
| 205 | 
             
                    eval_result = EvalResult.init_from_json_file(model_result_filepath)
         | 
| 206 | 
             
                    eval_result.update_with_request_file(requests_path)
         | 
| 207 | 
             
                    if eval_result.full_model in dynamic_data:
         | 
|  | |
| 35 | 
             
                flagged: bool = False
         | 
| 36 | 
             
                status: str = "FINISHED"
         | 
| 37 | 
             
                tags: list = None
         | 
| 38 | 
            +
                json_filename: str = None
         | 
| 39 |  | 
| 40 | 
             
                @classmethod
         | 
| 41 | 
             
                def init_from_json_file(self, json_filepath):
         | 
|  | |
| 43 | 
             
                    with open(json_filepath) as fp:
         | 
| 44 | 
             
                        data = json.load(fp)
         | 
| 45 |  | 
| 46 | 
            +
                    json_filename = os.path.basename(json_filepath)
         | 
| 47 | 
            +
             | 
| 48 | 
             
                    # We manage the legacy config format
         | 
| 49 | 
             
                    config = data.get("config_general")
         | 
| 50 |  | 
|  | |
| 103 | 
             
                        results=results,
         | 
| 104 | 
             
                        precision=precision,  
         | 
| 105 | 
             
                        revision= config.get("model_sha", ""),
         | 
| 106 | 
            +
                        json_filename=json_filename
         | 
| 107 | 
             
                    )
         | 
| 108 |  | 
| 109 | 
             
                def update_with_request_file(self, requests_path):
         | 
|  | |
| 141 | 
             
                        AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
         | 
| 142 | 
             
                        AutoEvalColumn.weight_type.name: self.weight_type.value.name,
         | 
| 143 | 
             
                        AutoEvalColumn.architecture.name: self.architecture,
         | 
| 144 | 
            +
                        AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.json_filename),
         | 
| 145 | 
             
                        AutoEvalColumn.dummy.name: self.full_model,
         | 
| 146 | 
             
                        AutoEvalColumn.revision.name: self.revision,
         | 
| 147 | 
             
                        AutoEvalColumn.average.name: average,
         | 
|  | |
| 206 | 
             
                eval_results = {}
         | 
| 207 | 
             
                for model_result_filepath in model_result_filepaths:
         | 
| 208 | 
             
                    # Creation of result
         | 
| 209 | 
            +
                    print(model_result_filepath)
         | 
| 210 | 
             
                    eval_result = EvalResult.init_from_json_file(model_result_filepath)
         | 
| 211 | 
             
                    eval_result.update_with_request_file(requests_path)
         | 
| 212 | 
             
                    if eval_result.full_model in dynamic_data:
         | 
    	
        src/populate.py
    CHANGED
    
    | @@ -3,7 +3,7 @@ import os | |
| 3 |  | 
| 4 | 
             
            import pandas as pd
         | 
| 5 |  | 
| 6 | 
            -
            from src.display.formatting import has_no_nan_values,  | 
| 7 | 
             
            from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
         | 
| 8 | 
             
            from src.leaderboard.filter_models import filter_models
         | 
| 9 | 
             
            from src.leaderboard.read_evals import get_raw_eval_results
         | 
| @@ -35,7 +35,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: | |
| 35 | 
             
                        with open(file_path) as fp:
         | 
| 36 | 
             
                            data = json.load(fp)
         | 
| 37 |  | 
| 38 | 
            -
                        data[EvalQueueColumn.model.name] =  | 
| 39 | 
             
                        data[EvalQueueColumn.revision.name] = data.get("revision", "main")
         | 
| 40 |  | 
| 41 | 
             
                        all_evals.append(data)
         | 
| @@ -47,7 +47,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: | |
| 47 | 
             
                            with open(file_path) as fp:
         | 
| 48 | 
             
                                data = json.load(fp)
         | 
| 49 |  | 
| 50 | 
            -
                            data[EvalQueueColumn.model.name] =  | 
| 51 | 
             
                            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
         | 
| 52 | 
             
                            all_evals.append(data)
         | 
| 53 |  | 
|  | |
| 3 |  | 
| 4 | 
             
            import pandas as pd
         | 
| 5 |  | 
| 6 | 
            +
            from src.display.formatting import has_no_nan_values, make_requests_clickable_model
         | 
| 7 | 
             
            from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
         | 
| 8 | 
             
            from src.leaderboard.filter_models import filter_models
         | 
| 9 | 
             
            from src.leaderboard.read_evals import get_raw_eval_results
         | 
|  | |
| 35 | 
             
                        with open(file_path) as fp:
         | 
| 36 | 
             
                            data = json.load(fp)
         | 
| 37 |  | 
| 38 | 
            +
                        data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], entry)
         | 
| 39 | 
             
                        data[EvalQueueColumn.revision.name] = data.get("revision", "main")
         | 
| 40 |  | 
| 41 | 
             
                        all_evals.append(data)
         | 
|  | |
| 47 | 
             
                            with open(file_path) as fp:
         | 
| 48 | 
             
                                data = json.load(fp)
         | 
| 49 |  | 
| 50 | 
            +
                            data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], os.path.join(entry, sub_entry))
         | 
| 51 | 
             
                            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
         | 
| 52 | 
             
                            all_evals.append(data)
         | 
| 53 |  | 
 
			
