Spaces:

kilian-group
/

arxiv-classifier-leaderboard-backend

Runtime error

App Files Files Community

ag2435 commited on 30 days ago

Commit

4ab6298

verified ·

1 Parent(s): 609d373

copying backend space from demo leaderboard

Browse files

Files changed (21) hide show

.gitattributes +1 -1
.gitignore +15 -0
.pre-commit-config.yaml +53 -0
Makefile +11 -0
README.md +13 -6
app.py +69 -0
custom_tasks.py +90 -0
main_backend_harness.py +109 -0
main_backend_lighteval.py +127 -0
pyproject.toml +10 -0
requirements.txt +17 -0
scripts/create_request_file.py +109 -0
scripts/fix_harness_import.py +13 -0
src/backend/manage_requests.py +149 -0
src/backend/run_eval_suite_harness.py +86 -0
src/backend/run_eval_suite_lighteval.py +105 -0
src/backend/sort_queue.py +31 -0
src/display/css_html_js.py +20 -0
src/display/log_visualizer.py +39 -0
src/envs.py +42 -0
src/logging.py +36 -0

.gitattributes CHANGED Viewed

@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,15 @@

+auto_evals/
+venv/
+__pycache__/
+.env
+.ipynb_checkpoints
+*ipynb
+.vscode/
+.idea/
+eval-queue/
+eval-results/
+eval-queue-bk/
+eval-results-bk/
+logs/
+output.log

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+default_language_version:
+  python: python3
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: quarterly
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-case-conflict
+      - id: detect-private-key
+      - id: check-added-large-files
+        args: ['--maxkb=1000']
+      - id: requirements-txt-fixer
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: Format imports
+  - repo: https://github.com/psf/black
+    rev: 22.12.0
+    hooks:
+      - id: black
+        name: Format code
+        additional_dependencies: ['click==8.0.2']
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    # Ruff version.
+    rev: 'v0.0.267'
+    hooks:
+      - id: ruff

Makefile ADDED Viewed

	@@ -0,0 +1,11 @@

+# Both targets are commands, not files: declare exactly the targets that exist as phony.
+.PHONY: style quality
+# Auto-fix lint findings and reformat the code base in place.
+style:
+	ruff check --fix .
+	ruff format .
+# Check-only variant: report lint/format problems without modifying any file.
+quality:
+	ruff check .
+	ruff format --check .

README.md CHANGED Viewed

@@ -1,12 +1,19 @@
 ---
-title: Arxiv Classifier Leaderboard Backend
-emoji: 📉
-colorFrom: red
 colorTo: indigo
 sdk: gradio
-sdk_version: 5.16.0
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Backend
+emoji: 🥇
+colorFrom: green
 colorTo: indigo
 sdk: gradio
+sdk_version: 5.1.0
 app_file: app.py
+pinned: true
+license: apache-2.0
 ---
+Depending on whether you want to use lighteval or lm_eval for your evaluations, you might need to complete the
+requirements.txt file so that it contains the relevant dependencies.
+You'll also need to select, in app.py, whether you want to use the lighteval or the lm_eval backend by keeping the
+correct import and commenting out the other.
+All the environment variables that you should need to edit to launch the evaluations are in `src/envs.py`.

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import logging
+from functools import partial
+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+# Choose the lighteval or the harness backend: keep exactly one of the two imports below
+# from main_backend_harness import run_auto_eval
+from main_backend_lighteval import run_auto_eval
+from src.display.css_html_js import dark_mode_gradio_js
+from src.display.log_visualizer import log_file_to_html_string
+from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
+from src.logging import configure_root_logger, log_file, setup_logger
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("numexpr").setLevel(logging.WARNING)
+logging.getLogger("absl").setLevel(logging.WARNING)
+configure_root_logger()
+logging.basicConfig(level=logging.INFO)
+logger = setup_logger(__name__)
+intro_md = """
+# Intro
+This is a visual for the auto evaluator.
+"""
+links_md = f"""
+# Important links
+| Description     | Link |
+|-----------------|------|
+| Leaderboard     | [{REPO_ID}](https://huggingface.co/spaces/{REPO_ID}) |
+| Queue Repo      | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
+| Results Repo    | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
+"""
+def auto_eval():
+    """Log that an evaluation pass was triggered, then call the imported `run_auto_eval`."""
+    logger.info("Triggering Auto Eval")
+    run_auto_eval()
+reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
+with gr.Blocks(js=dark_mode_gradio_js) as demo:
+    gr.Markdown(intro_md)
+    with gr.Tab("Application"):
+        output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
+        with gr.Row():
+            download_button = gr.DownloadButton("Download Log File", value=log_file)
+            with gr.Accordion("Log View Configuration", open=False):
+                reverse_order_checkbox.render()
+        # Add a button that when pressed, triggers run_auto_eval
+        button = gr.Button("Manually Run Evaluation")
+        gr.Markdown(links_md)
+        # dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
+        button.click(fn=auto_eval, inputs=[], outputs=[])
+if __name__ == "__main__":
+    scheduler = BackgroundScheduler()
+    scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
+    scheduler.start()
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)

custom_tasks.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# ruff: noqa: F405, F403, F401
+"""
+Custom evaluation tasks for lighteval. Complete this file with your own configuration if you want to use a custom lighteval task.
+This file generally creates just a TASKS_TABLE and TASKS_GROUPS, which are then imported by LightEval (this template only defines TASKS_TABLE).
+Author:
+"""
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+## EVAL WITH NO SUBSET ##
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+task = LightevalTaskConfig(
+    name="myothertask",
+    prompt_function="prompt_fn",  # must be defined in the file or imported from lighteval.tasks.default_prompts
+    suite=["community"],
+    hf_repo="",
+    hf_subset="default",
+    hf_avail_splits=[],
+    evaluation_splits=[],
+    few_shots_split="",
+    few_shots_select="",
+    metric=[""],
+)
+## EVALS WITH SUBSET
+# This is how you create a subset task (like MMLU), which has several subset
+# each being its own evaluation task.
+# fmt: off
+SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval
+# fmt: on
+class CustomSubsetTask(LightevalTaskConfig):
+    """Task configuration template for one subset of a multi-subset evaluation.
+    Only `name` and `hf_subset` vary per instance; every other field is hard-coded
+    below with an empty/placeholder value meant to be edited.
+    """
+    def __init__(
+        self,
+        name,
+        hf_subset,
+    ):
+        # Forward the per-subset values together with the shared placeholder settings.
+        super().__init__(
+            name=name,
+            hf_subset=hf_subset,
+            prompt_function="prompt_fn",  # must be defined in this file or imported from lighteval.tasks.default_prompts
+            hf_repo="",
+            metric=[""],
+            hf_avail_splits=[],
+            evaluation_splits=[],
+            few_shots_split="",
+            few_shots_select="",
+            suite=["community"],
+            generation_size=-1,
+            stop_sequence=None,
+            output_regex=None,
+            frozen=False,
+        )
+## DEFINE YOUR PROMPT FUNCTIONS
+# Define as many as you need for your different tasks
+def prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    As written, this template ignores `line` and returns a `Doc` with an empty query,
+    no choices, `gold_index=0` and an empty instruction; only `task_name` is passed through.
+    """
+    return Doc(
+        task_name=task_name,
+        query="",
+        choices=[],
+        gold_index=0,
+        instruction="",
+    )
+## STORE YOUR EVALS
+SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
+_TASKS = SUBSET_TASKS + [task]
+## MODULE LOGIC
+# You should not need to touch this
+# Convert to dict for lighteval
+TASKS_TABLE = [task.as_dict() for task in _TASKS]
+if __name__ == "__main__":
+    print(t["name"] for t in TASKS_TABLE)
+    print(len(TASKS_TABLE))

main_backend_harness.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import logging
+import pprint
+from huggingface_hub import snapshot_download
+from src.backend.manage_requests import (
+    FAILED_STATUS,
+    FINISHED_STATUS,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+)
+from src.backend.run_eval_suite_harness import run_evaluation
+from src.backend.sort_queue import sort_models_by_priority
+from src.envs import (
+    API,
+    DEVICE,
+    EVAL_REQUESTS_PATH_BACKEND,
+    EVAL_RESULTS_PATH_BACKEND,
+    LIMIT,
+    NUM_FEWSHOT,
+    QUEUE_REPO,
+    RESULTS_REPO,
+    TASKS_HARNESS,
+    TOKEN,
+)
+from src.logging import setup_logger
+logging.getLogger("openai").setLevel(logging.WARNING)
+# logging.basicConfig(level=logging.ERROR)
+logger = setup_logger(__name__)
+pp = pprint.PrettyPrinter(width=80)
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+def run_auto_eval():
+    current_pending_status = [PENDING_STATUS]
+    # pull the eval dataset from the hub and parse any eval requests
+    # check completed evals and set them to finished
+    check_completed_evals(
+        api=API,
+        checked_status=RUNNING_STATUS,
+        completed_status=FINISHED_STATUS,
+        failed_status=FAILED_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+        hf_repo_results=RESULTS_REPO,
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
+    )
+    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
+    # Sort the evals by priority (first submitted first run)
+    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+    if len(eval_requests) == 0:
+        return
+    eval_request = eval_requests[0]
+    logger.info(pp.pformat(eval_request))
+    set_eval_request(
+        api=API,
+        eval_request=eval_request,
+        set_to_status=RUNNING_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    )
+    run_evaluation(
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
+        num_fewshot=NUM_FEWSHOT,
+        local_dir=EVAL_RESULTS_PATH_BACKEND,
+        results_repo=RESULTS_REPO,
+        batch_size="auto",
+        device=DEVICE,
+        limit=LIMIT,
+    )
+if __name__ == "__main__":
+    run_auto_eval()

main_backend_lighteval.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import logging
+import pprint
+from huggingface_hub import snapshot_download
+from src.backend.manage_requests import (
+    FAILED_STATUS,
+    FINISHED_STATUS,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+)
+from src.backend.run_eval_suite_lighteval import run_evaluation
+from src.backend.sort_queue import sort_models_by_priority
+from src.envs import (
+    ACCELERATOR,
+    API,
+    EVAL_REQUESTS_PATH_BACKEND,
+    EVAL_RESULTS_PATH_BACKEND,
+    LIMIT,
+    QUEUE_REPO,
+    REGION,
+    RESULTS_REPO,
+    TASKS_LIGHTEVAL,
+    TOKEN,
+    VENDOR,
+)
+from src.logging import setup_logger
+logging.getLogger("openai").setLevel(logging.WARNING)
+logger = setup_logger(__name__)
+# logging.basicConfig(level=logging.ERROR)
+pp = pprint.PrettyPrinter(width=80)
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+def run_auto_eval():
+    """Run one pass of the evaluation loop with the lighteval backend.
+    The pass first reconciles requests marked RUNNING against the results repository, then fetches
+    the PENDING requests, picks the first one after priority sorting, marks it RUNNING on the hub
+    and evaluates it with a hard-coded instance size/type. Returns None; returns early when no
+    pending request is found.
+    """
+    current_pending_status = [PENDING_STATUS]
+    # pull the eval dataset from the hub and parse any eval requests
+    # check completed evals and set them to finished
+    check_completed_evals(
+        api=API,
+        checked_status=RUNNING_STATUS,
+        completed_status=FINISHED_STATUS,
+        failed_status=FAILED_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+        hf_repo_results=RESULTS_REPO,
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
+    )
+    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
+    # Sort the evals by priority (first submitted first run)
+    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+    if len(eval_requests) == 0:
+        return
+    # Only the highest-priority request is processed per pass.
+    eval_request = eval_requests[0]
+    logger.info(pp.pformat(eval_request))
+    set_eval_request(
+        api=API,
+        eval_request=eval_request,
+        set_to_status=RUNNING_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    )
+    # This needs to be done
+    # instance_size, instance_type = get_instance_for_model(eval_request)
+    # For GPU
+    # instance_size, instance_type = "small", "g4dn.xlarge"
+    # For CPU
+    # Updated naming available at https://huggingface.co/docs/inference-endpoints/pricing
+    instance_size, instance_type = "x4", "intel-icl"
+    logger.info(
+        f"Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
+    run_evaluation(
+        eval_request=eval_request,
+        task_names=TASKS_LIGHTEVAL,
+        local_dir=EVAL_RESULTS_PATH_BACKEND,
+        batch_size=1,
+        accelerator=ACCELERATOR,
+        region=REGION,
+        vendor=VENDOR,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        limit=LIMIT,
+    )
+    logger.info(
+        f"Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
+if __name__ == "__main__":
+    run_auto_eval()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,10 @@

+[tool.ruff]
+line-length = 119
+[tool.ruff.lint]
+select = ["C", "E", "F", "I", "W"]
+ignore = ["E501"] # line too long (the formatter is taking care of this)
+[tool.ruff.lint.isort]
+lines-after-imports = 2
+known-local-folder = ["src"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+APScheduler==3.10.1
+click==8.1.3
+huggingface-hub>=0.18.0
+python-dateutil==2.8.2
+requests==2.28.2
+tqdm==4.65.0
+accelerate>=0.26.0
+sentencepiece
+# Evaluation suites
+lighteval>=0.5.0
+lm_eval==0.4.3
+# Log Visualizer
+BeautifulSoup4==4.12.2
+lxml==4.9.3
+rich==13.3.4

scripts/create_request_file.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import json
+import os
+import pprint
+import re
+from datetime import datetime, timezone
+import click
+from colorama import Fore
+from huggingface_hub import HfApi, snapshot_download
+from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
+model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
+weight_types = ("Original", "Delta", "Adapter")
+def get_model_size(model_info, precision: str):
+    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+    try:
+        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+    except (AttributeError, TypeError):
+        try:
+            size_match = re.search(size_pattern, model_info.modelId.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    model_size = size_factor * model_size
+    return model_size
+def main():
+    """Interactively build an evaluation request file and optionally push it to the queue repo.
+    Prompts for the model name, revision, precision, model type, weight type, base model and
+    status, looks the model up on the Hub, prints the resulting entry and, after confirmation,
+    writes it as JSON under `EVAL_REQUESTS_PATH` and uploads it to `QUEUE_REPO`.
+    Returns:
+        1 when the model info cannot be fetched from the Hub, otherwise None.
+    """
+    api = HfApi()
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    # Refresh the local copy of the queue before adding a new request to it.
+    snapshot_download(
+        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
+    )
+    model_name = click.prompt("Enter model name")
+    revision = click.prompt("Enter revision", default="main")
+    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
+    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
+    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
+    base_model = click.prompt("Enter base model", default="")
+    status = click.prompt("Enter status", default="FINISHED")
+    try:
+        model_info = api.model_info(repo_id=model_name, revision=revision)
+    except Exception as e:
+        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
+        return 1
+    model_size = get_model_size(model_info=model_info, precision=precision)
+    # Any failure while reading the license from the model card falls back to "?".
+    try:
+        license = model_info.cardData["license"]
+    except Exception:
+        license = "?"
+    eval_entry = {
+        "model": model_name,
+        "base_model": base_model,
+        "revision": revision,
+        "private": False,
+        "precision": precision,
+        "weight_type": weight_type,
+        "status": status,
+        "submitted_time": current_time,
+        "model_type": model_type,
+        "likes": model_info.likes,
+        "params": model_size,
+        "license": license,
+    }
+    # Split "user/model" into its two parts; a name without "/" is stored with an empty user name.
+    user_name = ""
+    model_path = model_name
+    if "/" in model_name:
+        user_name = model_name.split("/")[0]
+        model_path = model_name.split("/")[1]
+    pprint.pprint(eval_entry)
+    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
+        click.echo("continuing...")
+        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
+        os.makedirs(out_dir, exist_ok=True)
+        # The literal False in the file name mirrors the hard-coded "private": False above.
+        out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
+        with open(out_path, "w") as f:
+            f.write(json.dumps(eval_entry))
+        # The path inside the repo is the local path with the EVAL_REQUESTS_PATH prefix removed.
+        api.upload_file(
+            path_or_fileobj=out_path,
+            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message=f"Add {model_name} to eval queue",
+        )
+    else:
+        click.echo("aborting...")
+if __name__ == "__main__":
+    main()

scripts/fix_harness_import.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""This file should be used after pip install -r requirements.
+It creates a folder that is not ported during the harness package creation (as they don't use a Manifest file at the moment, and it ignores `.json` files).
+It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
+"""
+import os
+import lm_eval
+if __name__ == "__main__":
+    lm_eval_path = lm_eval.__path__[0]
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)

src/backend/manage_requests.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import glob
+import json
+from dataclasses import dataclass
+from typing import Optional
+from huggingface_hub import HfApi, snapshot_download
+from src.envs import TOKEN
+from src.logging import setup_logger
+logger = setup_logger(__name__)
+PENDING_STATUS = "PENDING"
+RUNNING_STATUS = "RUNNING"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+@dataclass
+class EvalRequest:
+    """This class represents one evaluation request file."""
+    model: str
+    status: str
+    json_filepath: str
+    weight_type: str = "Original"
+    model_type: Optional[str] = None  # pretrained, fine-tuned, etc. - define your own categories
+    precision: str = ""  # float16, bfloat16
+    revision: str = "main"  # commit hash
+    submitted_time: Optional[str] = (
+        "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
+    )
+    likes: Optional[int] = 0
+    params: Optional[int] = None
+    license: Optional[str] = ""
+    base_model: Optional[str] = ""
+    private: Optional[bool] = False
+    def get_model_args(self):
+        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
+        the evaluation suite you chose.
+        Returns:
+            str: "pretrained=<model>,revision=<revision>,dtype=<precision>".
+        Raises:
+            Exception: If `precision` is neither "float16" nor "bfloat16".
+        """
+        model_args = f"pretrained={self.model},revision={self.revision}"
+        if self.precision in ["float16", "bfloat16"]:
+            model_args += f",dtype={self.precision}"
+        # Quantized models need some added config, the install of bits and bytes, etc
+        else:
+            raise Exception(f"Unknown precision {self.precision}.")
+        return model_args
+def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
+    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
+    json_filepath = eval_request.json_filepath
+    with open(json_filepath) as fp:
+        data = json.load(fp)
+    data["status"] = set_to_status
+    with open(json_filepath, "w") as f:
+        f.write(json.dumps(data))
+    api.upload_file(
+        path_or_fileobj=json_filepath,
+        path_in_repo=json_filepath.replace(local_dir, ""),
+        repo_id=hf_repo,
+        repo_type="dataset",
+    )
+def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
+    """Download the request repository and return the requests whose status is in `job_status`.
+    No ordering is applied here: requests come back in the order in which `glob` lists their files.
+    Args:
+        job_status (list): Statuses to keep; a request is selected when `data["status"] in job_status`.
+        local_dir (str): Local directory the repository snapshot is downloaded into and scanned for `.json` files.
+        hf_repo (str): Id of the hub dataset repository holding the request files.
+    Returns:
+        `list[EvalRequest]`: one `EvalRequest` per matching file, with `json_filepath` set to the local file path.
+    """
+    snapshot_download(
+        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
+    )
+    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
+    eval_requests = []
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+        if data["status"] in job_status:
+            # Remember which file the request came from so its status can be rewritten later.
+            data["json_filepath"] = json_filepath
+            eval_request = EvalRequest(**data)
+            eval_requests.append(eval_request)
+    return eval_requests
+def eval_was_running(eval_request: EvalRequest):
+    """Checks whether a file says it's RUNNING to determine whether to FAIL"""
+    json_filepath = eval_request.json_filepath
+    with open(json_filepath) as fp:
+        data = json.load(fp)
+    status = data["status"]
+    return status == RUNNING_STATUS
+def check_completed_evals(
+    api: HfApi,
+    hf_repo: str,
+    local_dir: str,
+    checked_status: str,
+    completed_status: str,
+    failed_status: str,
+    hf_repo_results: str,
+    local_dir_results: str,
+):
+    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
+    snapshot_download(
+        repo_id=hf_repo_results,
+        revision="main",
+        local_dir=local_dir_results,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
+    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
+    for eval_request in running_evals:
+        model = eval_request.model
+        logger.info("====================================")
+        logger.info(f"Checking {model}")
+        output_path = model
+        output_file = f"{local_dir_results}/{output_path}/results*.json"
+        output_file_exists = len(glob.glob(output_file)) > 0
+        if output_file_exists:
+            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
+            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
+        else:
+            if eval_was_running(eval_request=eval_request):
+                logger.info(f"No result file found for {model} setting it to {failed_status}")
+                set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)

src/backend/run_eval_suite_harness.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import json
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Union
+from lm_eval import evaluator, utils
+from lm_eval.tasks import TaskManager
+from src.backend.manage_requests import EvalRequest
+from src.envs import API
+from src.logging import setup_logger
+logging.getLogger("openai").setLevel(logging.WARNING)
+logger = setup_logger(__name__)
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: list,
+    num_fewshot: int,
+    batch_size: Union[int, str],
+    device: str,
+    local_dir: str,
+    results_repo: str,
+    limit: int = None,
+):
+    """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
+    Args:
+        eval_request (EvalRequest): Input evaluation request file representation
+        task_names (list): Tasks to launch
+        num_fewshot (int): Number of few shots to use
+        batch_size (int or str): Selected batch size or 'auto'
+        device (str): "cpu" or "cuda:0", depending on what you assigned to the space
+        local_dir (str): Where to save the results locally
+        results_repo (str): To which repository to upload the results
+        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
+    Returns:
+        The object returned by `evaluator.simple_evaluate`, with `model_dtype`, `model_name` and
+        `model_sha` added under its "config" key.
+    """
+    if limit:
+        logger.info(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+    # Match the requested task names/patterns against the tasks known to the harness.
+    task_manager = TaskManager()
+    all_tasks = task_manager.all_tasks
+    task_names = utils.pattern_match(task_names, all_tasks)
+    logger.info(f"Selected Tasks: {task_names}")
+    results = evaluator.simple_evaluate(
+        model="hf",
+        model_args=eval_request.get_model_args(),
+        tasks=task_names,
+        num_fewshot=num_fewshot,
+        batch_size=batch_size,
+        device=device,
+        limit=limit,
+        write_out=True,  # Whether to write out an example document and model input, for checking task integrity
+    )
+    # Record which request produced these results.
+    results["config"]["model_dtype"] = eval_request.precision
+    results["config"]["model_name"] = eval_request.model
+    results["config"]["model_sha"] = eval_request.revision
+    dumped = json.dumps(results, indent=2)
+    logger.info(dumped)
+    # The file is stored as <local_dir>/<model>/results_<current local datetime>.json
+    results_path = Path(local_dir, eval_request.model, f"results_{datetime.now()}.json")
+    results_path.parent.mkdir(exist_ok=True, parents=True)
+    results_path.write_text(dumped)
+    logger.info(evaluator.make_table(results))
+    # Upload under the same path, relative to local_dir, in the results repository.
+    API.upload_file(
+        path_or_fileobj=results_path,
+        path_in_repo=results_path.relative_to(local_dir).as_posix(),
+        repo_id=results_repo,
+        repo_type="dataset",
+    )
+    return results

src/backend/run_eval_suite_lighteval.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import json
+import logging
+import re
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.model_config import InferenceEndpointModelConfig
+from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
+from src.backend.manage_requests import EvalRequest
+from src.envs import OWNER
+from src.logging import setup_logger
+logging.getLogger("openai").setLevel(logging.WARNING)
+logger = setup_logger(__name__)
+SPECIAL_CHARACTERS_PATTERN = re.compile(r"[^a-zA-Z0-9-]")
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: str,
+    batch_size: int,
+    local_dir: str,
+    accelerator: str,
+    region: str,
+    vendor: str,
+    instance_size: str,
+    instance_type: str,
+    limit=None,
+):
+    """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
+    Args:
+        eval_request (EvalRequest): Input evaluation request file representation
+        task_names (str): Tasks to launch, forwarded unchanged as the `tasks` argument of the lighteval `Pipeline`
+        batch_size (int): Selected batch size
+        accelerator (str): Inference endpoint parameter for running the evaluation
+        region (str):  Inference endpoint parameter for running the evaluation
+        vendor (str):  Inference endpoint parameter for running the evaluation
+        instance_size (str):  Inference endpoint parameter for running the evaluation
+        instance_type (str):  Inference endpoint parameter for running the evaluation
+        local_dir (str): Where to save the results locally.
+            NOTE(review): not referenced in this function; the tracker is configured with "./results".
+        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
+    Returns:
+        The value of `pipeline.get_results()`.
+    Raises:
+        Exception: Any error raised while evaluating or saving is re-raised after `pipeline.model.cleanup()` is called.
+    """
+    if limit:
+        logger.info(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+    evaluation_tracker = EvaluationTracker(
+        output_dir="./results",
+        save_details=True,
+        push_to_hub=True,
+        push_to_tensorboard=False,
+        hub_results_org=OWNER,
+        public=False,
+    )
+    pipeline_params = PipelineParameters(
+        launcher_type=ParallelismManager.ACCELERATE,
+        override_batch_size=batch_size,
+        max_samples=limit,
+        use_chat_template=False,
+        system_prompt=None,
+        # custom_tasks_directory="custom_tasks.py",  # TODO: pass if using a custom task
+    )
+    model_config = InferenceEndpointModelConfig(
+        # Endpoint parameters
+        # The endpoint name is the lower-cased model id with every character outside [a-zA-Z0-9-] replaced by "-".
+        name=SPECIAL_CHARACTERS_PATTERN.sub("-", eval_request.model.lower()),
+        repository=eval_request.model,
+        accelerator=accelerator,
+        vendor=vendor,
+        region=region,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        should_reuse_existing=False,
+        model_dtype=eval_request.precision,
+        revision=eval_request.revision,
+    )
+    pipeline = Pipeline(
+        tasks=task_names,
+        pipeline_parameters=pipeline_params,
+        evaluation_tracker=evaluation_tracker,
+        model_config=model_config,
+    )
+    try:
+        pipeline.evaluate()
+        pipeline.show_results()
+        pipeline.save_and_push_results()
+        results = pipeline.get_results()
+        dumped = json.dumps(results, indent=2)
+        logger.info(dumped)
+    except Exception:  # if eval failed, we force a cleanup
+        pipeline.model.cleanup()
+        raise
+    return results

src/backend/sort_queue.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from dataclasses import dataclass
+from huggingface_hub import HfApi
+from src.backend.manage_requests import EvalRequest
+@dataclass
+class ModelMetadata:
+    """Simple record of per-model metadata, with defaults for every field."""
+    # Like count; defaults to 0.
+    likes: int = 0
+    # Model size; defaults to 15 (unit is not stated in this file - TODO confirm).
+    size: int = 15
+# All the functions below sort the models in the queue based on different parameters
+def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
+    private_models = [model for model in models if model.private]
+    public_models = [model for model in models if not model.private]
+    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
+def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
+def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.params, reverse=False)
+def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)

src/display/css_html_js.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# CSS text: sets the background colour of <pre>/<code> and defines the
+# fixed-height, scrolling, monospace ``.scrollable`` class.
+style_content = """
+pre, code {
+    background-color: #272822;
+}
+    .scrollable {
+        font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
+        height: 500px;
+        overflow: auto;
+    }
+    """
+# JavaScript text defining refresh(): when the ``__theme`` query parameter is not
+# 'dark', it sets it to 'dark' and navigates to the resulting URL.
+dark_mode_gradio_js = """
+function refresh() {
+    const url = new URL(window.location);
+    if (url.searchParams.get('__theme') !== 'dark') {
+        url.searchParams.set('__theme', 'dark');
+        window.location.href = url.href;
+    }
+}
+"""

src/display/log_visualizer.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from io import StringIO
+from bs4 import BeautifulSoup
+from rich.console import Console
+from rich.syntax import Syntax
+from src.display.css_html_js import style_content
+from src.envs import NUM_LINES_VISUALIZE
+from src.logging import log_file
+def log_file_to_html_string(reverse=True):
+    with open(log_file, "rt") as f:
+        lines = f.readlines()
+        lines = lines[-NUM_LINES_VISUALIZE:]
+    if reverse:
+        lines = reversed(lines)
+    output = "".join(lines)
+    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
+    console = Console(record=True, width=150, style="#272822", file=StringIO())
+    console.print(syntax)
+    html_content = console.export_html(inline_styles=True)
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, "lxml")
+    # Modify the <pre> tag and add custom styles
+    pre_tag = soup.pre
+    pre_tag["class"] = "scrollable"
+    del pre_tag["style"]
+    # Add your custom styles and the .scrollable CSS to the <style> tag
+    style_tag = soup.style
+    style_tag.append(style_content)
+    return soup.prettify()

src/envs.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import os
+from huggingface_hub import HfApi
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+# NOTE(review): this still looks like the template's demo org - confirm it matches the org that owns this Space.
+OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset
+# For harness evaluations
+DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+LIMIT = 20  # !!!! For testing, should be None for actual evaluations!!!
+NUM_FEWSHOT = 0  # Change with your few shot for the Harness evaluations
+TASKS_HARNESS = ["anli_r1", "logiqa"]
+# For lighteval evaluations
+ACCELERATOR = "cpu"
+REGION = "us-east-1"
+VENDOR = "aws"
+# Comma-separated lighteval task specifications.
+TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+# To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0`
+# ---------------------------------------------------
+# Hub repository ids, all derived from OWNER.
+REPO_ID = f"{OWNER}/backend"
+QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/results"
+# If you set up a cache later, just change HF_HOME (defaults to the current directory)
+CACHE_PATH = os.getenv("HF_HOME", ".")
+# Local caches
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+REFRESH_RATE = 10 * 60  # 10 min
+# Number of trailing log lines shown by the log visualizer.
+NUM_LINES_VISUALIZE = 300
+# Shared Hub API client authenticated with TOKEN.
+API = HfApi(token=TOKEN)

src/logging.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import logging
+from pathlib import Path
+# Directory one level above the directory that contains this file.
+proj_dir = Path(__file__).parents[1]
+# Path of the log file that the file handlers created in this module write to.
+log_file = proj_dir / "output.log"
+def setup_logger(name: str):
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    # Create a file handler to write logs to a file
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+    return logger
+def configure_root_logger():
+    # Configure the root logger
+    logging.basicConfig(level=logging.INFO)
+    root_logger = logging.getLogger()
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(formatter)
+    root_logger.addHandler(file_handler)