Spaces:
Runtime error
Runtime error
copying backend space from demo leaderboard
Browse files- .gitattributes +1 -1
- .gitignore +15 -0
- .pre-commit-config.yaml +53 -0
- Makefile +11 -0
- README.md +13 -6
- app.py +69 -0
- custom_tasks.py +90 -0
- main_backend_harness.py +109 -0
- main_backend_lighteval.py +127 -0
- pyproject.toml +10 -0
- requirements.txt +17 -0
- scripts/create_request_file.py +109 -0
- scripts/fix_harness_import.py +13 -0
- src/backend/manage_requests.py +149 -0
- src/backend/run_eval_suite_harness.py +86 -0
- src/backend/run_eval_suite_lighteval.py +105 -0
- src/backend/sort_queue.py +31 -0
- src/display/css_html_js.py +20 -0
- src/display/log_visualizer.py +39 -0
- src/envs.py +42 -0
- src/logging.py +36 -0
.gitattributes
CHANGED
@@ -25,7 +25,6 @@
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
auto_evals/
|
2 |
+
venv/
|
3 |
+
__pycache__/
|
4 |
+
.env
|
5 |
+
.ipynb_checkpoints
|
6 |
+
*ipynb
|
7 |
+
.vscode/
|
8 |
+
.idea/
|
9 |
+
|
10 |
+
eval-queue/
|
11 |
+
eval-results/
|
12 |
+
eval-queue-bk/
|
13 |
+
eval-results-bk/
|
14 |
+
logs/
|
15 |
+
output.log
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
default_language_version:
|
16 |
+
python: python3
|
17 |
+
|
18 |
+
ci:
|
19 |
+
autofix_prs: true
|
20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
21 |
+
autoupdate_schedule: quarterly
|
22 |
+
|
23 |
+
repos:
|
24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
25 |
+
rev: v4.3.0
|
26 |
+
hooks:
|
27 |
+
- id: check-yaml
|
28 |
+
- id: check-case-conflict
|
29 |
+
- id: detect-private-key
|
30 |
+
- id: check-added-large-files
|
31 |
+
args: ['--maxkb=1000']
|
32 |
+
- id: requirements-txt-fixer
|
33 |
+
- id: end-of-file-fixer
|
34 |
+
- id: trailing-whitespace
|
35 |
+
|
36 |
+
- repo: https://github.com/PyCQA/isort
|
37 |
+
rev: 5.12.0
|
38 |
+
hooks:
|
39 |
+
- id: isort
|
40 |
+
name: Format imports
|
41 |
+
|
42 |
+
- repo: https://github.com/psf/black
|
43 |
+
rev: 22.12.0
|
44 |
+
hooks:
|
45 |
+
- id: black
|
46 |
+
name: Format code
|
47 |
+
additional_dependencies: ['click==8.0.2']
|
48 |
+
|
49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
50 |
+
# Ruff version.
|
51 |
+
rev: 'v0.0.267'
|
52 |
+
hooks:
|
53 |
+
- id: ruff
|
Makefile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Both targets below run commands rather than build files, so declare exactly
# those two as phony (there is no `format` target in this Makefile).
.PHONY: style quality
|
2 |
+
|
3 |
+
|
4 |
+
style:
|
5 |
+
ruff check --fix .
|
6 |
+
ruff format .
|
7 |
+
|
8 |
+
|
9 |
+
quality:
|
10 |
+
ruff check .
|
11 |
+
ruff format --check .
|
README.md
CHANGED
@@ -1,12 +1,19 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
|
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Backend
|
3 |
+
emoji: 🥇
|
4 |
+
colorFrom: green
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.1.0
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
+
Depending on whether you want to use lighteval or lm_eval for your evaluations, you might need to complete the
|
14 |
+
requirements.txt file to contain relevant dependencies.
|
15 |
+
|
16 |
+
You'll also need to select, in app.py, whether you want to use lighteval or lm_eval by selecting the correct
|
17 |
+
import and commenting the other.
|
18 |
+
|
19 |
+
All env variables that you should need to edit to launch the evaluations should be in `envs`.
|
app.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from functools import partial
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
+
|
7 |
+
# Choose lighteval or harness backend
|
8 |
+
# from main_backend_harness import run_auto_eval
|
9 |
+
from main_backend_lighteval import run_auto_eval
|
10 |
+
|
11 |
+
from src.display.css_html_js import dark_mode_gradio_js
|
12 |
+
from src.display.log_visualizer import log_file_to_html_string
|
13 |
+
from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
|
14 |
+
from src.logging import configure_root_logger, log_file, setup_logger
|
15 |
+
|
16 |
+
|
17 |
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
18 |
+
logging.getLogger("numexpr").setLevel(logging.WARNING)
|
19 |
+
logging.getLogger("absl").setLevel(logging.WARNING)
|
20 |
+
configure_root_logger()
|
21 |
+
|
22 |
+
logging.basicConfig(level=logging.INFO)
|
23 |
+
logger = setup_logger(__name__)
|
24 |
+
|
25 |
+
|
26 |
+
intro_md = """
|
27 |
+
# Intro
|
28 |
+
This is a visual for the auto evaluator.
|
29 |
+
"""
|
30 |
+
|
31 |
+
links_md = f"""
|
32 |
+
# Important links
|
33 |
+
|
34 |
+
| Description | Link |
|
35 |
+
|-----------------|------|
|
36 |
+
| Leaderboard | [{REPO_ID}](https://huggingface.co/spaces/{REPO_ID}) |
|
37 |
+
| Queue Repo | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
|
38 |
+
| Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
|
39 |
+
"""
|
40 |
+
|
41 |
+
|
42 |
+
def auto_eval():
    """Log that an evaluation pass was triggered, then run the imported `run_auto_eval`."""
    logger.info("Triggering Auto Eval")
    run_auto_eval()
|
45 |
+
|
46 |
+
|
47 |
+
reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
|
48 |
+
|
49 |
+
with gr.Blocks(js=dark_mode_gradio_js) as demo:
|
50 |
+
gr.Markdown(intro_md)
|
51 |
+
with gr.Tab("Application"):
|
52 |
+
output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
|
53 |
+
with gr.Row():
|
54 |
+
download_button = gr.DownloadButton("Download Log File", value=log_file)
|
55 |
+
with gr.Accordion("Log View Configuration", open=False):
|
56 |
+
reverse_order_checkbox.render()
|
57 |
+
# Add a button that when pressed, triggers run_auto_eval
|
58 |
+
button = gr.Button("Manually Run Evaluation")
|
59 |
+
gr.Markdown(links_md)
|
60 |
+
|
61 |
+
# dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
|
62 |
+
|
63 |
+
button.click(fn=auto_eval, inputs=[], outputs=[])
|
64 |
+
|
65 |
+
if __name__ == "__main__":
|
66 |
+
scheduler = BackgroundScheduler()
|
67 |
+
scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
|
68 |
+
scheduler.start()
|
69 |
+
demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
|
custom_tasks.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ruff: noqa: F405, F403, F401
|
2 |
+
"""
|
3 |
+
Custom evaluation tasks for lighteval. Complete this task with your own configuration if you want to use a custom lighteval task.
|
4 |
+
|
5 |
+
This file generally creates just a TASKS_TABLE and TASKS_GROUPS, which are then imported by LightEval.
|
6 |
+
|
7 |
+
Author:
|
8 |
+
"""
|
9 |
+
|
10 |
+
from lighteval.tasks.lighteval_task import LightevalTaskConfig
|
11 |
+
from lighteval.tasks.requests import Doc
|
12 |
+
|
13 |
+
|
14 |
+
## EVAL WITH NO SUBSET ##
|
15 |
+
# This is how you create a simple task (like hellaswag) which has one single subset
|
16 |
+
# attached to it, and one evaluation possible.
|
17 |
+
task = LightevalTaskConfig(
|
18 |
+
name="myothertask",
|
19 |
+
prompt_function="prompt_fn", # must be defined in the file or imported from lighteval.tasks.default_prompts
|
20 |
+
suite=["community"],
|
21 |
+
hf_repo="",
|
22 |
+
hf_subset="default",
|
23 |
+
hf_avail_splits=[],
|
24 |
+
evaluation_splits=[],
|
25 |
+
few_shots_split="",
|
26 |
+
few_shots_select="",
|
27 |
+
metric=[""],
|
28 |
+
)
|
29 |
+
|
30 |
+
## EVALS WITH SUBSET
|
31 |
+
# This is how you create a subset task (like MMLU), which has several subsets,
|
32 |
+
# each being its own evaluation task.
|
33 |
+
|
34 |
+
# fmt: off
|
35 |
+
SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval
|
36 |
+
# fmt: on
|
37 |
+
|
38 |
+
|
39 |
+
class CustomSubsetTask(LightevalTaskConfig):
    """Task configuration template for one subset of a multi-subset evaluation.

    Only the task name and the dataset subset differ between instances; every
    other field is a fixed template value to fill in for your own benchmark.
    """

    def __init__(self, name, hf_subset):
        # Settings shared by every subset task built from this template.
        shared_settings = {
            # must be defined in this file or imported from lighteval.tasks.default_prompts
            "prompt_function": "prompt_fn",
            "hf_repo": "",
            "metric": [""],
            "hf_avail_splits": [],
            "evaluation_splits": [],
            "few_shots_split": "",
            "few_shots_select": "",
            "suite": ["community"],
            "generation_size": -1,
            "stop_sequence": None,
            "output_regex": None,
            "frozen": False,
        }
        super().__init__(name=name, hf_subset=hf_subset, **shared_settings)
|
61 |
+
|
62 |
+
|
63 |
+
## DEFINE YOUR PROMPT FUNCTIONS
|
64 |
+
# Define as many as you need for your different tasks
|
65 |
+
def prompt_fn(line, task_name: str = None):
    """Build the `Doc` for one dataset line.

    This is a template: `line` is not read yet, and the returned `Doc` carries
    an empty query, no choices and gold index 0. Follow the examples in
    src/lighteval/tasks/tasks_prompt_formatting.py, or see the README for what
    this function should do.

    Args:
        line: One row of the dataset (currently unused by the template).
        task_name: Name of the task, forwarded to the `Doc` unchanged.

    Returns:
        The `Doc` built from the template values.
    """
    doc_fields = {
        "task_name": task_name,
        "query": "",
        "choices": [],
        "gold_index": 0,
        "instruction": "",
    }
    return Doc(**doc_fields)
|
77 |
+
|
78 |
+
|
79 |
+
## STORE YOUR EVALS
|
80 |
+
SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
|
81 |
+
_TASKS = SUBSET_TASKS + [task]
|
82 |
+
|
83 |
+
## MODULE LOGIC
|
84 |
+
# You should not need to touch this
|
85 |
+
# Convert to dict for lighteval
|
86 |
+
TASKS_TABLE = [task.as_dict() for task in _TASKS]
|
87 |
+
|
88 |
+
if __name__ == "__main__":
    # Build a list so the task names themselves are printed; handing print() a
    # bare generator expression only shows the generator object's repr.
    print([t["name"] for t in TASKS_TABLE])
    print(len(TASKS_TABLE))
|
main_backend_harness.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import pprint
|
3 |
+
|
4 |
+
from huggingface_hub import snapshot_download
|
5 |
+
|
6 |
+
from src.backend.manage_requests import (
|
7 |
+
FAILED_STATUS,
|
8 |
+
FINISHED_STATUS,
|
9 |
+
PENDING_STATUS,
|
10 |
+
RUNNING_STATUS,
|
11 |
+
check_completed_evals,
|
12 |
+
get_eval_requests,
|
13 |
+
set_eval_request,
|
14 |
+
)
|
15 |
+
from src.backend.run_eval_suite_harness import run_evaluation
|
16 |
+
from src.backend.sort_queue import sort_models_by_priority
|
17 |
+
from src.envs import (
|
18 |
+
API,
|
19 |
+
DEVICE,
|
20 |
+
EVAL_REQUESTS_PATH_BACKEND,
|
21 |
+
EVAL_RESULTS_PATH_BACKEND,
|
22 |
+
LIMIT,
|
23 |
+
NUM_FEWSHOT,
|
24 |
+
QUEUE_REPO,
|
25 |
+
RESULTS_REPO,
|
26 |
+
TASKS_HARNESS,
|
27 |
+
TOKEN,
|
28 |
+
)
|
29 |
+
from src.logging import setup_logger
|
30 |
+
|
31 |
+
|
32 |
+
logging.getLogger("openai").setLevel(logging.WARNING)
|
33 |
+
|
34 |
+
# logging.basicConfig(level=logging.ERROR)
|
35 |
+
logger = setup_logger(__name__)
|
36 |
+
pp = pprint.PrettyPrinter(width=80)
|
37 |
+
|
38 |
+
|
39 |
+
snapshot_download(
|
40 |
+
repo_id=RESULTS_REPO,
|
41 |
+
revision="main",
|
42 |
+
local_dir=EVAL_RESULTS_PATH_BACKEND,
|
43 |
+
repo_type="dataset",
|
44 |
+
max_workers=60,
|
45 |
+
token=TOKEN,
|
46 |
+
)
|
47 |
+
snapshot_download(
|
48 |
+
repo_id=QUEUE_REPO,
|
49 |
+
revision="main",
|
50 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
51 |
+
repo_type="dataset",
|
52 |
+
max_workers=60,
|
53 |
+
token=TOKEN,
|
54 |
+
)
|
55 |
+
|
56 |
+
|
57 |
+
def run_auto_eval():
    """Run one pass of the harness evaluation loop.

    Steps:
      1. Reconcile requests currently marked RUNNING via `check_completed_evals`.
      2. Fetch the PENDING requests and sort them with `sort_models_by_priority`.
      3. Mark the first request as RUNNING and evaluate it with `run_evaluation`.

    Returns early (None) when there is no pending request. At most one request
    is evaluated per call.
    """
    current_pending_status = [PENDING_STATUS]

    # pull the eval dataset from the hub and parse any eval requests
    # check completed evals and set them to finished
    check_completed_evals(
        api=API,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )

    # Get all eval requests that are PENDING; to run other evals, change this parameter
    eval_requests = get_eval_requests(
        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
    )
    # Sort the evals by priority (first submitted first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    # Use the module logger (not print) so the summary goes through the same
    # logging setup as every other message in this module.
    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if not eval_requests:
        return

    eval_request = eval_requests[0]
    logger.info(pp.pformat(eval_request))

    # Flag the request as RUNNING on the hub before starting the evaluation.
    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_HARNESS,
        num_fewshot=NUM_FEWSHOT,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        results_repo=RESULTS_REPO,
        batch_size="auto",
        device=DEVICE,
        limit=LIMIT,
    )
|
106 |
+
|
107 |
+
|
108 |
+
if __name__ == "__main__":
|
109 |
+
run_auto_eval()
|
main_backend_lighteval.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import pprint
|
3 |
+
|
4 |
+
from huggingface_hub import snapshot_download
|
5 |
+
|
6 |
+
from src.backend.manage_requests import (
|
7 |
+
FAILED_STATUS,
|
8 |
+
FINISHED_STATUS,
|
9 |
+
PENDING_STATUS,
|
10 |
+
RUNNING_STATUS,
|
11 |
+
check_completed_evals,
|
12 |
+
get_eval_requests,
|
13 |
+
set_eval_request,
|
14 |
+
)
|
15 |
+
from src.backend.run_eval_suite_lighteval import run_evaluation
|
16 |
+
from src.backend.sort_queue import sort_models_by_priority
|
17 |
+
from src.envs import (
|
18 |
+
ACCELERATOR,
|
19 |
+
API,
|
20 |
+
EVAL_REQUESTS_PATH_BACKEND,
|
21 |
+
EVAL_RESULTS_PATH_BACKEND,
|
22 |
+
LIMIT,
|
23 |
+
QUEUE_REPO,
|
24 |
+
REGION,
|
25 |
+
RESULTS_REPO,
|
26 |
+
TASKS_LIGHTEVAL,
|
27 |
+
TOKEN,
|
28 |
+
VENDOR,
|
29 |
+
)
|
30 |
+
from src.logging import setup_logger
|
31 |
+
|
32 |
+
|
33 |
+
logging.getLogger("openai").setLevel(logging.WARNING)
|
34 |
+
|
35 |
+
logger = setup_logger(__name__)
|
36 |
+
|
37 |
+
# logging.basicConfig(level=logging.ERROR)
|
38 |
+
pp = pprint.PrettyPrinter(width=80)
|
39 |
+
|
40 |
+
snapshot_download(
|
41 |
+
repo_id=RESULTS_REPO,
|
42 |
+
revision="main",
|
43 |
+
local_dir=EVAL_RESULTS_PATH_BACKEND,
|
44 |
+
repo_type="dataset",
|
45 |
+
max_workers=60,
|
46 |
+
token=TOKEN,
|
47 |
+
)
|
48 |
+
snapshot_download(
|
49 |
+
repo_id=QUEUE_REPO,
|
50 |
+
revision="main",
|
51 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
52 |
+
repo_type="dataset",
|
53 |
+
max_workers=60,
|
54 |
+
token=TOKEN,
|
55 |
+
)
|
56 |
+
|
57 |
+
|
58 |
+
def run_auto_eval():
    """Run one pass of the lighteval evaluation loop.

    Reconciles requests marked RUNNING, picks the first PENDING request after
    priority sorting, marks it RUNNING and evaluates it with `run_evaluation`
    using a fixed instance size and type. Returns None without evaluating
    anything when no request is pending.
    """
    statuses_to_run = [PENDING_STATUS]

    # pull the eval dataset from the hub and parse any eval requests
    # check completed evals and set them to finished
    check_completed_evals(
        api=API,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )

    # Collect the PENDING requests (change `statuses_to_run` to pick up other
    # statuses), ordered by priority: first submitted, first run.
    pending_requests = sort_models_by_priority(
        api=API,
        models=get_eval_requests(
            job_status=statuses_to_run, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
        ),
    )

    logger.info(f"Found {len(pending_requests)} {','.join(statuses_to_run)} eval requests")

    if len(pending_requests) == 0:
        return

    eval_request = pending_requests[0]
    logger.info(pp.pformat(eval_request))

    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    # This needs to be done
    # instance_size, instance_type = get_instance_for_model(eval_request)
    # For GPU
    # instance_size, instance_type = "small", "g4dn.xlarge"
    # For CPU
    # Updated naming available at https://huggingface.co/docs/inference-endpoints/pricing
    instance_size = "x4"
    instance_type = "intel-icl"
    logger.info(
        f"Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_LIGHTEVAL,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        batch_size=1,
        accelerator=ACCELERATOR,
        region=REGION,
        vendor=VENDOR,
        instance_size=instance_size,
        instance_type=instance_type,
        limit=LIMIT,
    )

    logger.info(
        f"Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
    )
|
124 |
+
|
125 |
+
|
126 |
+
if __name__ == "__main__":
|
127 |
+
run_auto_eval()
|
pyproject.toml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.ruff]
|
2 |
+
line-length = 119
|
3 |
+
|
4 |
+
[tool.ruff.lint]
|
5 |
+
select = ["C", "E", "F", "I", "W"]
|
6 |
+
ignore = ["E501"] # line too long (the formatter is taking care of this)
|
7 |
+
|
8 |
+
[tool.ruff.lint.isort]
|
9 |
+
lines-after-imports = 2
|
10 |
+
known-local-folder = ["src"]
|
requirements.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
APScheduler==3.10.1
|
2 |
+
click==8.1.3
|
3 |
+
huggingface-hub>=0.18.0
|
4 |
+
python-dateutil==2.8.2
|
5 |
+
requests==2.28.2
|
6 |
+
tqdm==4.65.0
|
7 |
+
accelerate>=0.26.0
|
8 |
+
sentencepiece
|
9 |
+
|
10 |
+
# Evaluation suites
|
11 |
+
lighteval>=0.5.0
|
12 |
+
lm_eval==0.4.3
|
13 |
+
|
14 |
+
# Log Visualizer
|
15 |
+
BeautifulSoup4==4.12.2
|
16 |
+
lxml==4.9.3
|
17 |
+
rich==13.3.4
|
scripts/create_request_file.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import pprint
|
4 |
+
import re
|
5 |
+
from datetime import datetime, timezone
|
6 |
+
|
7 |
+
import click
|
8 |
+
from colorama import Fore
|
9 |
+
from huggingface_hub import HfApi, snapshot_download
|
10 |
+
|
11 |
+
from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
|
12 |
+
|
13 |
+
|
14 |
+
precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
|
15 |
+
model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
|
16 |
+
weight_types = ("Original", "Delta", "Adapter")
|
17 |
+
|
18 |
+
|
19 |
+
def get_model_size(model_info, precision: str):
    """Estimate a model's size in billions of parameters.

    The size is read from `model_info.safetensors["total"]` when available.
    Otherwise it is parsed from a size marker in the lower-cased
    `model_info.modelId`, such as "7b", "1.5b" or "350m".

    Args:
        model_info: Object exposing `safetensors` and `modelId` attributes.
        precision: Precision label of the request; "GPTQ" triggers the
            size multiplier below.

    Returns:
        The size rounded to 3 decimals, multiplied by 8 when the precision is
        "GPTQ" or the model id contains "gptq". Returns 0 when the size cannot
        be determined (unknown model sizes are indicated as 0, see
        NUMERIC_INTERVALS in app.py).
    """
    # Integer part, optional decimal part, then a "b"(illion) / "m"(illion) suffix.
    size_pattern = re.compile(r"\d+(?:\.\d+)?[bm]")
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        # No usable safetensors metadata: fall back to parsing the model id.
        try:
            size_match = size_pattern.search(model_info.modelId.lower())
            size_text = size_match.group(0)
        except AttributeError:
            # Missing/None model id, or no size marker found in it.
            return 0
        magnitude = float(size_text[:-1])
        # "m" sizes are expressed in millions; convert them to billions.
        model_size = round(magnitude if size_text[-1] == "b" else magnitude / 1e3, 3)

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    return size_factor * model_size
|
34 |
+
|
35 |
+
|
36 |
+
def main():
    """Interactively create an eval request file and push it to the queue repo.

    Downloads the queue dataset, prompts for the request fields, looks the
    model up on the Hub, then (after confirmation) writes the request JSON
    under EVAL_REQUESTS_PATH and uploads it to QUEUE_REPO.

    Returns:
        1 when the model info cannot be fetched from the Hub, otherwise None.
    """
    # Authenticate the client with the same token used for the snapshot
    # download, so the model lookup and the upload use it too.
    api = HfApi(token=TOKEN)
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    snapshot_download(
        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
    )

    model_name = click.prompt("Enter model name")
    revision = click.prompt("Enter revision", default="main")
    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
    base_model = click.prompt("Enter base model", default="")
    status = click.prompt("Enter status", default="FINISHED")

    try:
        model_info = api.model_info(repo_id=model_name, revision=revision)
    except Exception as e:
        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
        return 1

    model_size = get_model_size(model_info=model_info, precision=precision)

    # The model card may be missing or may not declare a license.
    try:
        license = model_info.cardData["license"]
    except Exception:
        license = "?"

    eval_entry = {
        "model": model_name,
        "base_model": base_model,
        "revision": revision,
        "private": False,
        "precision": precision,
        "weight_type": weight_type,
        "status": status,
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": license,
    }

    # Split "<user>/<model>" names; names without a namespace keep an empty user.
    user_name = ""
    model_path = model_name
    if "/" in model_name:
        user_name, model_path = model_name.split("/")[:2]

    pprint.pprint(eval_entry)

    if not click.confirm("Do you want to continue? This request file will be pushed to the hub"):
        click.echo("aborting...")
        return

    click.echo("continuing...")

    # Only add the user directory when there is one, so namespace-less models
    # do not produce a doubled separator in the path.
    out_dir = os.path.join(EVAL_REQUESTS_PATH, user_name) if user_name else EVAL_REQUESTS_PATH
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): some precision choices contain "/" (e.g. "4bit (QLoRA / FP4)"),
    # which ends up inside this file name — confirm the intended naming.
    out_path = os.path.join(out_dir, f"{model_path}_eval_request_{False}_{precision}_{weight_type}.json")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(eval_entry, f)

    # Repo paths are relative to the dataset root and always use forward slashes.
    path_in_repo = os.path.relpath(out_path, EVAL_REQUESTS_PATH).replace(os.sep, "/")
    api.upload_file(
        path_or_fileobj=out_path,
        path_in_repo=path_in_repo,
        repo_id=QUEUE_REPO,
        repo_type="dataset",
        commit_message=f"Add {model_name} to eval queue",
    )
|
106 |
+
|
107 |
+
|
108 |
+
if __name__ == "__main__":
|
109 |
+
main()
|
scripts/fix_harness_import.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""This file should be used after pip install -r requirements.
|
2 |
+
It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignores `.json` files).
|
3 |
+
It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
|
8 |
+
import lm_eval
|
9 |
+
|
10 |
+
|
11 |
+
if __name__ == "__main__":
    # Create datasets/bigbench_resources under the first entry of the
    # installed lm_eval package path (no-op if it already exists).
    package_root = lm_eval.__path__[0]
    resources_dir = os.path.join(package_root, "datasets", "bigbench_resources")
    os.makedirs(resources_dir, exist_ok=True)
|
src/backend/manage_requests.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import json
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from typing import Optional
|
5 |
+
|
6 |
+
from huggingface_hub import HfApi, snapshot_download
|
7 |
+
|
8 |
+
from src.envs import TOKEN
|
9 |
+
from src.logging import setup_logger
|
10 |
+
|
11 |
+
|
12 |
+
logger = setup_logger(__name__)
|
13 |
+
|
14 |
+
PENDING_STATUS = "PENDING"
|
15 |
+
RUNNING_STATUS = "RUNNING"
|
16 |
+
FINISHED_STATUS = "FINISHED"
|
17 |
+
FAILED_STATUS = "FAILED"
|
18 |
+
|
19 |
+
|
20 |
+
@dataclass
class EvalRequest:
    """One evaluation request, mirroring the fields of a request JSON file."""

    model: str
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: Optional[str] = None  # pretrained, fine-tuned, etc.
    precision: str = ""  # float16, bfloat16
    revision: str = "main"  # commit hash
    # Arbitrary fixed date, so requests without a timestamp can still be ordered by date
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""
    base_model: Optional[str] = ""
    private: Optional[bool] = False

    def get_model_args(self):
        """Build the comma-separated model argument string for this request.

        Only "float16" and "bfloat16" are supported; quantized models need
        extra configuration (bits and bytes install, etc.). Edit this method
        to manage more complex quantization and map it to the evaluation
        suite you chose.

        Returns:
            A string of the form "pretrained=<model>,revision=<rev>,dtype=<precision>".

        Raises:
            Exception: If the precision is not one of the supported values.
        """
        if self.precision not in ("float16", "bfloat16"):
            raise Exception(f"Unknown precision {self.precision}.")

        return f"pretrained={self.model},revision={self.revision},dtype={self.precision}"
|
54 |
+
|
55 |
+
|
56 |
+
def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
    """Write a new status into a request's JSON file and upload that file to the hub.

    Args:
        api: Client used to upload the updated file.
        eval_request: Request whose `json_filepath` is rewritten in place.
        set_to_status: Value stored under the "status" key.
        hf_repo: Dataset repo the file is uploaded to.
        local_dir: Local prefix removed from the file path to get the in-repo path.
    """
    request_path = eval_request.json_filepath

    with open(request_path) as source:
        payload = json.load(source)

    payload["status"] = set_to_status

    with open(request_path, "w") as target:
        target.write(json.dumps(payload))

    # The in-repo path is the local path with the local directory stripped out.
    repo_relative_path = request_path.replace(local_dir, "")
    api.upload_file(
        path_or_fileobj=request_path,
        path_in_repo=repo_relative_path,
        repo_id=hf_repo,
        repo_type="dataset",
    )
|
74 |
+
|
75 |
+
|
76 |
+
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Download the requests dataset and load the requests with a matching status.

    The list follows the order in which the JSON files are found; no sorting
    is applied here.

    Args:
        job_status: Statuses to keep (tested with `in`).
        local_dir: Directory the dataset snapshot is downloaded into and scanned.
        hf_repo: Dataset repo holding the request files.

    Returns:
        `list[EvalRequest]`: one entry per matching request file, each carrying
        the path of the file it was read from in `json_filepath`.
    """
    snapshot_download(
        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
    )

    matching_requests = []
    for request_path in glob.glob(f"{local_dir}/**/*.json", recursive=True):
        with open(request_path) as fp:
            record = json.load(fp)

        if record["status"] not in job_status:
            continue

        # Remember which file the request came from so it can be updated later.
        record["json_filepath"] = request_path
        matching_requests.append(EvalRequest(**record))

    return matching_requests
|
99 |
+
|
100 |
+
|
101 |
+
def eval_was_running(eval_request: EvalRequest):
    """Return True when the request's JSON file currently stores the RUNNING status.

    The status is re-read from the file at `eval_request.json_filepath` rather
    than taken from the in-memory request; the result decides whether the
    request should be marked as FAILED.
    """
    with open(eval_request.json_filepath) as fp:
        stored_status = json.load(fp)["status"]
    return stored_status == RUNNING_STATUS
|
110 |
+
|
111 |
+
|
112 |
+
def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Check whether the currently running evals are completed and update their status on the hub.

    The results dataset is snapshotted into ``local_dir_results``, then every
    request currently in ``checked_status`` is inspected: if a
    ``results*.json`` file exists under ``<local_dir_results>/<model>/`` the
    request is set to ``completed_status``; otherwise, if its file on disk
    still says it is running, it is set to ``failed_status``.

    Args:
        api: Client used to upload the updated request files.
        hf_repo: Id of the dataset repository holding the request files.
        local_dir: Local directory the request dataset is downloaded into.
        checked_status: Status of the requests to inspect.
        completed_status: Status assigned when a result file is found.
        failed_status: Status assigned when no result file is found for a
            request that was running.
        hf_repo_results: Id of the dataset repository holding the results.
        local_dir_results: Local directory the results dataset is downloaded into.
    """
    snapshot_download(
        repo_id=hf_repo_results,
        revision="main",
        local_dir=local_dir_results,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )

    # `get_eval_requests` expects a list of statuses; handing it the bare string
    # would make its membership test a substring match.
    statuses = [checked_status] if isinstance(checked_status, str) else checked_status
    running_evals = get_eval_requests(statuses, hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        logger.info("====================================")
        logger.info(f"Checking {model}")

        # Results are stored in a folder named after the model.
        output_file = f"{local_dir_results}/{model}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        elif eval_was_running(eval_request=eval_request):
            logger.info(f"No result file found for {model} setting it to {failed_status}")
            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
|
src/backend/run_eval_suite_harness.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
from datetime import datetime
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Union
|
6 |
+
|
7 |
+
from lm_eval import evaluator, utils
|
8 |
+
from lm_eval.tasks import TaskManager
|
9 |
+
|
10 |
+
from src.backend.manage_requests import EvalRequest
|
11 |
+
from src.envs import API
|
12 |
+
from src.logging import setup_logger
|
13 |
+
|
14 |
+
|
15 |
+
logging.getLogger("openai").setLevel(logging.WARNING)
|
16 |
+
logger = setup_logger(__name__)
|
17 |
+
|
18 |
+
|
19 |
+
def run_evaluation(
    eval_request: EvalRequest,
    task_names: list,
    num_fewshot: int,
    batch_size: Union[int, str],
    device: str,
    local_dir: str,
    results_repo: str,
    limit: int = None,
):
    """Run the harness evaluation for one request, save the results and push them to the hub.

    Args:
        eval_request (EvalRequest): Input evaluation request file representation
        task_names (list): Tasks to launch; matched against the tasks known to the harness
        num_fewshot (int): Number of few shots to use
        batch_size (int or str): Selected batch size or 'auto'
        device (str): "cpu" or "cuda:0", depending on what you assigned to the space
        local_dir (str): Where to save the results locally
        results_repo (str): To which repository to upload the results
        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging

    Returns:
        The results object returned by ``evaluator.simple_evaluate``, with the
        request's precision, model name and revision added to its ``"config"`` entry.
    """
    if limit:
        logger.info(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    # Resolve the requested names against every task the task manager lists.
    task_names = utils.pattern_match(task_names, TaskManager().all_tasks)
    logger.info(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf",
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        limit=limit,
        write_out=True,  # Whether to write out an example document and model input, for checking task integrity
    )

    # Record which model the numbers belong to alongside the harness config.
    run_config = results["config"]
    run_config["model_dtype"] = eval_request.precision
    run_config["model_name"] = eval_request.model
    run_config["model_sha"] = eval_request.revision

    serialized = json.dumps(results, indent=2)
    logger.info(serialized)

    # One timestamped file per run, inside a folder named after the model.
    results_path = Path(local_dir) / eval_request.model / f"results_{datetime.now()}.json"
    results_path.parent.mkdir(parents=True, exist_ok=True)
    results_path.write_text(serialized)

    logger.info(evaluator.make_table(results))

    API.upload_file(
        path_or_fileobj=results_path,
        path_in_repo=results_path.relative_to(local_dir).as_posix(),
        repo_id=results_repo,
        repo_type="dataset",
    )

    return results
|
src/backend/run_eval_suite_lighteval.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
|
5 |
+
from lighteval.logging.evaluation_tracker import EvaluationTracker
|
6 |
+
from lighteval.models.model_config import InferenceEndpointModelConfig
|
7 |
+
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
|
8 |
+
|
9 |
+
from src.backend.manage_requests import EvalRequest
|
10 |
+
from src.envs import OWNER
|
11 |
+
from src.logging import setup_logger
|
12 |
+
|
13 |
+
|
14 |
+
logging.getLogger("openai").setLevel(logging.WARNING)
|
15 |
+
logger = setup_logger(__name__)
|
16 |
+
|
17 |
+
|
18 |
+
SPECIAL_CHARACTERS_PATTERN = re.compile(r"[^a-zA-Z0-9-]")
|
19 |
+
|
20 |
+
|
21 |
+
def run_evaluation(
    eval_request: EvalRequest,
    task_names: str,
    batch_size: int,
    local_dir: str,
    accelerator: str,
    region: str,
    vendor: str,
    instance_size: str,
    instance_type: str,
    limit=None,
):
    """Run the lighteval evaluation for one request on an inference endpoint and push the results to the hub.

    Args:
        eval_request (EvalRequest): Input evaluation request file representation
        task_names (str): Tasks to launch, passed as-is to the lighteval pipeline
        batch_size (int): Selected batch size
        local_dir (str): Where to save the results locally (accepted but not used
            by this function; the tracker writes to "./results")
        accelerator (str): Inference endpoint parameter for running the evaluation
        region (str): Inference endpoint parameter for running the evaluation
        vendor (str): Inference endpoint parameter for running the evaluation
        instance_size (str): Inference endpoint parameter for running the evaluation
        instance_type (str): Inference endpoint parameter for running the evaluation
        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging

    Returns:
        The results returned by the pipeline's ``get_results()``.

    Raises:
        Exception: whatever the pipeline raised, re-raised after the model
            cleanup has been forced.
    """
    if limit:
        logger.info(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=True,
        push_to_tensorboard=False,
        hub_results_org=OWNER,
        public=False,
    )

    params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        override_batch_size=batch_size,
        max_samples=limit,
        use_chat_template=False,
        system_prompt=None,
        # custom_tasks_directory="custom_tasks.py",  # TODO: pass if using a custom task
    )

    # The endpoint name is the lower-cased model id with every character outside
    # [a-zA-Z0-9-] replaced by a dash.
    endpoint_name = SPECIAL_CHARACTERS_PATTERN.sub("-", eval_request.model.lower())
    endpoint_config = InferenceEndpointModelConfig(
        # Endpoint parameters
        name=endpoint_name,
        repository=eval_request.model,
        accelerator=accelerator,
        vendor=vendor,
        region=region,
        instance_size=instance_size,
        instance_type=instance_type,
        should_reuse_existing=False,
        model_dtype=eval_request.precision,
        revision=eval_request.revision,
    )

    eval_pipeline = Pipeline(
        tasks=task_names,
        pipeline_parameters=params,
        evaluation_tracker=tracker,
        model_config=endpoint_config,
    )

    try:
        eval_pipeline.evaluate()
        eval_pipeline.show_results()
        eval_pipeline.save_and_push_results()
        results = eval_pipeline.get_results()

        logger.info(json.dumps(results, indent=2))
    except Exception:  # if eval failed, we force a cleanup
        eval_pipeline.model.cleanup()
        raise

    return results
|
src/backend/sort_queue.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
|
3 |
+
from huggingface_hub import HfApi
|
4 |
+
|
5 |
+
from src.backend.manage_requests import EvalRequest
|
6 |
+
|
7 |
+
|
8 |
+
@dataclass
class ModelMetadata:
    """Small record of per-model metadata with default values."""

    # Like count of the model; 0 when not provided.
    likes: int = 0
    # Model size; 15 when not provided (unit is not stated here - presumably
    # billions of parameters, TODO confirm).
    size: int = 15
|
12 |
+
|
13 |
+
|
14 |
+
# All the functions below sort the models in the queue based on different parameters
|
15 |
+
def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    """Order requests so private models come first, each group sorted by submission date.

    Args:
        api: Not used by this function.
        models: Requests to order; each exposes ``private`` and ``submitted_time``.

    Returns:
        The private requests (oldest submission first) followed by the public
        requests (oldest submission first).
    """
    private_requests, public_requests = [], []
    for request in models:
        bucket = private_requests if request.private else public_requests
        bucket.append(request)

    return sort_by_submit_date(private_requests) + sort_by_submit_date(public_requests)
|
20 |
+
|
21 |
+
|
22 |
+
def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending ``submitted_time``."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.submitted_time)
    return ordered
|
24 |
+
|
25 |
+
|
26 |
+
def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending ``params``."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.params)
    return ordered
|
28 |
+
|
29 |
+
|
30 |
+
def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending ``likes``."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.likes)
    return ordered
|
src/display/css_html_js.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CSS rules: a dark background for <pre>/<code>, and a `.scrollable` class that
# gives an element a monospace font, a fixed 500px height and scrollbars.
style_content = """
pre, code {
background-color: #272822;
}
.scrollable {
font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
height: 500px;
overflow: auto;
}
"""
# JavaScript snippet defining `refresh()`: when the page URL does not already
# carry `__theme=dark`, it sets that query parameter and reloads the page.
dark_mode_gradio_js = """
function refresh() {
const url = new URL(window.location);

if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
|
src/display/log_visualizer.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from io import StringIO
|
2 |
+
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
from rich.console import Console
|
5 |
+
from rich.syntax import Syntax
|
6 |
+
|
7 |
+
from src.display.css_html_js import style_content
|
8 |
+
from src.envs import NUM_LINES_VISUALIZE
|
9 |
+
from src.logging import log_file
|
10 |
+
|
11 |
+
|
12 |
+
def log_file_to_html_string(reverse=True):
    """Render the tail of the log file as a syntax-highlighted HTML document.

    The last ``NUM_LINES_VISUALIZE`` lines of ``log_file`` are highlighted with
    rich (Python lexer, monokai theme), exported to HTML, and post-processed so
    the ``<pre>`` element uses the ``scrollable`` class and the project's CSS is
    appended to the ``<style>`` element.

    Args:
        reverse: When true, the lines are shown in reverse file order.

    Returns:
        The prettified HTML as a string.
    """
    with open(log_file, "rt") as handle:
        tail = handle.readlines()[-NUM_LINES_VISUALIZE:]

    ordered = reversed(tail) if reverse else tail
    highlighted = Syntax("".join(ordered), "python", theme="monokai", word_wrap=True)

    # Print into an in-memory recording console so the output can be exported as HTML.
    recorder = Console(record=True, width=150, style="#272822", file=StringIO())
    recorder.print(highlighted)

    # Parse the exported HTML so its tags can be adjusted.
    document = BeautifulSoup(recorder.export_html(inline_styles=True), "lxml")

    # Swap the inline style of the <pre> tag for the .scrollable class.
    pre_tag = document.pre
    pre_tag["class"] = "scrollable"
    del pre_tag["style"]

    # Add the custom styles (including the .scrollable rule) to the <style> tag.
    document.style.append(style_content)

    return document.prettify()
|
src/envs.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from huggingface_hub import HfApi
|
4 |
+
|
5 |
+
|
6 |
+
# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org; None when HF_TOKEN is not set

OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and a requests dataset

# For harness evaluations
DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
LIMIT = 20  # !!!! For testing, should be None for actual evaluations!!!
NUM_FEWSHOT = 0  # Change with your few shot for the Harness evaluations
TASKS_HARNESS = ["anli_r1", "logiqa"]

# For lighteval evaluations
ACCELERATOR = "cpu"
REGION = "us-east-1"
VENDOR = "aws"
TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
# To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0`

# ---------------------------------------------------
# Repository ids derived from OWNER
REPO_ID = f"{OWNER}/backend"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

# If you setup a cache later, just change HF_HOME (defaults to the current directory)
CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

REFRESH_RATE = 10 * 60  # 10 min, expressed in seconds
NUM_LINES_VISUALIZE = 300  # Number of trailing log lines to display

# Hub client authenticated with TOKEN
API = HfApi(token=TOKEN)
|
src/logging.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
|
5 |
+
# Directory one level above the folder containing this module.
proj_dir = Path(__file__).parents[1]

# File that the handlers created in this module write to.
log_file = proj_dir / "output.log"
|
8 |
+
|
9 |
+
|
10 |
+
def setup_logger(name: str):
    """Return the logger called ``name``, set to INFO and writing to ``log_file``.

    A ``FileHandler`` targeting ``log_file`` is attached the first time a given
    logger is set up. Later calls with the same name reuse that handler instead
    of stacking another one, which would write every record to the file once
    per call.

    Args:
        name: Name of the logger, typically the calling module's ``__name__``.

    Returns:
        The configured ``logging.Logger``.
    """
    import os  # local import: only needed to normalise the log path below

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # FileHandler stores its target as an absolute path in `baseFilename`.
    target = os.path.abspath(os.fspath(log_file))
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in logger.handlers
    )

    if not already_attached:
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

        # Create a file handler to write logs to a file
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger
|
23 |
+
|
24 |
+
|
25 |
+
def configure_root_logger():
    """Configure the root logger at INFO level and make it write to ``log_file``.

    ``logging.basicConfig`` is applied first, then a ``FileHandler`` targeting
    ``log_file`` is attached to the root logger. Calling the function again
    does not attach a second handler for the same file, so records are not
    duplicated in the log.
    """
    import os  # local import: only needed to normalise the log path below

    # Configure the root logger
    logging.basicConfig(level=logging.INFO)
    root_logger = logging.getLogger()

    # FileHandler stores its target as an absolute path in `baseFilename`.
    target = os.path.abspath(os.fspath(log_file))
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in root_logger.handlers
    )
    if already_attached:
        return

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)

    root_logger.addHandler(file_handler)
|