ag2435 committed on
Commit
4ab6298
·
verified ·
1 Parent(s): 609d373

copying backend space from demo leaderboard

Browse files
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+ .idea/
9
+
10
+ eval-queue/
11
+ eval-results/
12
+ eval-queue-bk/
13
+ eval-results-bk/
14
+ logs/
15
+ output.log
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Both targets are commands rather than files, so both are declared phony.
.PHONY: style quality


# Auto-fix lint findings, then reformat the code base in place.
style:
	ruff check --fix .
	ruff format .


# Check-only variant of `style`: reports problems without modifying files.
quality:
	ruff check .
	ruff format --check .
README.md CHANGED
@@ -1,12 +1,19 @@
1
  ---
2
- title: Arxiv Classifier Leaderboard Backend
3
- emoji: 📉
4
- colorFrom: red
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.16.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
+ title: Backend
3
+ emoji: 🥇
4
+ colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.1.0
8
  app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
  ---
12
 
13
+ Depending on whether you want to use lighteval or lm_eval for your evaluations, you might need to complete the
14
+ requirements.txt file to contain relevant dependencies.
15
+
16
+ You'll also need to select, in app.py, whether you want to use lighteval or lm_eval by selecting the correct
17
+ import and commenting the other.
18
+
19
+ All env variables that you should need to edit to launch the evaluations should be in `envs`.
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from functools import partial
3
+
4
+ import gradio as gr
5
+ from apscheduler.schedulers.background import BackgroundScheduler
6
+
7
+ # Choose lighteval or harness backend
8
+ # from main_backend_harness import run_auto_eval
9
+ from main_backend_lighteval import run_auto_eval
10
+
11
+ from src.display.css_html_js import dark_mode_gradio_js
12
+ from src.display.log_visualizer import log_file_to_html_string
13
+ from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
14
+ from src.logging import configure_root_logger, log_file, setup_logger
15
+
16
+
17
+ logging.getLogger("httpx").setLevel(logging.WARNING)
18
+ logging.getLogger("numexpr").setLevel(logging.WARNING)
19
+ logging.getLogger("absl").setLevel(logging.WARNING)
20
+ configure_root_logger()
21
+
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = setup_logger(__name__)
24
+
25
+
26
+ intro_md = """
27
+ # Intro
28
+ This is a visual for the auto evaluator.
29
+ """
30
+
31
+ links_md = f"""
32
+ # Important links
33
+
34
+ | Description | Link |
35
+ |-----------------|------|
36
+ | Leaderboard | [{REPO_ID}](https://huggingface.co/spaces/{REPO_ID}) |
37
+ | Queue Repo | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
38
+ | Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
39
+ """
40
+
41
+
42
+ def auto_eval():
43
+ logger.info("Triggering Auto Eval")
44
+ run_auto_eval()
45
+
46
+
47
+ reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
48
+
49
+ with gr.Blocks(js=dark_mode_gradio_js) as demo:
50
+ gr.Markdown(intro_md)
51
+ with gr.Tab("Application"):
52
+ output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
53
+ with gr.Row():
54
+ download_button = gr.DownloadButton("Download Log File", value=log_file)
55
+ with gr.Accordion("Log View Configuration", open=False):
56
+ reverse_order_checkbox.render()
57
+ # Add a button that when pressed, triggers run_auto_eval
58
+ button = gr.Button("Manually Run Evaluation")
59
+ gr.Markdown(links_md)
60
+
61
+ # dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
62
+
63
+ button.click(fn=auto_eval, inputs=[], outputs=[])
64
+
65
+ if __name__ == "__main__":
66
+ scheduler = BackgroundScheduler()
67
+ scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
68
+ scheduler.start()
69
+ demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
custom_tasks.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: F405, F403, F401
2
+ """
3
+ Custom evaluation tasks for lighteval. Complete this task with your own configuration if you want to use a custom lighteval task.
4
+
5
+ This file generally creates just a TASKS_TABLE and TASKS_GROUPS, which are then imported by LightEval.
6
+
7
+ Author:
8
+ """
9
+
10
+ from lighteval.tasks.lighteval_task import LightevalTaskConfig
11
+ from lighteval.tasks.requests import Doc
12
+
13
+
14
+ ## EVAL WITH NO SUBSET ##
15
+ # This is how you create a simple task (like hellaswag) which has one single subset
16
+ # attached to it, and one evaluation possible.
17
+ task = LightevalTaskConfig(
18
+ name="myothertask",
19
+ prompt_function="prompt_fn", # must be defined in the file or imported from lighteval.tasks.default_prompts
20
+ suite=["community"],
21
+ hf_repo="",
22
+ hf_subset="default",
23
+ hf_avail_splits=[],
24
+ evaluation_splits=[],
25
+ few_shots_split="",
26
+ few_shots_select="",
27
+ metric=[""],
28
+ )
29
+
30
+ ## EVALS WITH SUBSET
31
+ # This is how you create a subset task (like MMLU), which has several subsets,
32
+ # each being its own evaluation task.
33
+
34
+ # fmt: off
35
+ SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval
36
+ # fmt: on
37
+
38
+
39
+ class CustomSubsetTask(LightevalTaskConfig):
40
+ def __init__(
41
+ self,
42
+ name,
43
+ hf_subset,
44
+ ):
45
+ super().__init__(
46
+ name=name,
47
+ hf_subset=hf_subset,
48
+ prompt_function="prompt_fn", # must be defined in this file or imported from lighteval.tasks.default_prompts
49
+ hf_repo="",
50
+ metric=[""],
51
+ hf_avail_splits=[],
52
+ evaluation_splits=[],
53
+ few_shots_split="",
54
+ few_shots_select="",
55
+ suite=["community"],
56
+ generation_size=-1,
57
+ stop_sequence=None,
58
+ output_regex=None,
59
+ frozen=False,
60
+ )
61
+
62
+
63
+ ## DEFINE YOUR PROMPT FUNCTIONS
64
+ # Define as many as you need for your different tasks
65
+ def prompt_fn(line, task_name: str = None):
66
+ """Defines how to go from a dataset line to a doc object.
67
+ Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
68
+ about what this function should do in the README.
69
+ """
70
+ return Doc(
71
+ task_name=task_name,
72
+ query="",
73
+ choices=[],
74
+ gold_index=0,
75
+ instruction="",
76
+ )
77
+
78
+
79
+ ## STORE YOUR EVALS
80
+ SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
81
+ _TASKS = SUBSET_TASKS + [task]
82
+
83
+ ## MODULE LOGIC
84
+ # You should not need to touch this
85
+ # Convert to dict for lighteval
86
+ TASKS_TABLE = [task.as_dict() for task in _TASKS]
87
+
88
if __name__ == "__main__":
    # Materialize the names in a list: handing print() a bare generator
    # expression would only display the generator object's repr.
    print([t["name"] for t in TASKS_TABLE])
    print(len(TASKS_TABLE))
main_backend_harness.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pprint
3
+
4
+ from huggingface_hub import snapshot_download
5
+
6
+ from src.backend.manage_requests import (
7
+ FAILED_STATUS,
8
+ FINISHED_STATUS,
9
+ PENDING_STATUS,
10
+ RUNNING_STATUS,
11
+ check_completed_evals,
12
+ get_eval_requests,
13
+ set_eval_request,
14
+ )
15
+ from src.backend.run_eval_suite_harness import run_evaluation
16
+ from src.backend.sort_queue import sort_models_by_priority
17
+ from src.envs import (
18
+ API,
19
+ DEVICE,
20
+ EVAL_REQUESTS_PATH_BACKEND,
21
+ EVAL_RESULTS_PATH_BACKEND,
22
+ LIMIT,
23
+ NUM_FEWSHOT,
24
+ QUEUE_REPO,
25
+ RESULTS_REPO,
26
+ TASKS_HARNESS,
27
+ TOKEN,
28
+ )
29
+ from src.logging import setup_logger
30
+
31
+
32
+ logging.getLogger("openai").setLevel(logging.WARNING)
33
+
34
+ # logging.basicConfig(level=logging.ERROR)
35
+ logger = setup_logger(__name__)
36
+ pp = pprint.PrettyPrinter(width=80)
37
+
38
+
39
+ snapshot_download(
40
+ repo_id=RESULTS_REPO,
41
+ revision="main",
42
+ local_dir=EVAL_RESULTS_PATH_BACKEND,
43
+ repo_type="dataset",
44
+ max_workers=60,
45
+ token=TOKEN,
46
+ )
47
+ snapshot_download(
48
+ repo_id=QUEUE_REPO,
49
+ revision="main",
50
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
51
+ repo_type="dataset",
52
+ max_workers=60,
53
+ token=TOKEN,
54
+ )
55
+
56
+
57
def run_auto_eval():
    """Run at most one pending evaluation request with the lm-eval harness backend.

    The function first reconciles requests currently marked RUNNING (via
    ``check_completed_evals``), then fetches the PENDING requests, orders them
    with ``sort_models_by_priority``, marks the first one as RUNNING on the
    queue repo and evaluates it. It returns ``None`` in every case; when no
    pending request exists it returns before touching the queue.
    """
    current_pending_status = [PENDING_STATUS]

    # pull the eval dataset from the hub and parse any eval requests
    # check completed evals and set them to finished
    check_completed_evals(
        api=API,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )

    # Get all eval requests that are PENDING; if you want to run other evals, change this parameter
    eval_requests = get_eval_requests(
        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
    )
    # Sort the evals by priority (first submitted first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    # Go through the module logger (not print) so the message reaches the same
    # log output as every other message emitted by this backend.
    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if len(eval_requests) == 0:
        return

    # Only the highest-priority request is processed per invocation.
    eval_request = eval_requests[0]
    logger.info(pp.pformat(eval_request))

    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_HARNESS,
        num_fewshot=NUM_FEWSHOT,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        results_repo=RESULTS_REPO,
        batch_size="auto",
        device=DEVICE,
        limit=LIMIT,
    )
106
+
107
+
108
+ if __name__ == "__main__":
109
+ run_auto_eval()
main_backend_lighteval.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pprint
3
+
4
+ from huggingface_hub import snapshot_download
5
+
6
+ from src.backend.manage_requests import (
7
+ FAILED_STATUS,
8
+ FINISHED_STATUS,
9
+ PENDING_STATUS,
10
+ RUNNING_STATUS,
11
+ check_completed_evals,
12
+ get_eval_requests,
13
+ set_eval_request,
14
+ )
15
+ from src.backend.run_eval_suite_lighteval import run_evaluation
16
+ from src.backend.sort_queue import sort_models_by_priority
17
+ from src.envs import (
18
+ ACCELERATOR,
19
+ API,
20
+ EVAL_REQUESTS_PATH_BACKEND,
21
+ EVAL_RESULTS_PATH_BACKEND,
22
+ LIMIT,
23
+ QUEUE_REPO,
24
+ REGION,
25
+ RESULTS_REPO,
26
+ TASKS_LIGHTEVAL,
27
+ TOKEN,
28
+ VENDOR,
29
+ )
30
+ from src.logging import setup_logger
31
+
32
+
33
+ logging.getLogger("openai").setLevel(logging.WARNING)
34
+
35
+ logger = setup_logger(__name__)
36
+
37
+ # logging.basicConfig(level=logging.ERROR)
38
+ pp = pprint.PrettyPrinter(width=80)
39
+
40
+ snapshot_download(
41
+ repo_id=RESULTS_REPO,
42
+ revision="main",
43
+ local_dir=EVAL_RESULTS_PATH_BACKEND,
44
+ repo_type="dataset",
45
+ max_workers=60,
46
+ token=TOKEN,
47
+ )
48
+ snapshot_download(
49
+ repo_id=QUEUE_REPO,
50
+ revision="main",
51
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
52
+ repo_type="dataset",
53
+ max_workers=60,
54
+ token=TOKEN,
55
+ )
56
+
57
+
58
+ def run_auto_eval():
59
+ current_pending_status = [PENDING_STATUS]
60
+
61
+ # pull the eval dataset from the hub and parse any eval requests
62
+ # check completed evals and set them to finished
63
+ check_completed_evals(
64
+ api=API,
65
+ checked_status=RUNNING_STATUS,
66
+ completed_status=FINISHED_STATUS,
67
+ failed_status=FAILED_STATUS,
68
+ hf_repo=QUEUE_REPO,
69
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
70
+ hf_repo_results=RESULTS_REPO,
71
+ local_dir_results=EVAL_RESULTS_PATH_BACKEND,
72
+ )
73
+
74
+ # Get all eval requests that are PENDING; if you want to run other evals, change this parameter
75
+ eval_requests = get_eval_requests(
76
+ job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
77
+ )
78
+ # Sort the evals by priority (first submitted first run)
79
+ eval_requests = sort_models_by_priority(api=API, models=eval_requests)
80
+
81
+ logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
82
+
83
+ if len(eval_requests) == 0:
84
+ return
85
+
86
+ eval_request = eval_requests[0]
87
+ logger.info(pp.pformat(eval_request))
88
+
89
+ set_eval_request(
90
+ api=API,
91
+ eval_request=eval_request,
92
+ set_to_status=RUNNING_STATUS,
93
+ hf_repo=QUEUE_REPO,
94
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
95
+ )
96
+
97
+ # This needs to be done
98
+ # instance_size, instance_type = get_instance_for_model(eval_request)
99
+ # For GPU
100
+ # instance_size, instance_type = "small", "g4dn.xlarge"
101
+ # For CPU
102
+ # Updated naming available at https://huggingface.co/docs/inference-endpoints/pricing
103
+ instance_size, instance_type = "x4", "intel-icl"
104
+ logger.info(
105
+ f"Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
106
+ )
107
+
108
+ run_evaluation(
109
+ eval_request=eval_request,
110
+ task_names=TASKS_LIGHTEVAL,
111
+ local_dir=EVAL_RESULTS_PATH_BACKEND,
112
+ batch_size=1,
113
+ accelerator=ACCELERATOR,
114
+ region=REGION,
115
+ vendor=VENDOR,
116
+ instance_size=instance_size,
117
+ instance_type=instance_type,
118
+ limit=LIMIT,
119
+ )
120
+
121
+ logger.info(
122
+ f"Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
123
+ )
124
+
125
+
126
+ if __name__ == "__main__":
127
+ run_auto_eval()
pyproject.toml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ line-length = 119
3
+
4
+ [tool.ruff.lint]
5
+ select = ["C", "E", "F", "I", "W"]
6
+ ignore = ["E501"] # line too long (the formatter is taking care of this)
7
+
8
+ [tool.ruff.lint.isort]
9
+ lines-after-imports = 2
10
+ known-local-folder = ["src"]
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler==3.10.1
2
+ click==8.1.3
3
+ huggingface-hub>=0.18.0
4
+ python-dateutil==2.8.2
5
+ requests==2.28.2
6
+ tqdm==4.65.0
7
+ accelerate>=0.26.0
8
+ sentencepiece
9
+
10
+ # Evaluation suites
11
+ lighteval>=0.5.0
12
+ lm_eval==0.4.3
13
+
14
+ # Log Visualizer
15
+ BeautifulSoup4==4.12.2
16
+ lxml==4.9.3
17
+ rich==13.3.4
scripts/create_request_file.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pprint
4
+ import re
5
+ from datetime import datetime, timezone
6
+
7
+ import click
8
+ from colorama import Fore
9
+ from huggingface_hub import HfApi, snapshot_download
10
+
11
+ from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
12
+
13
+
14
+ precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
15
+ model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
16
+ weight_types = ("Original", "Delta", "Adapter")
17
+
18
+
19
def get_model_size(model_info, precision: str):
    """Estimate the size of a model, in billions of parameters.

    The size is read from ``model_info.safetensors["total"]`` when available.
    Otherwise it is parsed from the lower-cased ``model_info.modelId`` by
    looking for a token such as ``7b``, ``1.3b`` or ``350m``.

    Args:
        model_info: Object exposing ``safetensors`` and ``modelId`` attributes.
        precision: Precision label of the request; ``"GPTQ"`` triggers the
            size multiplier described below.

    Returns:
        The size rounded to three decimals and multiplied by 8 when the
        precision is ``"GPTQ"`` or the model id contains ``"gptq"``, or ``0``
        when the size cannot be determined.
    """
    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        # No usable safetensors metadata (attribute missing, None, or no
        # "total" entry): fall back to the size advertised in the model id.
        try:
            size_match = re.search(size_pattern, model_info.modelId.lower())
            model_size = size_match.group(0)
            # A trailing "b" means billions; "m" means millions, converted to billions.
            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
        except AttributeError:
            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

    # NOTE(review): the x8 factor presumably compensates for packed GPTQ weights - confirm.
    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    model_size = size_factor * model_size
    return model_size
34
+
35
+
36
def main():
    """Interactively build one evaluation request file and optionally push it to the queue repo.

    The queue dataset is first downloaded into ``EVAL_REQUESTS_PATH``. The user
    is then prompted for the model name, revision, precision, model type,
    weight type, base model and status. After confirmation, the request is
    written as JSON under ``EVAL_REQUESTS_PATH/<user>/`` and uploaded to
    ``QUEUE_REPO``.

    Returns:
        ``1`` when the model info cannot be fetched from the Hub, otherwise ``None``.
    """
    # Use the same token as the snapshot download below so that the upload is
    # authenticated consistently with the download.
    api = HfApi(token=TOKEN)
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    snapshot_download(
        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
    )

    model_name = click.prompt("Enter model name")
    revision = click.prompt("Enter revision", default="main")
    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
    base_model = click.prompt("Enter base model", default="")
    status = click.prompt("Enter status", default="FINISHED")

    try:
        model_info = api.model_info(repo_id=model_name, revision=revision)
    except Exception as e:
        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
        return 1

    model_size = get_model_size(model_info=model_info, precision=precision)

    # Best effort: any failure to read the license from the model card is
    # recorded as "?" instead of aborting the request creation.
    try:
        model_license = model_info.cardData["license"]
    except Exception:
        model_license = "?"

    eval_entry = {
        "model": model_name,
        "base_model": base_model,
        "revision": revision,
        "private": False,
        "precision": precision,
        "weight_type": weight_type,
        "status": status,
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": model_license,
    }

    # Request files are stored as <user>/<model>_eval_request_...json; models
    # without a namespace are written with an empty user segment.
    user_name = ""
    model_path = model_name
    if "/" in model_name:
        user_name = model_name.split("/")[0]
        model_path = model_name.split("/")[1]

    pprint.pprint(eval_entry)

    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
        click.echo("continuing...")

        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
        os.makedirs(out_dir, exist_ok=True)
        out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"

        with open(out_path, "w") as f:
            f.write(json.dumps(eval_entry))

        api.upload_file(
            path_or_fileobj=out_path,
            # Path inside the dataset repo: the local path with the download directory stripped.
            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )
    else:
        click.echo("aborting...")
106
+
107
+
108
+ if __name__ == "__main__":
109
+ main()
scripts/fix_harness_import.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This file should be used after pip install -r requirements.
2
+ It creates a folder that is not ported during harness package creation (as they don't use a Manifest file at the moment, so `.json` files are ignored).
3
+ It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
4
+ """
5
+
6
+ import os
7
+
8
+ import lm_eval
9
+
10
+
11
+ if __name__ == "__main__":
12
+ lm_eval_path = lm_eval.__path__[0]
13
+ os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/backend/manage_requests.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ from huggingface_hub import HfApi, snapshot_download
7
+
8
+ from src.envs import TOKEN
9
+ from src.logging import setup_logger
10
+
11
+
12
+ logger = setup_logger(__name__)
13
+
14
+ PENDING_STATUS = "PENDING"
15
+ RUNNING_STATUS = "RUNNING"
16
+ FINISHED_STATUS = "FINISHED"
17
+ FAILED_STATUS = "FAILED"
18
+
19
+
20
@dataclass
class EvalRequest:
    """This class represents one evaluation request file."""

    model: str
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: Optional[str] = None  # pretrained, fine-tuned, etc. - define your own categories
    precision: str = ""  # float16, bfloat16, float32
    revision: str = "main"  # commit hash
    submitted_time: Optional[str] = (
        "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
    )
    likes: Optional[int] = 0
    # The request-creation script in this repo writes a size rounded to three
    # decimals, so the value is not necessarily an integer.
    params: Optional[float] = None
    license: Optional[str] = ""
    base_model: Optional[str] = ""
    private: Optional[bool] = False

    def get_model_args(self):
        """Build the comma-separated model argument string for the evaluation suite.

        Edit this function if you want to manage more complex quantization issues. You'll need to map it to
        the evaluation suite you chose.

        Returns:
            A string of the form ``pretrained=<model>,revision=<revision>,dtype=<precision>``.

        Raises:
            ValueError: If ``precision`` is not one of the plain floating-point
                dtypes handled here.
        """
        model_args = f"pretrained={self.model},revision={self.revision}"

        # Plain floating-point precisions map directly onto a dtype argument.
        if self.precision in ["float16", "bfloat16", "float32"]:
            model_args += f",dtype={self.precision}"

        # Quantized models need some added config, the install of bits and bytes, etc
        else:
            raise ValueError(f"Unknown precision {self.precision}.")

        return model_args
54
+
55
+
56
+ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
57
+ """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
58
+ json_filepath = eval_request.json_filepath
59
+
60
+ with open(json_filepath) as fp:
61
+ data = json.load(fp)
62
+
63
+ data["status"] = set_to_status
64
+
65
+ with open(json_filepath, "w") as f:
66
+ f.write(json.dumps(data))
67
+
68
+ api.upload_file(
69
+ path_or_fileobj=json_filepath,
70
+ path_in_repo=json_filepath.replace(local_dir, ""),
71
+ repo_id=hf_repo,
72
+ repo_type="dataset",
73
+ )
74
+
75
+
76
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Download the queue repo and return the requests whose status is in `job_status`.

    Every ``*.json`` file found recursively under `local_dir` after the
    download is parsed. No ordering is applied here; the returned list follows
    the order in which ``glob`` yields the files.

    Args:
        job_status: Statuses to keep. A single status string is also accepted
            and treated as a one-element list.
        local_dir: Local directory the queue dataset is downloaded into.
        hf_repo: Id of the queue dataset repository on the Hub.

    Returns:
        `list[EvalRequest]`: one entry per matching request file.
    """
    # A bare string would make the `in` test below a substring match
    # (e.g. "" or "RUN" would match "RUNNING"), so normalize it to a list.
    if isinstance(job_status, str):
        job_status = [job_status]

    snapshot_download(
        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
    )
    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)

    eval_requests = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        if data["status"] in job_status:
            # Remember where the request came from so its status can be updated later.
            data["json_filepath"] = json_filepath
            eval_request = EvalRequest(**data)
            eval_requests.append(eval_request)

    return eval_requests
99
+
100
+
101
+ def eval_was_running(eval_request: EvalRequest):
102
+ """Checks whether a file says it's RUNNING to determine whether to FAIL"""
103
+ json_filepath = eval_request.json_filepath
104
+
105
+ with open(json_filepath) as fp:
106
+ data = json.load(fp)
107
+
108
+ status = data["status"]
109
+ return status == RUNNING_STATUS
110
+
111
+
112
def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Checks if the currently running evals are completed, if yes, update their status on the hub.

    Args:
        api: Hub client used to upload the updated request files.
        hf_repo: Id of the queue dataset repository.
        local_dir: Local directory holding the downloaded queue dataset.
        checked_status: Status of the requests to inspect.
        completed_status: Status assigned when a results file exists for the model.
        failed_status: Status assigned when no results file exists and the
            request file still says it is running.
        hf_repo_results: Id of the results dataset repository.
        local_dir_results: Local directory the results dataset is downloaded into.
    """
    snapshot_download(
        repo_id=hf_repo_results,
        revision="main",
        local_dir=local_dir_results,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )

    # get_eval_requests expects a list of statuses; wrapping the single status
    # avoids an accidental substring match against the status string.
    running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        logger.info("====================================")
        logger.info(f"Checking {model}")

        # Results are looked up under <local_dir_results>/<model>/results*.json.
        output_path = model
        output_file = f"{local_dir_results}/{output_path}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        else:
            if eval_was_running(eval_request=eval_request):
                logger.info(f"No result file found for {model} setting it to {failed_status}")
                set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
src/backend/run_eval_suite_harness.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Union
6
+
7
+ from lm_eval import evaluator, utils
8
+ from lm_eval.tasks import TaskManager
9
+
10
+ from src.backend.manage_requests import EvalRequest
11
+ from src.envs import API
12
+ from src.logging import setup_logger
13
+
14
+
15
+ logging.getLogger("openai").setLevel(logging.WARNING)
16
+ logger = setup_logger(__name__)
17
+
18
+
19
+ def run_evaluation(
20
+ eval_request: EvalRequest,
21
+ task_names: list,
22
+ num_fewshot: int,
23
+ batch_size: Union[int, str],
24
+ device: str,
25
+ local_dir: str,
26
+ results_repo: str,
27
+ limit: int = None,
28
+ ):
29
+ """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
30
+
31
+ Args:
32
+ eval_request (EvalRequest): Input evaluation request file representation
33
+ task_names (list): Tasks to launch
34
+ num_fewshot (int): Number of few shots to use
35
+ batch_size (int or str): Selected batch size or 'auto'
36
+ device (str): "cpu" or "cuda:0", depending on what you assigned to the space
37
+ local_dir (str): Where to save the results locally
38
+ results_repo (str): To which repository to upload the results
39
+ limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
40
+
41
+ Returns:
42
+ _type_: _description_
43
+ """
44
+ if limit:
45
+ logger.info(
46
+ "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
47
+ )
48
+
49
+ task_manager = TaskManager()
50
+ all_tasks = task_manager.all_tasks
51
+ task_names = utils.pattern_match(task_names, all_tasks)
52
+
53
+ logger.info(f"Selected Tasks: {task_names}")
54
+
55
+ results = evaluator.simple_evaluate(
56
+ model="hf",
57
+ model_args=eval_request.get_model_args(),
58
+ tasks=task_names,
59
+ num_fewshot=num_fewshot,
60
+ batch_size=batch_size,
61
+ device=device,
62
+ limit=limit,
63
+ write_out=True, # Whether to write out an example document and model input, for checking task integrity
64
+ )
65
+
66
+ results["config"]["model_dtype"] = eval_request.precision
67
+ results["config"]["model_name"] = eval_request.model
68
+ results["config"]["model_sha"] = eval_request.revision
69
+
70
+ dumped = json.dumps(results, indent=2)
71
+ logger.info(dumped)
72
+
73
+ results_path = Path(local_dir, eval_request.model, f"results_{datetime.now()}.json")
74
+ results_path.parent.mkdir(exist_ok=True, parents=True)
75
+ results_path.write_text(dumped)
76
+
77
+ logger.info(evaluator.make_table(results))
78
+
79
+ API.upload_file(
80
+ path_or_fileobj=results_path,
81
+ path_in_repo=results_path.relative_to(local_dir).as_posix(),
82
+ repo_id=results_repo,
83
+ repo_type="dataset",
84
+ )
85
+
86
+ return results
src/backend/run_eval_suite_lighteval.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import re
4
+
5
+ from lighteval.logging.evaluation_tracker import EvaluationTracker
6
+ from lighteval.models.model_config import InferenceEndpointModelConfig
7
+ from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
8
+
9
+ from src.backend.manage_requests import EvalRequest
10
+ from src.envs import OWNER
11
+ from src.logging import setup_logger
12
+
13
+
14
# Only let WARNING and above through for the "openai" logger.
logging.getLogger("openai").setLevel(logging.WARNING)
# Module-level logger created through the project's setup_logger helper.
logger = setup_logger(__name__)


# Matches every character that is not an ASCII letter, a digit or a hyphen.
# run_evaluation replaces each match with "-" to derive the endpoint name from the model id.
SPECIAL_CHARACTERS_PATTERN = re.compile(r"[^a-zA-Z0-9-]")
19
+
20
+
21
def run_evaluation(
    eval_request: EvalRequest,
    task_names: str,
    batch_size: int,
    local_dir: str,
    accelerator: str,
    region: str,
    vendor: str,
    instance_size: str,
    instance_type: str,
    limit=None,
):
    """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.

    Args:
        eval_request (EvalRequest): Input evaluation request file representation
        task_names (str): Tasks to launch, as a lighteval task string
            (e.g. "lighteval|anli:r1|0|0,lighteval|logiqa|0|0")
        batch_size (int): Selected batch size
        local_dir (str): Where to save the results locally.
            NOTE(review): currently unused - the tracker below always writes to "./results".
        accelerator (str): Inference endpoint parameter for running the evaluation
        region (str): Inference endpoint parameter for running the evaluation
        vendor (str): Inference endpoint parameter for running the evaluation
        instance_size (str): Inference endpoint parameter for running the evaluation
        instance_type (str): Inference endpoint parameter for running the evaluation
        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging

    Returns:
        The object returned by ``pipeline.get_results()``.

    Raises:
        Exception: Any error raised while evaluating, saving or dumping the results is
            re-raised after ``pipeline.model.cleanup()`` has been called.
    """

    if limit:
        # Emitted at WARNING level so it is not lost among regular INFO records.
        logger.warning(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=True,
        push_to_tensorboard=False,
        hub_results_org=OWNER,
        public=False,
    )

    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        override_batch_size=batch_size,
        max_samples=limit,
        use_chat_template=False,
        system_prompt=None,
        # custom_tasks_directory="custom_tasks.py",  # TODO: pass if using a custom task
    )

    model_config = InferenceEndpointModelConfig(
        # Endpoint parameters
        # The endpoint name is the lower-cased model id with every character outside
        # [a-zA-Z0-9-] replaced by "-".
        name=SPECIAL_CHARACTERS_PATTERN.sub("-", eval_request.model.lower()),
        repository=eval_request.model,
        accelerator=accelerator,
        vendor=vendor,
        region=region,
        instance_size=instance_size,
        instance_type=instance_type,
        should_reuse_existing=False,
        model_dtype=eval_request.precision,
        revision=eval_request.revision,
    )

    pipeline = Pipeline(
        tasks=task_names,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
    )

    try:
        pipeline.evaluate()
        pipeline.show_results()
        pipeline.save_and_push_results()
        results = pipeline.get_results()

        dumped = json.dumps(results, indent=2)
        logger.info(dumped)

    except Exception:  # if eval failed, we force a cleanup
        pipeline.model.cleanup()
        raise

    return results
src/backend/sort_queue.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ from src.backend.manage_requests import EvalRequest
6
+
7
+
8
@dataclass
class ModelMetadata:
    """Likes and size recorded for a model, each with a default value."""

    # Like count; 0 unless a value is supplied.
    likes: int = 0
    # Model size; 15 unless a value is supplied (the unit is not stated in this file).
    size: int = 15
12
+
13
+
14
+ # All the functions below sort the models in the queue based on different parameters
15
def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    """Order the queue so private requests come before public ones.

    Within each group the requests are ordered by ``sort_by_submit_date``.

    Args:
        api (HfApi): Not used by this function.
        models (list[EvalRequest]): Requests to order; the input list is not modified.

    Returns:
        list[EvalRequest]: A new list, private requests first, then public ones.
    """
    private_queue = []
    public_queue = []
    # Single pass: route each request to its group, preserving input order.
    for request in models:
        if request.private:
            private_queue.append(request)
        else:
            public_queue.append(request)

    ordered = sort_by_submit_date(private_queue)
    ordered = ordered + sort_by_submit_date(public_queue)
    return ordered
20
+
21
+
22
def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests in ascending ``submitted_time`` order."""
    ordered = list(eval_requests)
    # list.sort is stable, so requests with equal submitted_time keep their input order.
    ordered.sort(key=lambda request: request.submitted_time)
    return ordered
24
+
25
+
26
def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests in ascending ``params`` order."""

    def _params(request):
        return request.params

    return sorted(eval_requests, key=_params)
28
+
29
+
30
def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests in ascending ``likes`` order."""
    by_likes = list(eval_requests)
    by_likes.sort(key=lambda request: request.likes)
    return by_likes
src/display/css_html_js.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CSS text: a dark background for <pre>/<code> and a fixed-height, scrollable,
# monospace ".scrollable" class.
style_content = """
pre, code {
background-color: #272822;
}
.scrollable {
font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
height: 500px;
overflow: auto;
}
"""
# JavaScript text defining refresh(): unless the "__theme" query parameter is
# already "dark", it sets it to "dark" and navigates to the resulting URL.
dark_mode_gradio_js = """
function refresh() {
const url = new URL(window.location);

if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
src/display/log_visualizer.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import StringIO
2
+
3
+ from bs4 import BeautifulSoup
4
+ from rich.console import Console
5
+ from rich.syntax import Syntax
6
+
7
+ from src.display.css_html_js import style_content
8
+ from src.envs import NUM_LINES_VISUALIZE
9
+ from src.logging import log_file
10
+
11
+
12
def log_file_to_html_string(reverse=True):
    """Render the last ``NUM_LINES_VISUALIZE`` lines of the log file as highlighted HTML.

    Args:
        reverse (bool): When true (the default), the lines are emitted last-to-first,
            so the most recently written line appears at the top.

    Returns:
        str: The prettified HTML document, with the ``<pre>`` element given the
        ``scrollable`` class and ``style_content`` appended to the ``<style>`` element.

    Raises:
        OSError: If the log file cannot be opened.
    """
    from collections import deque

    # Stream the file through a bounded deque so only the tail is kept in memory,
    # instead of loading the whole (ever-growing) log with readlines().
    with open(log_file, "rt") as f:
        lines = deque(f, maxlen=NUM_LINES_VISUALIZE)

    if reverse:
        lines = reversed(lines)

    output = "".join(lines)
    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)

    # Record into an in-memory buffer so nothing is printed to the real stdout.
    console = Console(record=True, width=150, style="#272822", file=StringIO())
    console.print(syntax)
    html_content = console.export_html(inline_styles=True)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "lxml")

    # Modify the <pre> tag and add custom styles
    pre_tag = soup.pre
    pre_tag["class"] = "scrollable"
    del pre_tag["style"]

    # Add your custom styles and the .scrollable CSS to the <style> tag
    style_tag = soup.style
    style_tag.append(style_content)

    return soup.prettify()
src/envs.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+
6
# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org (None when HF_TOKEN is not set)

OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset

# For harness evaluations
DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
LIMIT = 20  # !!!! For testing, should be None for actual evaluations!!!
NUM_FEWSHOT = 0  # Change with your few shot for the Harness evaluations
TASKS_HARNESS = ["anli_r1", "logiqa"]

# For lighteval evaluations
ACCELERATOR = "cpu"
REGION = "us-east-1"
VENDOR = "aws"
TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
# To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0`

# ---------------------------------------------------
# Hub repositories, all derived from OWNER.
REPO_ID = f"{OWNER}/backend"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

# If you set up a cache later, just change HF_HOME (defaults to the current directory)
CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

REFRESH_RATE = 10 * 60  # 10 min, expressed in seconds
NUM_LINES_VISUALIZE = 300  # Number of trailing log lines rendered by the log visualizer

# Hub API client authenticated with TOKEN.
API = HfApi(token=TOKEN)
src/logging.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+
4
+
5
# Parent of the directory that contains this file.
proj_dir = Path(__file__).parents[1]

# Log file targeted by every FileHandler created in this module.
log_file = proj_dir / "output.log"
8
+
9
+
10
def setup_logger(name: str):
    """Return the logger called ``name``, set to INFO and writing to ``log_file``.

    The call is idempotent: if the logger already has a ``FileHandler`` for
    ``log_file``, no second handler is attached, so repeated calls with the same
    name do not duplicate every record in the file.

    NOTE(review): records still propagate to the root logger, so if
    ``configure_root_logger`` has also been called each record reaches ``log_file``
    twice - confirm whether that is intended.

    Args:
        name (str): Logger name, typically the caller's ``__name__``.

    Returns:
        logging.Logger: The configured logger.
    """
    import os

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # FileHandler keeps its target as an absolute path in ``baseFilename``.
    target = os.path.abspath(os.fspath(log_file))
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == target:
            return logger

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Create a file handler to write logs to a file
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger
23
+
24
+
25
def configure_root_logger():
    """Configure the root logger at INFO level and make it write to ``log_file``.

    ``logging.basicConfig`` is called first, then a ``FileHandler`` for ``log_file``
    is attached to the root logger. The call is idempotent: when the root logger
    already has a ``FileHandler`` for ``log_file``, no second one is added, so
    calling this function more than once does not duplicate records in the file.
    """
    import os

    # Configure the root logger
    logging.basicConfig(level=logging.INFO)
    root_logger = logging.getLogger()

    # FileHandler keeps its target as an absolute path in ``baseFilename``.
    target = os.path.abspath(os.fspath(log_file))
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in root_logger.handlers
    )
    if already_attached:
        return

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)

    root_logger.addHandler(file_handler)