LOGO = '<img src="https://raw.githubusercontent.com/huggingface/optimum-benchmark/main/logo.png">' TITLE = """<h1 align="center" id="space-title">π€ LLM-Perf Leaderboard ποΈ</h1>""" INTRODUCTION = """ The π€ LLM-Perf Leaderboard ποΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors. Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking: - Model evaluation requests should be made in the [π€ Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [π€ LLM-Perf Leaderboard ποΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically. - Hardware/Backend/Optimization performance requests should be made in the [llm-perf-backend repository](https://github.com/IlyasMoutawwakil/llm-perf-backend) and will be added to the [π€ LLM-Perf Leaderboard ποΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically. """ ABOUT = """<h3>About the π€ LLM-Perf Leaderboard ποΈ</h3> <ul> <li>To avoid communication-dependent results, only one GPU is used.</li> <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">π€ Open LLM Leaderboard</a>.</li> <li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 256 tokens.</li> <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li> <li>We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.</li> </ul> """ EXAMPLE_CONFIG = """ Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark: ```yaml defaults: - backend: pytorch - _base_ # inheriting from base config - _self_ # for hydra 1.1 compatibility experiment_name: pytorch+cuda+float16+gptq-4bit+exllama-v1 device: cuda backend: no_weights: true torch_dtype: float16 quantization_scheme: gptq quantization_config: bits: 4 use_cuda_fp16: false use_exllama: true exllama_config: version: 1 ``` Where the base config is: ```yaml defaults: - benchmark: inference # default benchmark - launcher: process # isolated process launcher - experiment # inheriting from experiment config - _self_ # for hydra 1.1 compatibility - override hydra/job_logging: colorlog # colorful logging - override hydra/hydra_logging: colorlog # colorful logging hydra: run: dir: dataset/${oc.env:HOSTNAME}/${experiment_name}/${model} job: chdir: true env_set: COUNTRY_ISO_CODE: FRA OVERRIDE_BENCHMARKS: 0 CUDA_VISIBLE_DEVICES: 0 CUDA_DEVICE_ORDER: PCI_BUS_ID backend: continuous_isolation: true benchmark: duration: 10 memory: true energy: true input_shapes: batch_size: 1 sequence_length: 256 new_tokens: 256 hub_kwargs: trust_remote_code: true ``` """ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results." 
CITATION_BUTTON = r"""@misc{llm-perf-leaderboard, author = {Ilyas Moutawwakil, RΓ©gis Pierrard}, title = {LLM-Perf Leaderboard}, year = {2023}, publisher = {Hugging Face}, howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}", @software{optimum-benchmark, author = {Ilyas Moutawwakil, RΓ©gis Pierrard}, publisher = {Hugging Face}, title = {Optimum-Benchmark: A framework for benchmarking the performance of Transformers models with different hardwares, backends and optimizations.}, } """