import os

import pandas as pd
from huggingface_hub import hf_hub_download

from .utils import process_quantization_scheme, process_arch

LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
HF_TOKEN = os.environ.get("HF_TOKEN", None)

COLUMNS_MAPPING = {
    "Model": "Model 🤗",
    "experiment_name": "Experiment 🧪",
    # primary measurements
    "forward.latency(s)": "Prefill (s)",
    "decode.throughput(tokens/s)": "Decode (tokens/s)",
    "generate.max_memory_allocated(MB)": "Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "DType 📥",
    "optimization": "Optimization 🛠️",
    "quantization": "Quantization 🗜️",
    # additional measurements
    "Size": "Params (B)",
    "Arch": "Architecture 🏛️",
    "Score": "Open LLM Score (%)",
    "generate.latency(s)": "End-to-End (s)",
    "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Decode (tokens/s)",
    "Prefill (s)",
]
# higher score first, higher decode throughput first, lower prefill latency first
SORTING_ASCENDING = [False, False, True]


def get_llm_df():
    # the Open LLM Leaderboard scraping script is currently broken,
    # so we download a pre-scraped copy from the dataset repo instead
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    llm_df = pd.read_csv("dataset/open-llm.csv")

    return llm_df


def get_perf_df(machine: str = "hf-dgx-01"):
    # download the performance report benchmarked on the given machine
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/perf-report.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")

    return perf_df


def get_llm_perf_df(machine: str = "hf-dgx-01"):
    # get dataframes
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
    # sanity checks: all runs must share the same batch size, sequence length and number of new tokens
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1
    # invert energy consumption from kWh/token to tokens/kWh
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    # missing measurements were filled with 1 above so the cast to int works; restore them to NA
    llm_perf_df.loc[
        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA
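    # illustrative example (not from the source data): a run measured at 2e-5 kWh/token
    # would be reported as 1 / 2e-5 = 50_000 tokens/kWh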
    # add optimization column
    llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
        lambda x: (
            "BetterTransformer"
            if x["backend.to_bettertransformer"]
            else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None")
        ),
        axis=1,
    )
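    # e.g. a run with backend.use_flash_attention_2=True (and to_bettertransformer=False)
    # is labeled "FlashAttentionV2"; runs with neither flag set are labeled "None"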
    # add quantization scheme
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.bits",
            "backend.quantization_config.version",
            "backend.quantization_config.load_in_4bit",
            "backend.quantization_config.load_in_8bit",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(process_quantization_scheme, axis=1)
    # shorten experiment names for display
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
    # for quantized runs, the dtype is implied by the scheme, so drop it from the name
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
        lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
    )
    for old, new in {
        "bnb-4bit": "BnB-4bit",
        "bnb-8bit": "BnB-8bit",
        "awq-4bit": "AWQ-4bit",
        "gptq-4bit": "GPTQ-4bit",
        "bettertransformer": "SDPA",
        "flash-attention-v2": "FA-v2",
    }.items():
        llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].str.replace(old, new, regex=False)
    # normalize architecture names
    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
    # keep only the columns we display
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
    # rename columns
    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
    # sort by metric
    llm_perf_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
        inplace=True,
    )

    return llm_perf_df
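

# a minimal smoke-test sketch, not part of the original module: it assumes network access
# to the optimum/llm-perf-dataset repo and, if the dataset requires authentication, an
# HF_TOKEN in the environment; "hf-dgx-01" is simply the default machine name defined above.
# because of the relative import at the top, run it with `python -m <package>.<module>`
# from the package root rather than executing the file directly
if __name__ == "__main__":
    df = get_llm_perf_df(machine="hf-dgx-01")
    # show the top entries after sorting by Open LLM score, decode throughput and prefill latency
    print(df.head(10).to_string())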