import gradio as gr
import pandas as pd
from tabulate import tabulate
from io import StringIO


def calculate_llm_metrics(num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_window):
    output = StringIO()

    # Print input to output buffer
    print(f" num_gpu = {num_gpu}, prompt_size = {prompt_size} tokens, response_size = {response_size} tokens", file=output)
    print(f" n_concurrent_request = {n_concurrent_request}, avg_context_window = {avg_context_window} tokens", file=output)
    # Reference GPU specs: FP16 throughput (TFLOPS), memory (GB), and memory bandwidth (GB/s)
    gpu_specs = [
        {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
        {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
        {"name": "L40", "fp16_tflops": 181, "memory_gb": 48, "memory_bandwidth_gbps": 864},
        {"name": "L40s", "fp16_tflops": 362, "memory_gb": 48, "memory_bandwidth_gbps": 864},
        {"name": "A100 40 GB", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
        {"name": "A100 40 GB SXM", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
        {"name": "A100 80 GB PCIe", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 1935},
        {"name": "A100 80 GB SXM", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 2039},
        {"name": "H100 PCIe", "fp16_tflops": 1513, "memory_gb": 80, "memory_bandwidth_gbps": 2000},
        {"name": "H100 SXM", "fp16_tflops": 1979, "memory_gb": 80, "memory_bandwidth_gbps": 3350},
        {"name": "H100 NVL", "fp16_tflops": 3958, "memory_gb": 188, "memory_bandwidth_gbps": 7800}
    ]
    model_specs = [
        {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
        {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
        {"name": "Llama-3.1-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 131072, "d_head": 128},
        {"name": "Llama-3.1-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 131072, "d_head": 128},
        {"name": "Mistral-7B-v0.3", "params_billion": 7, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 32768, "d_head": 128},
        {"name": "Falcon-7B", "params_billion": 7, "d_model": 4544, "n_heads": 71, "n_layers": 32, "max_context_window": 2048, "d_head": 64},
        {"name": "Falcon-40B", "params_billion": 40, "d_model": 8192, "n_heads": 128, "n_layers": 60, "max_context_window": 2048, "d_head": 64},
        {"name": "Falcon-180B", "params_billion": 180, "d_model": 14848, "n_heads": 232, "n_layers": 80, "max_context_window": 2048, "d_head": 64}
    ]

    BYTES_IN_GB = 1_073_741_824
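
    # KV cache per token: 2 tensors (K and V) x 2 bytes (FP16) x n_layers x d_model,
    # converted to GiB (BYTES_IN_GB = 2**30 bytes).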
    def calc_kv_cache_size_per_token(n_layers, d_model):
        return 2 * 2 * n_layers * d_model / BYTES_IN_GB
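
    # Footprint = KV cache for all in-flight tokens (avg_context_window x n_concurrent_request)
    # plus the FP16 model weights (~2 GB per billion parameters).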
    def calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window):
        kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
        target_gpu_mem = kv_cache_size_per_token * avg_context_window * n_concurrent_request + model_spec["params_billion"] * 2
        return target_gpu_mem

    print(f"\n******************** Estimate LLM Memory Footprint ********************", file=output)
    memory_footprint_table = []
    for model_spec in model_specs:
        kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
        memory_footprint = calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window)
        memory_footprint_table.append([model_spec['name'], f"{kv_cache_size_per_token:.6f} GiB/token", f"{memory_footprint:.2f} GB"])
    memory_df = pd.DataFrame(memory_footprint_table, columns=['Model', 'KV Cache Size per Token', 'Memory Footprint'])
    print(tabulate(memory_footprint_table, headers=['Model', 'KV Cache Size per Token', 'Memory Footprint'], tablefmt='orgtbl'), file=output)
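
    # KV cache token capacity: GPU memory left after loading the FP16 weights
    # (~2 GB per billion parameters), divided by the per-token KV cache size.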
    def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
        result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
        return result if result >= 0 else "OOM"
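
    # Prefill is compute-bound: ~2 * params FLOPs per prompt token. With params in billions
    # and throughput in TFLOPS, the ratio works out to milliseconds per token.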
    def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
        result = (2 * model_params_billion / num_gpu) / fp16_tflops
        return result if result >= 0 else "OOM"
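
    # Decode is memory-bound: each generated token streams the FP16 weights (~2 GB per billion
    # parameters) from memory; dividing by bandwidth in GB/s and scaling by 1000 gives ms per token.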
    def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
        result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
        return result if result >= 0 else "OOM"
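
    # End-to-end latency: prompt tokens at the prefill rate plus response tokens at the
    # decode rate, converted from milliseconds to seconds.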
    def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
        if isinstance(prefill_time, str) or isinstance(generation_time, str):
            return "OOM"
        return (prompt_size * prefill_time + response_size * generation_time) / 1000

    print(f"\n******************** Estimate LLM Capacity and Latency ********************", file=output)
    capacity_latency_table = []
    for model in model_specs:
        kv_cache_size = calc_kv_cache_size_per_token(model['n_layers'], model['d_model'])
        for gpu in gpu_specs:
            kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size)
            prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
            generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
            estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
            capacity_latency_table.append([model['name'], gpu['name'], f"{kv_cache_tokens}", f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"])
    capacity_df = pd.DataFrame(capacity_latency_table, columns=['Model', 'GPU', 'KV Cache Tokens', 'Prefill Time', 'Generation Time', 'Estimated Response Time'])
    print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'KV Cache Tokens', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'), file=output)

    return output.getvalue(), memory_df, capacity_df


# Create Gradio interface
with gr.Blocks(title="LLM Calculator") as demo:
    gr.Markdown("# A calculator to estimate the memory footprint, capacity, and latency on VMware Private AI with NVIDIA")

    with gr.Row():
        with gr.Column():
            num_gpu = gr.Slider(minimum=1, maximum=1000, value=1, step=1, label="Number of GPUs")
            prompt_size = gr.Slider(minimum=1, maximum=8192, value=4096, step=1, label="Prompt Size (tokens)")
            response_size = gr.Slider(minimum=1, maximum=4096, value=256, step=1, label="Response Size (tokens)")
            n_concurrent_request = gr.Slider(minimum=1, maximum=1000, value=10, step=1, label="Number of Concurrent Requests")
            avg_context_window = gr.Slider(minimum=1, maximum=262144, value=32768, step=1, label="Average Context Window (tokens)")
            calculate_button = gr.Button("Calculate")

    with gr.Row():
        with gr.Column():
            text_output = gr.Textbox(label="Detailed Output", lines=10)

    with gr.Row():
        with gr.Column():
            memory_table = gr.Dataframe(label="Memory Footprint Results")

    with gr.Row():
        with gr.Column():
            capacity_table = gr.Dataframe(label="Capacity and Latency Results")

    calculate_button.click(
        calculate_llm_metrics,
        inputs=[num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_window],
        outputs=[text_output, memory_table, capacity_table]
    )


if __name__ == "__main__":
    demo.launch()