Update README.md
Browse files
README.md
CHANGED
|
@@ -236,6 +236,8 @@ print(f"Peak Memory Usage: {mem:.02f} GB")
|
|
| 236 |
| latency (batch_size=1) | 8.93s | 5.16s (1.73x speedup) |
|
| 237 |
| latency (batch_size=256) | 33.85s | 16.15s (2.10x speedup) |
|
| 238 |
|
|
|
|
|
|
|
| 239 |
<details>
|
| 240 |
<summary> Reproduce latency benchmarks </summary>
|
| 241 |
|
|
@@ -245,8 +247,13 @@ git clone git@github.com:vllm-project/vllm.git
|
|
| 245 |
cd vllm
|
| 246 |
VLLM_USE_PRECOMPILED=1 pip install --editable .
|
| 247 |
```
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
| 249 |
**2. Latency benchmarking**
|
|
|
|
|
|
|
| 250 |
```Shell
|
| 251 |
export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-FP8
|
| 252 |
VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
|
|
|
|
| 236 |
| latency (batch_size=1) | 8.93s | 5.16s (1.73x speedup) |
|
| 237 |
| latency (batch_size=256) | 33.85s | 16.15s (2.10x speedup) |
|
| 238 |
|
| 239 |
+
Note: these latency numbers were measured with `fbgemm-gpu-genai` installed.
|
| 240 |
+
|
| 241 |
<details>
|
| 242 |
<summary> Reproduce latency benchmarks </summary>
|
| 243 |
|
|
|
|
| 247 |
cd vllm
|
| 248 |
VLLM_USE_PRECOMPILED=1 pip install --editable .
|
| 249 |
```
|
| 250 |
+
To use the FBGEMM kernels, install `fbgemm-gpu-genai`:
|
| 251 |
+
```Shell
|
| 252 |
+
pip install fbgemm-gpu-genai
|
| 253 |
+
```
|
| 254 |
**2. Latency benchmarking**
|
| 255 |
+
|
| 256 |
+
|
| 257 |
```Shell
|
| 258 |
export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-FP8
|
| 259 |
VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
|