Update README.md
README.md (CHANGED)
@@ -18,13 +18,13 @@ pipeline_tag: text-generation
 # Inference with vLLM
 ```Shell
 # Server
-VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Qwen3-32B-
+VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Qwen3-32B-FP8 --tokenizer Qwen/Qwen3-32B -O3
 ```

 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "pytorch/Qwen3-32B-
+"model": "pytorch/Qwen3-32B-FP8",
 "messages": [
 {"role": "user", "content": "Give me a short introduction to large language models."}
 ],
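For reference, the same request can also be issued from Python through vLLM's OpenAI-compatible API. A minimal sketch, assuming the server started above is listening on localhost:8000 and the `openai` package is installed (not part of this diff):

```Py
# Same request as the curl example, sent through vLLM's OpenAI-compatible endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pytorch/Qwen3-32B-FP8",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
)
print(response.choices[0].message.content)
```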
@@ -41,7 +41,7 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-model_name = "pytorch/Qwen3-32B-
+model_name = "pytorch/Qwen3-32B-FP8"

 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
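This hunk only renames `model_name`; the rest of the card's generation snippet is not shown. For context, a typical end-to-end flow with this checkpoint looks roughly like the sketch below (standard transformers API; the generation settings are assumptions, not taken from the card):

```Py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "pytorch/Qwen3-32B-FP8"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",   # assumption: let transformers pick the stored dtype
    device_map="auto",
)

messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

output = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```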
@@ -119,7 +119,7 @@ Optionally, upload to your HF hub
 ```Py
 USER_ID = "YOUR_USER_ID"
 MODEL_NAME = model_id.split("/")[-1]
-save_to = f"{USER_ID}/{MODEL_NAME}-
+save_to = f"{USER_ID}/{MODEL_NAME}-FP8"
 quantized_model.push_to_hub(save_to, safe_serialization=False)
 tokenizer.push_to_hub(save_to)
 ```
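The hunk above only renames the upload target; it does not show how `quantized_model` is produced. One plausible recipe, sketched under the assumption that the card quantizes with torchao's float8 dynamic quantization through transformers' `TorchAoConfig` (this step is not in the diff):

```Py
# Assumed quantization step: float8 dynamic activation + float8 weight via torchao.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

model_id = "Qwen/Qwen3-32B"
quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())

quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=TorchAoConfig(quant_type=quant_config),
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```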
@@ -129,7 +129,7 @@ We rely on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)

 | Benchmark | | |
 |----------------------------------|----------------|---------------------------|
-| | Qwen3-32B | Qwen3-32B-
+| | Qwen3-32B | Qwen3-32B-FP8 |
 | **General** | | |
 | mmlu | 80.71 | 80.67 |
 | bbh | 37.49 | 38.01 |
@@ -151,9 +151,9 @@ https://github.com/EleutherAI/lm-evaluation-harness#install
 lm_eval --model hf --model_args pretrained=Qwen/Qwen3-32B --tasks mmlu --device cuda:0 --batch_size 8
 ```

-## float8 dynamic quantization (
+## float8 dynamic quantization (FP8)
 ```Shell
-export MODEL=pytorch/Qwen3-32B-
+export MODEL=pytorch/Qwen3-32B-FP8
 # or
 # export MODEL=Qwen/Qwen3-32B
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
@@ -164,7 +164,7 @@ lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8

 | Memory (tested on H100) | | |
 |----------------------------------|----------------|-------------------------------|
-| | Qwen3-32B | Qwen3-32B-
+| | Qwen3-32B | Qwen3-32B-FP8 |
 | Peak Memory | 65.72 GB | 34.54 GB (47.44% reduction) |

 <details>
@@ -175,7 +175,7 @@ Code
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-model_name = "Qwen/Qwen3-32B" # pytorch/Qwen3-32B-
+model_name = "Qwen/Qwen3-32B" # pytorch/Qwen3-32B-FP8

 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
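The peak-memory figure referenced in the next hunk's context line (`print(f"Peak Memory Usage: {mem:.02f} GB")`) comes from the collapsed `<details>` block, which the diff does not show in full. A minimal sketch of how such a number is typically collected, assuming `mem` is derived from CUDA's peak memory statistics:

```Py
# Assumed measurement: reset the peak counter, run generation, then read it back in GB.
import torch

torch.cuda.reset_peak_memory_stats()
# ... run generation with the loaded model here ...
mem = torch.cuda.max_memory_reserved() / 1e9  # GB
print(f"Peak Memory Usage: {mem:.02f} GB")
```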
@@ -232,7 +232,7 @@ print(f"Peak Memory Usage: {mem:.02f} GB")

 | Benchmark (Tested on H100) | | |
 |----------------------------------|----------------|-------------------------------|
-| | Qwen3-32B | Qwen3-32B-
+| | Qwen3-32B | Qwen3-32B-FP8 |
 | latency (batch_size=1) | 9.1s | 5.77s (1.58x speedup) |
 | latency (batch_size=128) | 12.45s | 8.40s (1.48x speedup) |

@@ -248,7 +248,7 @@ VLLM_USE_PRECOMPILED=1 pip install --editable .

 **2. Latency benchmarking**
 ```Shell
-export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-
+export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-FP8
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
 </details>