Update README.md
README.md (CHANGED)
@@ -18,13 +18,13 @@ pipeline_tag: text-generation
 # Inference with vLLM
 ```Shell
 # Server
-VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Qwen3-32B-
+VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Qwen3-32B-FP8 --tokenizer Qwen/Qwen3-32B -O3
 ```

 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "pytorch/Qwen3-32B-
+"model": "pytorch/Qwen3-32B-FP8",
 "messages": [
 {"role": "user", "content": "Give me a short introduction to large language models."}
 ],
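For reference, the same request can also be issued from Python through vLLM's OpenAI-compatible API. A minimal sketch, assuming the server started above is listening on localhost:8000 and the `openai` package is installed (not part of this diff):

```Py
# Same request as the curl example, sent through vLLM's OpenAI-compatible endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pytorch/Qwen3-32B-FP8",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
)
print(response.choices[0].message.content)
```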
@@ -41,7 +41,7 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-model_name = "pytorch/Qwen3-32B-
+model_name = "pytorch/Qwen3-32B-FP8"

 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
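This hunk only renames `model_name`; the rest of the card's generation snippet is not shown. For context, a typical end-to-end flow with this checkpoint looks roughly like the sketch below (standard transformers API; the generation settings are assumptions, not taken from the card):

```Py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "pytorch/Qwen3-32B-FP8"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",   # assumption: let transformers pick the stored dtype
    device_map="auto",
)

messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

output = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```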
@@ -119,7 +119,7 @@ Optionally, upload to your HF hub
 ```Py
 USER_ID = "YOUR_USER_ID"
 MODEL_NAME = model_id.split("/")[-1]
-save_to = f"{USER_ID}/{MODEL_NAME}-
+save_to = f"{USER_ID}/{MODEL_NAME}-FP8"
 quantized_model.push_to_hub(save_to, safe_serialization=False)
 tokenizer.push_to_hub(save_to)
 ```
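The hunk above only renames the upload target; it does not show how `quantized_model` is produced. One plausible recipe, sketched under the assumption that the card quantizes with torchao's float8 dynamic quantization through transformers' `TorchAoConfig` (this step is not in the diff):

```Py
# Assumed quantization step: float8 dynamic activation + float8 weight via torchao.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

model_id = "Qwen/Qwen3-32B"
quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())

quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=TorchAoConfig(quant_type=quant_config),
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```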
@@ -129,7 +129,7 @@ We rely on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)

 | Benchmark | | |
 |----------------------------------|----------------|---------------------------|
-| | Qwen3-32B | Qwen3-32B-
+| | Qwen3-32B | Qwen3-32B-FP8 |
 | **General** | | |
 | mmlu | 80.71 | 80.67 |
 | bbh | 37.49 | 38.01 |
@@ -151,9 +151,9 @@ https://github.com/EleutherAI/lm-evaluation-harness#install
 lm_eval --model hf --model_args pretrained=Qwen/Qwen3-32B --tasks mmlu --device cuda:0 --batch_size 8
 ```

-## float8 dynamic quantization (
+## float8 dynamic quantization (FP8)
 ```Shell
-export MODEL=pytorch/Qwen3-32B-
+export MODEL=pytorch/Qwen3-32B-FP8
 # or
 # export MODEL=Qwen/Qwen3-32B
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
@@ -164,7 +164,7 @@ lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8

 | Memory (tested on H100) | | |
 |----------------------------------|----------------|-------------------------------|
-| | Qwen3-32B | Qwen3-32B-
+| | Qwen3-32B | Qwen3-32B-FP8 |
 | Peak Memory | 65.72 GB | 34.54 GB (47.44% reduction) |

 <details>
@@ -175,7 +175,7 @@ Code
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-model_name = "Qwen/Qwen3-32B" # pytorch/Qwen3-32B-
+model_name = "Qwen/Qwen3-32B" # pytorch/Qwen3-32B-FP8

 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
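The peak-memory figure referenced in the next hunk's context line (`print(f"Peak Memory Usage: {mem:.02f} GB")`) comes from the collapsed `<details>` block, which the diff does not show in full. A minimal sketch of how such a number is typically collected, assuming `mem` is derived from CUDA's peak memory statistics:

```Py
# Assumed measurement: reset the peak counter, run generation, then read it back in GB.
import torch

torch.cuda.reset_peak_memory_stats()
# ... run generation with the loaded model here ...
mem = torch.cuda.max_memory_reserved() / 1e9  # GB
print(f"Peak Memory Usage: {mem:.02f} GB")
```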
@@ -232,7 +232,7 @@ print(f"Peak Memory Usage: {mem:.02f} GB")

 | Benchmark (Tested on H100) | | |
 |----------------------------------|----------------|-------------------------------|
-| | Qwen3-32B | Qwen3-32B-
+| | Qwen3-32B | Qwen3-32B-FP8 |
 | latency (batch_size=1) | 9.1s | 5.77s (1.58x speedup) |
 | latency (batch_size=128) | 12.45s | 8.40s (1.48x speedup) |

@@ -248,7 +248,7 @@ VLLM_USE_PRECOMPILED=1 pip install --editable .

 **2. Latency benchmarking**
 ```Shell
-export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-
+export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-FP8
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
 </details>