Update README.md
README.md CHANGED

````diff
@@ -37,13 +37,19 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
 model_id = "Qwen/Qwen3-32B"
 
+## Step 1: Convert to float8
 from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
 quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
 quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    quantization_config=quantization_config,
+)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-
+## Step 2: Sanity check
 prompt = "Give me a short introduction to large language model."
 messages = [
     {"role": "user", "content": prompt}
@@ -75,6 +81,14 @@ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("
 
 print("thinking content:", thinking_content)
 print("content:", content)
+
+
+# Step 3: Upload to HF
+USER_ID = "YOUR_USER_ID"
+MODEL_NAME = model_id.split("/")[-1]
+save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"
+quantized_model.push_to_hub(save_to, safe_serialization=False)
+tokenizer.push_to_hub(save_to)
 ```
 
 # 4. Model Quality
````
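For context on Step 1: `TorchAoConfig(quant_type=quant_config)` tells `transformers` to run torchao's `quantize_` pass during `from_pretrained`, replacing each linear weight with a float8 tensor carrying one scale per output row, while activations are quantized to float8 dynamically at each forward pass. Below is a minimal standalone sketch of the same transform, assuming torchao's config-based `quantize_` API (spellings vary across torchao releases) and a GPU with fast float8 matmuls (compute capability 8.9+):

```python
# Minimal sketch: per-row float8 dynamic quantization applied directly with
# torchao's quantize_ API (config-based API; older releases spell the config
# as a float8_dynamic_activation_float8_weight() helper instead).
import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

model = torch.nn.Sequential(
    torch.nn.Linear(4096, 4096, dtype=torch.bfloat16, device="cuda")
)
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

x = torch.randn(1, 4096, dtype=torch.bfloat16, device="cuda")
y = model(x)  # weight is stored in float8 with one scale per output row;
              # the activation is cast to float8 on the fly each forward pass
```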
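The second hunk's context line shows that the Step 2 sanity check sitting between the hunks (README lines 50-74, unchanged by this commit) follows the standard Qwen3 model-card generation recipe. A sketch of that recipe is below; it is hedged, since those lines are not part of the diff, and token id 151668 is Qwen3's `</think>` marker:

```python
# Sketch of the sanity-check body elided between the two hunks, following the
# standard Qwen3 model-card pattern; the README's exact lines may differ.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # Qwen3 thinking mode
)
model_inputs = tokenizer([text], return_tensors="pt").to(quantized_model.device)
generated_ids = quantized_model.generate(**model_inputs, max_new_tokens=32768)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# Split the thinking trace from the answer at the </think> token (id 151668).
try:
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0
thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
```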
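A usage note beyond this commit: after Step 3, the pushed checkpoint loads back with stock `transformers`, provided torchao is installed in the loading environment (the weights deserialize into torchao tensor subclasses). `YOUR_USER_ID/Qwen3-32B-float8dq` below is the hypothetical repo id the snippet above constructs:

```python
# Load the float8 checkpoint pushed in Step 3; torchao must be installed.
# "YOUR_USER_ID/Qwen3-32B-float8dq" is the hypothetical repo id built above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

save_to = "YOUR_USER_ID/Qwen3-32B-float8dq"
model = AutoModelForCausalLM.from_pretrained(
    save_to,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(save_to)
```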