Update README.md
README.md (CHANGED)
@@ -11,7 +11,7 @@ language:
 
 # AWQ-INT4 google/gemma-3-12b-it model
 
-- **Developed by:**
+- **Developed by:** pytorch
 - **License:** apache-2.0
 - **Quantized from Model :** google/gemma-3-12b-it
 - **Quantization Method :** AWQ-INT4

@@ -33,14 +33,14 @@ pip install torchao
 Then we can serve with the following command:
 ```Shell
 # Server
-export MODEL=
+export MODEL=pytorch/gemma-3-12b-it-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3
 ```
 
 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-  "model": "
+  "model": "pytorch/gemma-3-12b-it-AWQ-INT4",
   "messages": [
     {"role": "user", "content": "Give me a short introduction to large language models."}
   ],

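Editor's aside (not part of this change): the server started above exposes an OpenAI-compatible API, so the same request can also be sent from Python. A minimal sketch, assuming the `openai` package is installed and vLLM is listening on the default port 8000; the client code below is not taken from the README.

```Python
# Hypothetical client-side sketch (not from the README): talks to the
# OpenAI-compatible endpoint that `vllm serve` exposes on localhost:8000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # vLLM ignores the key

response = client.chat.completions.create(
    model="pytorch/gemma-3-12b-it-AWQ-INT4",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
)
print(response.choices[0].message.content)
```
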
@@ -69,7 +69,7 @@ Example:
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-model_name = "
+model_name = "pytorch/gemma-3-12b-it-AWQ-INT4"
 
 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)

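Editor's aside (not part of this change): the hunk above shows only the head of the Transformers example; the rest lies outside the diff context. The sketch below is one plausible end-to-end continuation, mirroring the loading lines visible in the diff and assuming a single CUDA device, the tokenizer's built-in chat template, and an arbitrary `max_new_tokens`; the README's actual code may differ.

```Python
# Illustrative sketch only: loading mirrors the hunk above; the chat-template
# and generation steps are assumptions about how the example continues.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "pytorch/gemma-3-12b-it-AWQ-INT4"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="cuda:0", torch_dtype=torch.bfloat16
)

# build a chat-formatted prompt and generate
messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```
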
@@ -240,7 +240,7 @@ lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it --tasks mmlu --
 
 ## AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/gemma-3-12b-it-AWQ-INT4
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
 ```
 </details>

@@ -268,8 +268,8 @@ We can use the following code to get a sense of peak memory usage during inference
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
-# use "google/gemma-3-12b-it" or "
-model_id = "
+# use "google/gemma-3-12b-it" or "pytorch/gemma-3-12b-it-AWQ-INT4"
+model_id = "pytorch/gemma-3-12b-it-AWQ-INT4"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 

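Editor's aside (not part of this change): the measurement part of this peak-memory check falls below the hunk and is not shown in the diff. One plausible way to complete it, assuming a single CUDA device and using the standard `torch.cuda` memory-statistics API; the prompt and `max_new_tokens` are placeholders, and the README's actual code may differ.

```Python
# Illustrative sketch only: loading mirrors the hunk above; the peak-memory
# measurement below is an assumption about how the check is completed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pytorch/gemma-3-12b-it-AWQ-INT4"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="cuda:0", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()  # start counting from a clean slate

prompt = "Give me a short introduction to large language models."
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)
outputs = quantized_model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

peak_gb = torch.cuda.max_memory_allocated() / 1024**3
print(f"Peak CUDA memory during inference: {peak_gb:.2f} GB")
```
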
@@ -349,7 +349,7 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model
 
 ### AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/gemma-3-12b-it-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
 </details>