---
license: apache-2.0
train: false
inference: false
pipeline_tag: text-generation
---
This is an <a href="https://github.com/mobiusml/hqq/">HQQ</a>-quantized version of <a href="https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M">Qwen2.5-7B-Instruct-1M</a>, with all linear layers quantized to 4-bit (group-size=64).
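For reference, a comparable quantization can be reproduced on the fly with the `HqqConfig` integration in transformers. The sketch below only fixes the 4-bit / group-size-64 settings stated above; everything else uses library defaults and is an assumption, not necessarily the exact recipe behind this checkpoint.

```Python
# Minimal sketch: on-the-fly HQQ quantization of the base model (4-bit, group-size 64).
# Settings beyond nb_bits/group_size are assumptions, not the exact recipe of this upload.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

base_model_id = "Qwen/Qwen2.5-7B-Instruct-1M"
quant_config = HqqConfig(nb_bits=4, group_size=64)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
    quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
```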
## Usage

First, install the dependencies:

```
pip install git+https://github.com/mobiusml/hqq.git
pip install git+https://github.com/mobiusml/gemlite.git  # to use the gemlite backend
pip install bitblas  # to use the bitblas backend
```

Then you can use the sample code below:
```Python
import torch

device = 'cuda:0'
backend = 'torchao_int4'  # "torchao_int4" (4-bit only), "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
cache_dir = None
model_id = 'mobiuslabsgmbh/Qwen2.5-7B-Instruct-1M_4bitgs64_hqq_hf'

is_prequantized = 'hqq_hf' in model_id  # True here: this checkpoint is already HQQ-quantized

########################################################################
# Load the pre-quantized model and its tokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    cache_dir=cache_dir,
    device_map=device,
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

# Optionally save the quantized model before patching (set saved_quant_model to a target directory)
# model.save_pretrained(saved_quant_model)
# tokenizer.save_pretrained(saved_quant_model)

# Patch the quantized layers for the selected inference backend
from hqq.utils.patching import prepare_for_inference
prepare_for_inference(model, backend=backend, verbose=True)

# Load the GemLite kernel-config cache (if one was saved previously)
if backend == 'gemlite':
    import gemlite
    gemlite.core.GEMLITE_TRITON_RESTRICT_M = True
    gemlite.core.GemLiteLinear.load_config('/tmp/gemlite_config.json')

########################################################################
# Alternative: inference using the custom HQQ generator. Currently, manual compile
# breaks with pre-quantized models, so this path is left commented out:
# from hqq.utils.generation_hf import HFGenerator
# gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=False).enable_cuda_graph()
# out = gen.generate("Write an essay about large language models.", print_tokens=True)

########################################################################
# Inference with model.generate()
from hqq.utils.generation_hf import patch_model_for_compiled_runtime

patch_model_for_compiled_runtime(model, tokenizer)

prompt = "Write an essay about large language models."

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)

outputs = model.generate(
    **inputs.to(model.device),
    max_new_tokens=1000,
    cache_implementation="static",
    pad_token_id=tokenizer.pad_token_id,
)

# Optionally decode the generated tokens:
# print(tokenizer.decode(outputs[0]))

########################################################################
# Save the GemLite kernel-config cache for faster startup next time
if backend == 'gemlite':
    gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')
```
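As a quick sanity check on the 4-bit memory savings, the weight footprint of the `model` object from the snippet above can be inspected (optional; exact numbers depend on the backend and compute dtype):

```Python
# Optional: report the approximate memory taken by the quantized weights and buffers.
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model footprint: {footprint_gb:.2f} GB")
```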
Use with <a href="https://github.com/vllm-project/vllm/">vLLM</a>:

```Python
from vllm import LLM
from vllm.sampling_params import SamplingParams

model_id = "mobiuslabsgmbh/Qwen2.5-7B-Instruct-1M_4bitgs64_hqq_hf"

llm = LLM(model=model_id, max_model_len=4096, enable_chunked_prefill=False)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
print(outputs[0].outputs[0].text)
```
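For chat-style prompts, the request can be formatted with the model's chat template before being passed to `llm.generate`. This is a minimal sketch that reuses `llm`, `model_id`, and `sampling_params` from the snippet above:

```Python
from transformers import AutoTokenizer

# Format the user message with the model's chat template, then generate as usual.
tokenizer = AutoTokenizer.from_pretrained(model_id)
chat_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of Germany?"}],
    tokenize=False,
    add_generation_prompt=True,
)
outputs = llm.generate([chat_prompt], sampling_params)
print(outputs[0].outputs[0].text)
```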