nytopop
/

Qwen2.5-Coder-7B-Instruct.w8a8

Text Generation

text-generation-inference

8-bit precision

compressed-tensors

Model card Files Files and versions

Qwen2.5-Coder-7B-Instruct.w8a8 / README.md

nytopop's picture

Upload folder using huggingface_hub

daf6d82 verified 7 months ago

|

history blame contribute delete

1.54 kB

	---
	library_name: transformers
	license: apache-2.0
	base_model: Qwen/Qwen2.5-Coder-7B-Instruct
	---

	```python
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from datasets import load_dataset
	from llmcompressor import oneshot
	from llmcompressor.modifiers.quantization import GPTQModifier
	from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

	# Source checkpoint to quantize and the directory the result is written to.
	model_id = "Qwen/Qwen2.5-Coder-7B-Instruct"
	model_out = "Qwen2.5-Coder-7B-Instruct.w8a8"

	# Calibration budget: number of samples drawn and the sequence-length cap
	# later passed to oneshot().
	num_samples = 128
	max_seq_len = 4096

	tokenizer = AutoTokenizer.from_pretrained(model_id)


	def preprocess_fn(example):
	    """Render one dataset row's chat messages into a single text string.

	    Args:
	        example: A dataset row with a "messages" entry in the format
	            accepted by ``tokenizer.apply_chat_template``.

	    Returns:
	        A dict with one key, "text", holding the chat-templated string
	        (not tokenized, no trailing generation prompt).
	    """
	    return {
	        "text": tokenizer.apply_chat_template(
	            example["messages"],
	            add_generation_prompt=False,
	            tokenize=False,
	        )
	    }


	# Draw a random subset of the calibration set and add the rendered "text"
	# column. The shuffle is unseeded, so the selected samples differ per run.
	ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
	ds = ds.shuffle().select(range(num_samples))
	ds = ds.map(preprocess_fn)

	# Two modifiers applied in order: SmoothQuant, then GPTQ with the W8A8
	# scheme on every Linear module except lm_head.
	recipe = [
	    SmoothQuantModifier(
	        smoothing_strength=0.7,
	        # Each mapping is [[layers to balance], layer whose output feeds them].
	        # The patterns need the leading ".*" so they can match module names
	        # that carry a parent path (e.g. "model.layers.0.self_attn.q_proj").
	        # NOTE(review): assumes "re:" patterns are matched from the start of
	        # the module name; confirm against the llmcompressor documentation.
	        mappings=[
	            [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
	            [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
	            [["re:.*down_proj"], "re:.*up_proj"],
	        ],
	    ),
	    GPTQModifier(
	        sequential=True,
	        targets="Linear",
	        scheme="W8A8",
	        ignore=["lm_head"],
	        dampening_frac=0.01,
	    ),
	]

	# Load the base checkpoint in bfloat16, letting device_map="auto" decide
	# where the weights are placed.
	model = AutoModelForCausalLM.from_pretrained(
	    model_id, device_map="auto", torch_dtype="bfloat16"
	)

	# Apply the recipe to the loaded model, using the prepared dataset and the
	# sample-count / sequence-length limits configured at the top of the script.
	oneshot(
	    model=model,
	    dataset=ds,
	    recipe=recipe,
	    max_seq_length=max_seq_len,
	    num_calibration_samples=num_samples,
	)

	# Write the resulting model to the output directory.
	model.save_pretrained(model_out)
	```