|
import torch |
|
from datasets import load_dataset |
|
from transformers import AutoTokenizer |
|
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot |
|
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map |
|
import gc |
|
|
|
# Start from a clean CUDA caching-allocator state before loading the model.
torch.cuda.empty_cache()

# Allow TF32 tensor-core math for matmuls and cuDNN kernels (Ampere+).
# Trades a little float32 precision for throughput during calibration.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
|
|
|
# llmcompressor quantization recipe (YAML): static, symmetric FP8 (8-bit
# float) quantization of both weights and input activations for every
# Linear layer, with per-tensor scales; lm_head is left unquantized.
# NOTE: the string content (including blank lines) is parsed as YAML —
# keep it byte-identical.
recipe = """

quant_stage:

    quant_modifiers:

        QuantizationModifier:

            ignore: ["lm_head"]

            config_groups:

                group_0:

                    weights:

                        num_bits: 8

                        type: float

                        strategy: tensor

                        dynamic: false

                        symmetric: true

                    input_activations:

                        num_bits: 8

                        type: float

                        strategy: tensor

                        dynamic: false

                        symmetric: true

                    targets: ["Linear"]

"""
|
|
|
# Hugging Face Hub identifier of the model to quantize.
model_stub = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

# Short local name: everything after the final "/" in the stub, used to
# build the output directory name.
model_name = model_stub.rsplit("/", 1)[-1]
|
|
|
# Compute a per-module device placement that fits the model across one GPU
# (capped at 18 GiB) with overflow offloaded to CPU RAM (capped at 96 GiB).
# reserve_for_hessians=True asks the helper to leave GPU headroom for
# quantization-time auxiliary buffers (e.g. Hessian estimates) — presumably
# needed even for this PTQ recipe; verify against llmcompressor docs.
device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=True,
    num_gpus=1,
    torch_dtype=torch.bfloat16,
    max_memory={0: "18GiB", "cpu": "96GiB"}
)
|
|
|
# Load the model in bfloat16 using the precomputed offload device map.
# low_cpu_mem_usage streams weights instead of materializing a full copy;
# layers that do not fit in the max_memory budgets are spilled to disk via
# offload_folder, and offload_state_dict keeps the state dict off-RAM
# during loading.
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    offload_folder="offload_folder",
    offload_state_dict=True
)
|
|
|
# Reclaim memory left over from model loading.
# Fix: run Python garbage collection FIRST so dead tensors are actually
# destroyed, THEN release the now-unreferenced cached CUDA blocks back to
# the driver. In the original order, blocks still held by uncollected
# Python objects could not be freed by empty_cache().
gc.collect()
torch.cuda.empty_cache()
|
|
|
# Tokenizer matching the model checkpoint; also supplies the chat template
# used below to render calibration conversations.
tokenizer = AutoTokenizer.from_pretrained(model_stub)
|
|
|
# Directory the compressed FP8 checkpoint is written to.
output_dir = f"./{model_name}-FP8"

# Calibration budget: number of samples used to collect activation
# statistics, and the fixed tokenized sequence length per sample.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048
|
|
|
# Calibration corpus: chat conversations from UltraChat 200k (SFT split).
raw_dataset = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft"
)
# Keep only the first NUM_CALIBRATION_SAMPLES rows (min() guards against a
# split smaller than the requested sample count).
raw_dataset = raw_dataset.select(range(min(NUM_CALIBRATION_SAMPLES, len(raw_dataset))))
|
|
|
def preprocess_function(examples):
    """Tokenize a batch of chat conversations for calibration.

    Each entry of ``examples["messages"]`` is rendered to plain text via
    the tokenizer's chat template, then tokenized to a fixed
    ``MAX_SEQUENCE_LENGTH`` (padded and truncated) as PyTorch tensors.
    ``labels`` is a copy of ``input_ids`` (causal-LM convention).
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize=False)
        )

    batch = tokenizer(
        rendered,
        max_length=MAX_SEQUENCE_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Mirror inputs into labels so downstream code expecting a labeled
    # batch works unchanged.
    batch["labels"] = batch["input_ids"].clone()

    return batch
|
|
|
# Apply tokenization over the calibration set. remove_columns drops the
# raw "messages"/prompt columns so only tokenized fields remain.
# NOTE(review): return_tensors="pt" inside .map() is converted back to
# lists by datasets — harmless here, but the tensors are re-materialized
# later by the data collator.
processed_dataset = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset.column_names,
    desc="Processing dataset",
)
|
|
|
# Run one-shot (post-training) quantization: calibrate activation/weight
# scales on the processed dataset per the FP8 recipe, then save the
# compressed checkpoint to output_dir.
# NOTE(review): the training-style kwargs below (learning_rate,
# num_train_epochs, fp16/bf16, gradient_accumulation_steps, ...) are
# presumably forwarded to HF TrainingArguments; one-shot calibration does
# no gradient updates, so they look inert — confirm against llmcompressor
# docs before relying on them.
oneshot(
    model=model,
    output_dir=output_dir,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=False,
    bf16=True,
    save_compressed=True,  # write weights in compressed (quantized) format
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",
    remove_unused_columns=False,  # keep labels/attention_mask for the collator
    push_to_hub=False,
    preprocessing_num_workers=4,
    dataloader_num_workers=2
)
|
|