import gc

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# Free any cached GPU memory and enable TF32 matmuls for faster calibration.
torch.cuda.empty_cache()
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
recipe = """
quant_stage:
quant_modifiers:
QuantizationModifier:
ignore: ["lm_head"]
config_groups:
group_0:
weights:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
input_activations:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
targets: ["Linear"]
"""
model_stub = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
model_name = model_stub.split("/")[-1]

# Plan a device map that splits the 14B model between one GPU and CPU RAM
# within the given memory budget, reserving headroom for calibration buffers.
device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=True,
    num_gpus=1,
    torch_dtype=torch.bfloat16,
    max_memory={0: "18GiB", "cpu": "96GiB"},
)
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    offload_folder="offload_folder",
    offload_state_dict=True,
)

# Release temporary loading buffers before calibration.
torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(model_stub)
output_dir = f"./{model_name}-FP8"

# Calibration settings: 256 chat samples, padded/truncated to 2048 tokens.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048

raw_dataset = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft",
)
raw_dataset = raw_dataset.select(range(min(NUM_CALIBRATION_SAMPLES, len(raw_dataset))))
def preprocess_function(examples):
    """Render each conversation with the chat template, then tokenize."""
    texts = [
        tokenizer.apply_chat_template(messages, tokenize=False)
        for messages in examples["messages"]
    ]
    tokenized = tokenizer(
        texts,
        max_length=MAX_SEQUENCE_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

processed_dataset = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset.column_names,
    desc="Processing dataset",
)
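
# Optional sanity check (a sketch added here, not part of the original script):
# decode one processed sample to confirm the chat template and truncation
# behaved as expected before spending time on calibration.
print(tokenizer.decode(processed_dataset[0]["input_ids"])[:500])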
# Run one-shot calibration: forward passes over the calibration set collect
# activation statistics, weights and activation scales are quantized to FP8,
# and the compressed checkpoint is written to output_dir.
oneshot(
    model=model,
    output_dir=output_dir,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=False,
    bf16=True,
    save_compressed=True,
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",
    remove_unused_columns=False,
    push_to_hub=False,
    preprocessing_num_workers=4,
    dataloader_num_workers=2,
)
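
# The steps below are a hedged addition, not part of the original script.
# Saving the tokenizer keeps the export directory self-contained, and a short
# generation acts as a smoke test that the quantized model still runs. With a
# CPU-offloaded device map, inputs may need to be moved to a different device
# than model.device depending on where the embedding layer landed.
tokenizer.save_pretrained(output_dir)

sample = tokenizer("What is machine learning?", return_tensors="pt").to(model.device)
with torch.no_grad():
    generated = model.generate(**sample, max_new_tokens=50)
print(tokenizer.decode(generated[0], skip_special_tokens=True))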