|
import torch |
|
from datasets import load_dataset |
|
from transformers import AutoTokenizer |
|
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot |
|
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map |
|
import gc |
|
|
|
# Start from a clean CUDA caching-allocator state before loading the model.
torch.cuda.empty_cache()

# Allow TF32 tensor-core math for matmuls and cuDNN kernels (Ampere+).
# Trades a little float32 precision for throughput during calibration.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
|
|
|
# llmcompressor quantization recipe (YAML): static, symmetric FP8 (8-bit
# float) quantization of both weights and input activations for every
# Linear layer, with per-tensor scales; lm_head is left unquantized.
# NOTE: the string content (including blank lines) is parsed as YAML —
# keep it byte-identical.
recipe = """

quant_stage:

    quant_modifiers:

        QuantizationModifier:

            ignore: ["lm_head"]

            config_groups:

                group_0:

                    weights:

                        num_bits: 8

                        type: float

                        strategy: tensor

                        dynamic: false

                        symmetric: true

                    input_activations:

                        num_bits: 8

                        type: float

                        strategy: tensor

                        dynamic: false

                        symmetric: true

                    targets: ["Linear"]

"""
|
|
|
# Hugging Face Hub identifier of the model to quantize.
model_stub = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

# Short local name: everything after the final "/" in the stub, used to
# build the output directory name.
model_name = model_stub.rsplit("/", 1)[-1]
|
|
|
# Compute a per-module device placement that fits the model across one GPU
# (capped at 18 GiB) with overflow offloaded to CPU RAM (capped at 96 GiB).
# reserve_for_hessians=True asks the helper to leave GPU headroom for
# quantization-time auxiliary buffers (e.g. Hessian estimates) — presumably
# needed even for this PTQ recipe; verify against llmcompressor docs.
device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=True,
    num_gpus=1,
    torch_dtype=torch.bfloat16,
    max_memory={0: "18GiB", "cpu": "96GiB"}
)
|
|
|
# Load the model in bfloat16 using the precomputed offload device map.
# low_cpu_mem_usage streams weights instead of materializing a full copy;
# layers that do not fit in the max_memory budgets are spilled to disk via
# offload_folder, and offload_state_dict keeps the state dict off-RAM
# during loading.
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    offload_folder="offload_folder",
    offload_state_dict=True
)
|
|
|
# Reclaim memory left over from model loading.
# Fix: run Python garbage collection FIRST so dead tensors are actually
# destroyed, THEN release the now-unreferenced cached CUDA blocks back to
# the driver. In the original order, blocks still held by uncollected
# Python objects could not be freed by empty_cache().
gc.collect()
torch.cuda.empty_cache()
|
|
|
# Tokenizer matching the model checkpoint; also supplies the chat template
# used below to render calibration conversations.
tokenizer = AutoTokenizer.from_pretrained(model_stub)
|
|
|
# Directory the compressed FP8 checkpoint is written to.
output_dir = f"./{model_name}-FP8"

# Calibration budget: number of samples used to collect activation
# statistics, and the fixed tokenized sequence length per sample.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048
|
|
|
# Calibration corpus: chat conversations from UltraChat 200k (SFT split).
raw_dataset = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft"
)
# Keep only the first NUM_CALIBRATION_SAMPLES rows (min() guards against a
# split smaller than the requested sample count).
raw_dataset = raw_dataset.select(range(min(NUM_CALIBRATION_SAMPLES, len(raw_dataset))))
|
|
|
def preprocess_function(examples):
    """Tokenize a batch of chat conversations for calibration.

    Each entry of ``examples["messages"]`` is rendered to plain text via
    the tokenizer's chat template, then tokenized to a fixed
    ``MAX_SEQUENCE_LENGTH`` (padded and truncated) as PyTorch tensors.
    ``labels`` is a copy of ``input_ids`` (causal-LM convention).
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize=False)
        )

    batch = tokenizer(
        rendered,
        max_length=MAX_SEQUENCE_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Mirror inputs into labels so downstream code expecting a labeled
    # batch works unchanged.
    batch["labels"] = batch["input_ids"].clone()

    return batch
|
|
|
# Apply tokenization over the calibration set. remove_columns drops the
# raw "messages"/prompt columns so only tokenized fields remain.
# NOTE(review): return_tensors="pt" inside .map() is converted back to
# lists by datasets — harmless here, but the tensors are re-materialized
# later by the data collator.
processed_dataset = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset.column_names,
    desc="Processing dataset",
)
|
|
|
# Run one-shot (post-training) quantization: calibrate activation/weight
# scales on the processed dataset per the FP8 recipe, then save the
# compressed checkpoint to output_dir.
# NOTE(review): the training-style kwargs below (learning_rate,
# num_train_epochs, fp16/bf16, gradient_accumulation_steps, ...) are
# presumably forwarded to HF TrainingArguments; one-shot calibration does
# no gradient updates, so they look inert — confirm against llmcompressor
# docs before relying on them.
oneshot(
    model=model,
    output_dir=output_dir,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=False,
    bf16=True,
    save_compressed=True,  # write weights in compressed (quantized) format
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",
    remove_unused_columns=False,  # keep labels/attention_mask for the collator
    push_to_hub=False,
    preprocessing_num_workers=4,
    dataloader_num_workers=2
)
|
|