# Fine-tune Salesforce/codegen-350M-multi with TRL's SFTTrainer on a local
# JSONL dataset of (diff, comments) pairs.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# FIX: trl exposes no `SFTTrainingArguments` class (the original import raised
# ImportError); the SFT configuration class is `SFTConfig`.
from trl import SFTConfig, SFTTrainer

# Base causal-LM checkpoint to fine-tune.
MODEL_ID = "Salesforce/codegen-350M-multi"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# 1) load your local JSONL; each record must provide the `diff` and `comments`
#    fields read by the tokenize step below.
ds = load_dataset("json", data_files="data/train_dataset.jsonl", split="train")
# 2) tokenize & format
def tokenize(example):
    """Turn one dataset record into a causal-LM training sample.

    Builds ``prompt + target`` text, tokenizes it truncated to 512 tokens,
    and sets ``labels`` to a copy of ``input_ids`` (full-sequence LM loss;
    the prompt portion is not masked out).

    Args:
        example: mapping with a ``diff`` string and a ``comments`` value.

    Returns:
        The tokenizer's encoding dict with an added ``labels`` key.
    """
    prompt = f"DIFF:\n{example['diff']}\n\nOUTPUT FORMAT:\n"
    # FIX: the original ran the target through encode()/decode(), a no-op
    # round-trip that wastes work and can subtly alter text under BPE
    # normalization — use the raw string directly.
    text = prompt + str(example['comments'])
    tokens = tokenizer(text, truncation=True, max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
# Tokenize every example; drop the raw columns so only model inputs remain.
ds = ds.map(tokenize, remove_columns=ds.column_names, batched=False)

# 3) SFT arguments.
# FIX: trl has no `SFTTrainingArguments` class — the config class is
# `SFTConfig` — and the HF TrainingArguments name for the step cap is
# `max_steps`, not `max_train_steps` (which would raise TypeError).
from trl import SFTConfig

training_args = SFTConfig(
    output_dir="sft-model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch = 16 per device
    learning_rate=2e-5,
    max_steps=500,
    logging_steps=50,
    save_steps=200,
)

# 4) kick off the trainer.
# FIX: the original passed the tokenizer as the 2nd positional argument,
# which Trainer.__init__ interprets as `args` — pass everything by keyword.
# NOTE(review): `processing_class` is the current trl kwarg; on older trl
# releases use `tokenizer=tokenizer` instead — confirm against the pinned
# trl version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    processing_class=tokenizer,
)
trainer.train()
trainer.save_model("sft-model")