# train_sft.py
import sys
import json
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

DRY_RUN = "--dry-run" in sys.argv

MODEL_ID = "Salesforce/codegen-350M-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model     = AutoModelForCausalLM.from_pretrained(MODEL_ID)
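
# CodeGen's GPT-2-style tokenizer has no pad token by default; batched
# collation needs one, so fall back to EOS (a common workaround; this is a
# no-op if a pad token is already defined).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token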

# 1) load your JSONL
ds = load_dataset("json", data_files="data/train_dataset.jsonl", split="train")
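# tokenize() below expects each record to carry a "diff" string plus a
# "comments" (or "comment") field; a hypothetical line might look like:
#   {"diff": "@@ -1,3 +1,3 @@\n-foo()\n+bar()", "comments": ["Prefer bar() here."]}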

# 2) tokenize & format
def tokenize(example):
    prompt = f"DIFF:\n{example['diff']}\n\nOUTPUT FORMAT:\n"
    # Target: the review comments, JSON-serialized; fall back to the singular
    # "comment" key if the plural one is missing.
    output = example.get("comments", example.get("comment", []))
    text = prompt + json.dumps(output, ensure_ascii=False)
    tokens = tokenizer(text, truncation=True, max_length=512)
    # Causal-LM loss over the whole sequence: labels mirror the input ids.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# In dry-run mode, only tokenize and inspect a couple of examples
if DRY_RUN:
    sample = ds.select(range(2))
    print("Sample examples before tokenization:")
    for ex in sample:
        print(ex)
    tokenized = sample.map(tokenize, remove_columns=sample.column_names)
    print("\nAfter tokenization, examples look like:")
    for ex in tokenized:
        print({k: ex[k] for k in ["input_ids","labels"]})
else:
    ds = ds.map(tokenize, remove_columns=ds.column_names)

# 3) configure args
training_args = TrainingArguments(
    output_dir       = "sft-model",      # where to write checkpoints
    overwrite_output_dir = True,

    do_train         = True,             # we're doing a train run
    num_train_epochs = 3,                # full passes over the data
    per_device_train_batch_size = 2,     # examples per device per step
    gradient_accumulation_steps = 8,     # accumulate before each optimizer update

    learning_rate    = 2e-5,
    max_steps        = 500,              # total optimizer steps (takes precedence over epochs)

    logging_strategy = "steps",
    logging_steps    = 50,
    save_strategy    = "steps",
    save_steps       = 200,

    fp16             = False,            # no half-precision on CPU
    report_to        = "none",           # disable WandB and other reporters
)
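# Effective batch size per optimizer step:
#   per_device_train_batch_size (2) * gradient_accumulation_steps (8) = 16 examples.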

# 4) instantiate trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=(tokenized if DRY_RUN else ds),
)

mode = "DRY-RUN" if DRY_RUN else "TRAIN"
print(f"\n✅ {mode}: Trainer instantiated:\n  – model: {type(model)}\n  – tokenizer: {type(tokenizer)}\n  – train_dataset size: {len(tokenized if DRY_RUN else ds)}")
print(f"  – TrainingArguments: {training_args}")

if not DRY_RUN:
    # only run the real training if you didn't pass --dry-run
    trainer.train()
    trainer.save_model("sft-model")
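
# Minimal inference sketch (assumes the run above finished and wrote ./sft-model;
# the tokenizer may not be saved alongside the model, so reload it from MODEL_ID):
#
#   tok = AutoTokenizer.from_pretrained(MODEL_ID)
#   mdl = AutoModelForCausalLM.from_pretrained("sft-model")
#   prompt = "DIFF:\n<some diff>\n\nOUTPUT FORMAT:\n"
#   ids = tok(prompt, return_tensors="pt").input_ids
#   out = mdl.generate(ids, max_new_tokens=128)
#   print(tok.decode(out[0], skip_special_tokens=True))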