fix: fix SFT training file

- requirements.txt  +4 -3
- train_sft.py      +57 -20

requirements.txt  CHANGED
@@ -1,6 +1,7 @@
-transformers
+transformers>=4.33.3,<5.0.0
 accelerate
 torch
 gradio
-trl
-datasets
+trl>=0.7.0
+datasets
+peft==0.5.0
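
As a quick sanity check (not part of this commit), the new pins can be compared against the running environment with the standard-library importlib.metadata; the package names below simply mirror requirements.txt.

# Sketch only: verify installed versions against the pins in requirements.txt.
from importlib.metadata import version, PackageNotFoundError

for pkg in ["transformers", "accelerate", "torch", "gradio", "trl", "datasets", "peft"]:
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")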

train_sft.py  CHANGED
@@ -1,37 +1,74 @@
+# train_sft.py
+import sys
+import json
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from trl import SFTTrainer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
+from trl import SFTTrainer
+
+DRY_RUN = "--dry-run" in sys.argv
 
 MODEL_ID = "Salesforce/codegen-350M-multi"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
 
-# 1) load your
+# 1) load your JSONL
 ds = load_dataset("json", data_files="data/train_dataset.jsonl", split="train")
 
 # 2) tokenize & format
 def tokenize(example):
     prompt = f"DIFF:\n{example['diff']}\n\nOUTPUT FORMAT:\n"
-    output = example[
-    text = prompt + tokenizer.decode(tokenizer.encode(
+    output = example.get("comments", example.get("comment", []))
+    text = prompt + tokenizer.decode(tokenizer.encode(json.dumps(output, ensure_ascii=False), add_special_tokens=False))
     tokens = tokenizer(text, truncation=True, max_length=512)
     tokens["labels"] = tokens["input_ids"].copy()
     return tokens
 
-… (removed lines 21-31 are not captured in this view)
+# In dry-run, only map a couple examples
+if DRY_RUN:
+    sample = ds.select(range(2))
+    print("Sample examples before tokenization:")
+    for ex in sample:
+        print(ex)
+    tokenized = sample.map(tokenize, remove_columns=sample.column_names)
+    print("\nAfter tokenization, examples look like:")
+    for ex in tokenized:
+        print({k: ex[k] for k in ["input_ids","labels"]})
+else:
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# 3) configure args
+training_args = TrainingArguments(
+    output_dir = "sft-model",           # where to write checkpoints
+    overwrite_output_dir = True,
+
+    do_train = True,                    # we’re doing a train run
+    num_train_epochs = 3,               # full passes over the data
+    per_device_train_batch_size = 2,
+    gradient_accumulation_steps = 8,
+
+    learning_rate = 2e-5,
+    max_steps = 500,                    # total optimization steps (overrides epochs)
+
+    logging_strategy = "steps",
+    logging_steps = 50,
+    save_strategy = "steps",
+    save_steps = 200,
+
+    fp16 = False,                       # no half-precision on CPU
+    report_to = None,                   # disable WandB/others
+)
+
+# 4) instantiate trainer
+trainer = SFTTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=(tokenized if DRY_RUN else ds),
 )
 
-… (removed lines 34-37 are not captured in this view)
+print(f"\n✅ DRY-RUN: Trainer instantiated:\n – model: {type(model)}\n – tokenizer: {type(tokenizer)}\n – train_dataset size: {len(tokenized if DRY_RUN else ds)}")
+print(f" – SFTTrainingArguments: {training_args}")
+
+if not DRY_RUN:
+    # only run the real training if you didn’t pass --dry-run
+    trainer.train()
+    trainer.save_model("sft-model")
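
For reference, here is a minimal sketch (not from the commit) of what the new tokenize() formatting produces for a single record. The record below is invented for illustration; the field layout ("diff" plus a "comments" list) follows the code above, and the real data lives in data/train_dataset.jsonl.

# Illustration only: mirrors the tokenize() logic above on one made-up record.
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-multi")

example = {
    "diff": "- x = 1\n+ x = 2",
    "comments": [{"line": 2, "comment": "Explain why the constant changed."}],
}

prompt = f"DIFF:\n{example['diff']}\n\nOUTPUT FORMAT:\n"
output = example.get("comments", example.get("comment", []))
# Same encode/decode round-trip as the training script, so the target text
# is normalized through the tokenizer before being appended to the prompt.
text = prompt + tokenizer.decode(tokenizer.encode(json.dumps(output, ensure_ascii=False), add_special_tokens=False))

tokens = tokenizer(text, truncation=True, max_length=512)
tokens["labels"] = tokens["input_ids"].copy()  # causal-LM SFT: labels mirror input_ids
print(len(tokens["input_ids"]), "tokens in this example")

Running `python train_sft.py --dry-run` exercises exactly this path on the first two records before any training starts. In a real run, per_device_train_batch_size=2 with gradient_accumulation_steps=8 gives an effective batch of 16 examples per optimizer step, and max_steps=500 caps the run regardless of num_train_epochs.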