Erpg12 committed
Commit 894ef1a · 1 Parent(s): 6d242bd

fix: fix SFT training file

Files changed (2):
  1. requirements.txt +4 -3
  2. train_sft.py +57 -20
requirements.txt CHANGED
@@ -1,6 +1,7 @@
-transformers
+transformers>=4.33.3,<5.0.0
 accelerate
 torch
 gradio
-trl
-datasets
+trl>=0.7.0
+datasets
+peft==0.5.0
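
Before launching a run against these pins, a quick sanity check that the installed environment actually matches can save a failed job. A minimal sketch (not part of the commit; the package list is copied from requirements.txt):

    import importlib.metadata as md

    # Print the installed version of each dependency pinned above,
    # or flag any package absent from the environment.
    for pkg in ("transformers", "accelerate", "torch", "gradio", "trl", "datasets", "peft"):
        try:
            print(f"{pkg}=={md.version(pkg)}")
        except md.PackageNotFoundError:
            print(f"{pkg} is missing; run `pip install -r requirements.txt`")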
train_sft.py CHANGED
@@ -1,37 +1,74 @@
+# train_sft.py
+import sys
+import json
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from trl import SFTTrainer, SFTTrainingArguments
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
+from trl import SFTTrainer
+
+DRY_RUN = "--dry-run" in sys.argv
 
 MODEL_ID = "Salesforce/codegen-350M-multi"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
 
-# 1) load your local JSONL
+# 1) load your JSONL
 ds = load_dataset("json", data_files="data/train_dataset.jsonl", split="train")
 
 # 2) tokenize & format
 def tokenize(example):
     prompt = f"DIFF:\n{example['diff']}\n\nOUTPUT FORMAT:\n"
-    output = example['comments']
-    text = prompt + tokenizer.decode(tokenizer.encode(str(output), add_special_tokens=False))
+    output = example.get("comments", example.get("comment", []))
+    text = prompt + tokenizer.decode(tokenizer.encode(json.dumps(output, ensure_ascii=False), add_special_tokens=False))
     tokens = tokenizer(text, truncation=True, max_length=512)
     tokens["labels"] = tokens["input_ids"].copy()
     return tokens
 
-ds = ds.map(tokenize, remove_columns=ds.column_names, batched=False)
-
-# 3) SFT arguments
-training_args = SFTTrainingArguments(
-    output_dir="sft-model",
-    per_device_train_batch_size=2,
-    gradient_accumulation_steps=8,
-    learning_rate=2e-5,
-    max_train_steps=500,
-    logging_steps=50,
-    save_steps=200
+# In dry-run mode, only map a couple of examples
+if DRY_RUN:
+    sample = ds.select(range(2))
+    print("Sample examples before tokenization:")
+    for ex in sample:
+        print(ex)
+    tokenized = sample.map(tokenize, remove_columns=sample.column_names)
+    print("\nAfter tokenization, examples look like:")
+    for ex in tokenized:
+        print({k: ex[k] for k in ["input_ids", "labels"]})
+else:
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# 3) configure args
+training_args = TrainingArguments(
+    output_dir="sft-model",            # where to write checkpoints
+    overwrite_output_dir=True,
+
+    do_train=True,                     # we're doing a train run
+    num_train_epochs=3,                # full passes over the data
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=8,
+
+    learning_rate=2e-5,
+    max_steps=500,                     # total optimization steps (overrides epochs)
+
+    logging_strategy="steps",
+    logging_steps=50,
+    save_strategy="steps",
+    save_steps=200,
+
+    fp16=False,                        # no half-precision on CPU
+    report_to="none",                  # disable WandB/other loggers
+)
+
+# 4) instantiate trainer
+trainer = SFTTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=(tokenized if DRY_RUN else ds),
 )
 
-# 4) kick off the trainer
-trainer = SFTTrainer(model, tokenizer, args=training_args, train_dataset=ds)
-trainer.train()
-trainer.save_model("sft-model")
+print(f"\n✅ Trainer instantiated:\n – model: {type(model)}\n – tokenizer: {type(tokenizer)}\n – train_dataset size: {len(tokenized if DRY_RUN else ds)}")
+print(f" – TrainingArguments: {training_args}")
+
+if not DRY_RUN:
+    # only run the real training if you didn't pass --dry-run
+    trainer.train()
+    trainer.save_model("sft-model")
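
For a quick end-to-end check of the new --dry-run path, the script can be pointed at a tiny fixture file. A hedged sketch (the two records and their field values are invented for illustration; only the data/train_dataset.jsonl path and the diff/comments fields come from the script above):

    import json
    import pathlib
    import subprocess

    # Write two minimal records in the JSONL shape train_sft.py expects.
    pathlib.Path("data").mkdir(exist_ok=True)
    records = [
        {"diff": "- old line\n+ new line", "comments": ["nit: rename this variable"]},
        {"diff": "+ brand new file", "comments": []},
    ]
    with open("data/train_dataset.jsonl", "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec) + "\n")

    # Dry run: prints the sampled and tokenized examples and instantiates
    # the trainer without ever calling trainer.train().
    subprocess.run(["python", "train_sft.py", "--dry-run"], check=True)

Since the flag check is just "--dry-run" in sys.argv, the flag can appear anywhere on the command line.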