rivapereira123 committed
Commit dbbaa64 · verified · 1 Parent(s): 464fae4

Update finetune_flan_t5.py

Files changed (1):
  finetune_flan_t5.py  +23 -14
finetune_flan_t5.py CHANGED
@@ -8,22 +8,28 @@ from transformers import (
 from trl import SFTTrainer
 import torch
 
+# First check and update packages if needed
+def check_versions():
+    import subprocess
+    import sys
+    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "transformers", "accelerate", "trl"])
+
+check_versions()
+
 # 1. Load and prepare dataset
 dataset = load_dataset("json", data_files="data/med_q_n_a_converted.jsonl", split="train")
 
-# Add 'text' field containing the formatted examples
-def add_text_field(example):
-    example['text'] = f"### Instruction:\n{example['input']}\n\n### Response:\n{example['output']}"
-    return example
-
-dataset = dataset.map(add_text_field)
+# Add formatted text field
+dataset = dataset.map(lambda x: {
+    "text": f"### Instruction:\n{x['input']}\n\n### Response:\n{x['output']}"
+})
 
 # 2. Load model and tokenizer
 model_name = "google/flan-t5-base"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
-# 3. Training arguments
+# 3. Training arguments - modified to avoid deprecated parameters
 training_args = TrainingArguments(
     output_dir="./flan-t5-medical-finetuned",
     per_device_train_batch_size=4,
@@ -34,24 +40,27 @@ training_args = TrainingArguments(
     save_strategy="epoch",
     evaluation_strategy="no",
     fp16=torch.cuda.is_available(),
-    report_to="none"
+    report_to="none",
+    # Add these to avoid version conflicts
+    use_cpu=not torch.cuda.is_available(),
+    remove_unused_columns=False
 )
 
-# 4. Initialize SFTTrainer with correct configuration
+# 4. Initialize trainer with updated configuration
 trainer = SFTTrainer(
     model=model,
     tokenizer=tokenizer,
     train_dataset=dataset,
     args=training_args,
-    max_seq_length=512,
-    dataset_text_field="text",  # Field we created
+    dataset_text_field="text",
     data_collator=DataCollatorForSeq2Seq(
         tokenizer,
         model=model,
-        pad_to_multiple_of=8,
-        return_tensors="pt",
         padding=True
-    )
+    ),
+    # Remove deprecated parameters
+    max_seq_length=None,
+    formatting_func=None
 )
 
 # 5. Start training
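
A note on the new check_versions() step: pip-upgrading from inside the script cannot change modules that are already imported in the running process, so the transformers/trl imports at the top of the file keep their old versions until the interpreter restarts. A minimal read-only sketch, assuming Python 3.8+ for importlib.metadata, that reports what is installed instead of force-upgrading mid-run:

from importlib.metadata import PackageNotFoundError, version

# Report the installed versions of the packages the commit upgrades;
# after any real upgrade, restart the interpreter before training.
for pkg in ("transformers", "accelerate", "trl"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")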
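
The lambda in step 1 adds a "text" column while keeping the original input/output columns, since datasets.map only drops columns when asked to. A quick sanity check of the prompt template on a single record; the record below is made up for illustration, real ones come from data/med_q_n_a_converted.jsonl:

# Hypothetical example record, for checking the template only
example = {"input": "What are the symptoms of anemia?",
           "output": "Common symptoms include fatigue, pallor, and shortness of breath."}
text = f"### Instruction:\n{example['input']}\n\n### Response:\n{example['output']}"
print(text)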
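
Passing max_seq_length=None and formatting_func=None silences keyword mismatches on some TRL versions, but on TRL releases from roughly 0.9 onward these dataset-handling options moved out of the SFTTrainer keyword arguments and into SFTConfig. A sketch of that shape, assuming such a TRL version and reusing model/dataset from the script above; verify the exact field names against the installed release:

from trl import SFTConfig, SFTTrainer

# SFTConfig subclasses TrainingArguments, so the step-3 options can live here too
sft_config = SFTConfig(
    output_dir="./flan-t5-medical-finetuned",
    per_device_train_batch_size=4,
    dataset_text_field="text",
    max_seq_length=512,
    report_to="none",
)
trainer = SFTTrainer(model=model, train_dataset=dataset, args=sft_config)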
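
The dropped collator arguments are still valid in transformers: return_tensors="pt" is already the DataCollatorForSeq2Seq default, and pad_to_multiple_of=8 pads batches up to tensor-core-friendly shapes, which mainly pays off under fp16. If those options were not the actual source of the version conflict, a middle-ground sketch, reusing tokenizer/model from step 2, would be:

import torch
from transformers import DataCollatorForSeq2Seq

# Dynamic padding, rounded up to a multiple of 8 only when CUDA (and thus fp16) is in play
collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
)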