rivapereira123 committed
Commit b7f414a (verified) · Parent: b52b1ab

Update finetune_flan_t5.py

Files changed (1)
  1. finetune_flan_t5.py +15 -8
finetune_flan_t5.py CHANGED
@@ -14,25 +14,32 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
 # Preprocess dataset
 def preprocess(example):
-    input_text = example["instruction"]
+    input_text = example["input"]
     target_text = example["output"]
+
+    # Tokenize inputs
     tokenized = tokenizer(
         input_text,
         max_length=512,
         truncation=True,
         padding="max_length"
     )
-    with tokenizer.as_target_tokenizer():
-        tokenized["labels"] = tokenizer(
-            target_text,
-            max_length=128,
-            truncation=True,
-            padding="max_length"
-        )["input_ids"]
+
+    # Tokenize targets
+    tokenized_target = tokenizer(
+        target_text,
+        max_length=128,
+        truncation=True,
+        padding="max_length"
+    )
+    tokenized["labels"] = tokenized_target["input_ids"]
+
     return tokenized
 
+# Apply preprocessing
 tokenized_dataset = dataset.map(preprocess, remove_columns=dataset.column_names)
 
+
 # Define training arguments
 training_args = TrainingArguments(
     output_dir="./flan-t5-medical",
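
Note: the removed `as_target_tokenizer()` context manager is deprecated in recent transformers releases in favor of passing targets via the `text_target=` keyword on the tokenizer call. A minimal sketch of an equivalent preprocess(), assuming the same `tokenizer` object and "input"/"output" column names as in the diff; the -100 label masking is a common refinement so pad tokens are ignored by the loss, and is not part of this commit.

def preprocess(example):
    # Tokenize inputs exactly as in the diff
    tokenized = tokenizer(
        example["input"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    # text_target= replaces the deprecated as_target_tokenizer() context manager
    labels = tokenizer(
        text_target=example["output"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )["input_ids"]
    # Common refinement (hypothetical here, not in this commit): replace pad
    # token ids with -100 so the cross-entropy loss skips padded positions
    tokenized["labels"] = [
        tok if tok != tokenizer.pad_token_id else -100 for tok in labels
    ]
    return tokenized

This behaves the same as the committed version when mapped over the dataset one example at a time, while avoiding the deprecation warning on current transformers versions.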