cconsti committed
Commit f05adaf · verified · 1 Parent(s): 2be155a

Update train.py

Files changed (1)
  train.py  +8 -3
train.py CHANGED
@@ -17,15 +17,20 @@ tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 # Tokenization function
-print("Sample data structure:", examples)
 def tokenize_function(examples):
-    inputs = [ex["input"] for ex in examples]
-    targets = [ex["output"] for ex in examples]
+    print("Sample data structure:", examples)  # Move print inside function
+
+    inputs = examples["input"]  # Make sure "input" matches dataset keys
+    targets = examples["output"]
+
     model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
     labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
+
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
+    return model_inputs
+
 # Apply tokenization
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
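The change matters because `dataset.map(tokenize_function, batched=True)` hands the function each batch as a dict mapping column names to lists of values, not a list of per-example dicts, so the old `[ex["input"] for ex in examples]` would iterate over column names instead of rows; moving the print inside the function also means `examples` is actually defined when it runs. Below is a minimal, self-contained sketch of the corrected function under that reading. The `t5-small` checkpoint and the toy two-row dataset are stand-ins for illustration (the real script uses its own `model_name` and dataset), and the unreachable second `return model_inputs` the hunk appends is left out.

# Minimal sketch: corrected tokenize_function under datasets.map(batched=True).
# Assumptions for illustration: "t5-small" checkpoint, toy "input"/"output" columns.
from datasets import Dataset
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

# With batched=True, map passes a dict of column -> list of values,
# so examples["input"] is a list of strings for the whole batch.
dataset = Dataset.from_dict({
    "input": ["translate English to German: Hello", "summarize: A long article ..."],
    "output": ["Hallo", "A short summary."],
})

def tokenize_function(examples):
    inputs = examples["input"]    # list[str], one entry per row in the batch
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets.column_names)  # includes input_ids, attention_mask, labels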