Update train.py
train.py CHANGED

@@ -17,15 +17,20 @@ tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 # Tokenization function
-print("Sample data structure:", examples)
 def tokenize_function(examples):
-
-
+    print("Sample data structure:", examples)  # Move print inside function
+
+    inputs = examples["input"]  # Make sure "input" matches dataset keys
+    targets = examples["output"]
+
     model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
     labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
+
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
+    return model_inputs
+
 # Apply tokenization
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
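For context, a minimal runnable sketch of how this section of train.py reads after the commit. The checkpoint name and the dataset construction are not part of this hunk, so the "t5-small" checkpoint and the in-memory Dataset below are assumptions for illustration; only the tokenization function and the map call come from the diff. The second return model_inputs added at new line 32 falls after the function's first return and is unreachable, so the sketch omits it.

from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-small"  # assumption: the actual checkpoint is set earlier in train.py
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Assumption: stand-in for the dataset loaded earlier in train.py, with the
# "input"/"output" columns the patched function expects
dataset = Dataset.from_dict({
    "input": ["translate English to German: Hello, how are you?"],
    "output": ["Hallo, wie geht es dir?"],
})

# Tokenization function (as patched by this commit)
def tokenize_function(examples):
    print("Sample data structure:", examples)  # Debug print from the commit

    # With batched=True, `examples` is a dict mapping column names to lists,
    # so `inputs` and `targets` are lists of strings
    inputs = examples["input"]
    targets = examples["output"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

One caveat worth knowing, though not part of this commit: with padding="max_length", every sequence is padded to 512 tokens, and the pad-token ids copied into "labels" contribute to the loss during training. A common refinement is to replace pad positions in the labels with -100 so the loss function ignores them.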