Update train.py
train.py
CHANGED
@@ -1,40 +1,29 @@
-import torch
-from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
-from datasets import load_dataset
 import os
+from datasets import load_dataset
+from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
 
-#
+# Ensure Hugging Face cache directory is writable
 os.environ["HF_HOME"] = "/app/hf_cache"
 os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
 os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
 
-# Load dataset
-dataset = load_dataset("tatsu-lab/alpaca") #
+# Load dataset
+dataset = load_dataset("tatsu-lab/alpaca") # Change if using your dataset
 
-# Check
+# Check dataset structure
 print("Dataset splits available:", dataset)
+print("Sample row:", dataset["train"][0])
 
-# If
+# If no 'test' split exists, create one
 if "test" not in dataset:
-    dataset = dataset["train"].train_test_split(test_size=0.1)
-
-# Tokenize dataset
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
-# Assign datasets
-train_dataset = tokenized_datasets["train"]
-eval_dataset = tokenized_datasets["test"] # This is now safely created
-
-# Debug output
-print("Dataset successfully split into train and test sets")
+    dataset = dataset["train"].train_test_split(test_size=0.1)
 
-# Load
+# Load tokenizer & model
 model_name = "t5-large"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 
-#
-# Define tokenization function before mapping
+# Define tokenization function
 def tokenize_function(examples):
     inputs = examples["input"] # Ensure this matches dataset key
     targets = examples["output"]
@@ -45,51 +34,41 @@ def tokenize_function(examples):
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
-#
-print("Dataset splits available:", dataset)
-
-# If "test" split is missing, create one
-if "test" not in dataset:
-    dataset = dataset["train"].train_test_split(test_size=0.1)
-
-# Tokenize dataset
+# Tokenize dataset
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
-#
+# Assign train & eval datasets
 train_dataset = tokenized_datasets["train"]
 eval_dataset = tokenized_datasets["test"]
 
-
-print("Dataset successfully split and tokenized")
-
+print("Dataset successfully split and tokenized.")
 
-#
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
-# Training arguments
+# Define training arguments
 training_args = TrainingArguments(
     output_dir="./t5-finetuned",
-    per_device_train_batch_size=2, #
-    per_device_eval_batch_size=2,
-
-
-
-
-
-    push_to_hub=False # Avoid pushing test model to Hugging Face Hub
+    per_device_train_batch_size=2, # Lowered to avoid memory issues
+    per_device_eval_batch_size=2,
+    num_train_epochs=1, # Test run (increase for full fine-tuning)
+    logging_steps=50,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    push_to_hub=False # Change to True to upload the model to HF Hub
 )
 
-# Trainer
+# Set up Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
 )
 
-
-# Train the model
+# Start fine-tuning
 trainer.train()
 
-
-
+print("Fine-tuning complete!")
+
+# Save model locally
+trainer.save_model("./t5-finetuned")
+
+print("Model saved successfully!")
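
The diff shows only the first and last lines of tokenize_function; the middle of its body (new lines 30-33) is unchanged context and therefore elided. A minimal sketch of what that section typically looks like for T5 sequence-to-sequence preprocessing is below; the max_length values and the text_target call are illustrative assumptions, not taken from the original file:

def tokenize_function(examples):
    inputs = examples["input"] # Ensure this matches dataset key
    targets = examples["output"]
    # Illustrative guess at the elided lines: encode inputs and targets,
    # then attach the target token ids as labels for seq2seq training.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Note that in tatsu-lab/alpaca the "input" column is empty for many rows; a common variant builds the model input from "instruction" (plus "input" when present) instead. After training, the checkpoint saved to ./t5-finetuned can be reloaded with T5ForConditionalGeneration.from_pretrained("./t5-finetuned").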