import os

# Ensure the Hugging Face cache directory is writable.
# These variables are read when datasets/transformers are imported,
# so they must be set before the imports below.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

output_dir = "/tmp/t5-finetuned"
os.makedirs(output_dir, exist_ok=True)
# Load the Alpaca dataset and keep a 5,000-example subset to shorten training
dataset = load_dataset("tatsu-lab/alpaca")
dataset["train"] = dataset["train"].select(range(5000))

# Check dataset structure
print("Dataset splits available:", dataset)
print("Sample row:", dataset["train"][0])

# If no 'test' split exists, create one
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)

# Load tokenizer & model
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_enable(False) 

# Define tokenization function.
# Alpaca rows have "instruction", "input" (often empty) and "output" fields,
# so the prompt combines the instruction with the input rather than using "input" alone.
def tokenize_function(examples):
    inputs = [
        f"{instruction}\n{context}" if context else instruction
        for instruction, context in zip(examples["instruction"], examples["input"])
    ]
    targets = examples["output"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Assign train & eval datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

print("Dataset successfully split and tokenized.")

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,              # train for a single epoch only
    gradient_accumulation_steps=2,   # accumulate gradients to simulate a larger batch
    logging_steps=100,               # log less frequently
    save_steps=500,                  # save less frequently
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    fp16=True,                       # mixed precision; requires a CUDA GPU
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start fine-tuning
trainer.train()

print("Fine-tuning complete!")

# Save model and tokenizer locally so the checkpoint can be reloaded on its own
trainer.save_model("./t5-finetuned")
tokenizer.save_pretrained("./t5-finetuned")

print("Model saved successfully!")