import os

# Ensure the Hugging Face cache directories are writable.
# These variables are read when datasets/transformers are imported, so set them first.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
output_dir = "/tmp/t5-finetuned"
os.makedirs(output_dir, exist_ok=True)
# Load dataset
dataset = load_dataset("tatsu-lab/alpaca")
dataset["train"] = dataset["train"].select(range(5000))  # use a 5,000-example subset to keep the run short
# Check dataset structure
print("Dataset splits available:", dataset)
print("Sample row:", dataset["train"][0])
# If no 'test' split exists, create one
if "test" not in dataset:
dataset = dataset["train"].train_test_split(test_size=0.1)
# Load tokenizer & model
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_disable()  # keep gradient checkpointing off (it is also off by default)
# Define tokenization function
def tokenize_function(examples):
    # Alpaca rows have "instruction", "input" (often empty) and "output" columns;
    # join instruction and input to form the source text.
    inputs = [f"{ins}\n{inp}" if inp else ins for ins, inp in zip(examples["instruction"], examples["input"])]
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    # Mask padding tokens in the labels so the loss ignores them.
    model_inputs["labels"] = [[t if t != tokenizer.pad_token_id else -100 for t in lab] for lab in labels["input_ids"]]
    return model_inputs
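
# Optional sanity check (illustrative): run the function on a single example
# and confirm the padded sequence length before mapping the whole dataset.
_sample = tokenize_function(dataset["train"][:1])
print("Sample tokenized length:", len(_sample["input_ids"][0]))  # 512 after max_length padding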
# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Assign train & eval datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
print("Dataset successfully split and tokenized.")
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,              # train for 1 epoch only
    gradient_accumulation_steps=2,   # fewer optimizer steps to speed things up
    logging_steps=100,               # log less frequently
    save_steps=500,                  # save less frequently
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    fp16=True,                       # mixed precision; assumes a CUDA GPU
)
# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
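# The default data collator is enough here because every example is already padded
# to max_length; DataCollatorForSeq2Seq could be used instead for dynamic padding.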
# Start fine-tuning
trainer.train()
print("Fine-tuning complete!")
# Save model locally
trainer.save_model("./t5-finetuned")
print("Model saved successfully!")