import os

# Ensure the Hugging Face cache directory is writable. These must be set before
# importing datasets/transformers, which resolve their cache paths at import time.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
output_dir = "/tmp/t5-finetuned"
os.makedirs(output_dir, exist_ok=True)
# Load dataset
dataset = load_dataset("tatsu-lab/alpaca")  # Change this if using your own dataset
# Check dataset structure
print("Dataset splits available:", dataset)
print("Sample row:", dataset["train"][0])
# If no 'test' split exists, create one
if "test" not in dataset:
dataset = dataset["train"].train_test_split(test_size=0.1)
# Load tokenizer & model
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_enable()
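# Gradient checkpointing trades extra compute for lower activation memory,
# which is what makes t5-large trainable with the small batch size used below.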
# Define tokenization function
def tokenize_function(examples):
    inputs = examples["input"]    # Ensure this matches the dataset's column name
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
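# Note: with padding="max_length", pad tokens in "labels" are counted in the loss
# as written. A common refinement (an addition, not part of the original script) is
# to replace them with -100 inside tokenize_function so the loss ignores padding:
#     model_inputs["labels"] = [
#         [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
#         for seq in labels["input_ids"]
#     ]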
# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Assign train & eval datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
print("Dataset successfully split and tokenized.")
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,   # Reduced to 1 (was 2) to fit in memory
    per_device_eval_batch_size=1,    # Reduced to 1
    num_train_epochs=1,              # Test run (increase for full fine-tuning)
    gradient_accumulation_steps=4,   # Simulates a larger effective batch size
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    fp16=True,
)
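# Note: newer transformers releases rename "evaluation_strategy" to "eval_strategy";
# if TrainingArguments rejects the argument above, switch to the new name.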
# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
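# If you switch to dynamic padding (dropping padding="max_length" above), pass a
# transformers.DataCollatorForSeq2Seq(tokenizer, model=model) as data_collator here.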
# Start fine-tuning
trainer.train()
print("Fine-tuning complete!")
# Save model locally
trainer.save_model("./t5-finetuned")
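# Also save the tokenizer so the saved directory can be reloaded on its own
# (a convenience addition, not in the original script).
tokenizer.save_pretrained("./t5-finetuned")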
print("Model saved successfully!")