import os
# Ensure the Hugging Face cache directory is writable.
# These variables must be set before importing datasets/transformers,
# since both libraries read them at import time.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
# Load dataset
dataset = load_dataset("tatsu-lab/alpaca") # Change if using your dataset
# Check dataset structure
print("Dataset splits available:", dataset)
print("Sample row:", dataset["train"][0])
# If no 'test' split exists, create one
if "test" not in dataset:
dataset = dataset["train"].train_test_split(test_size=0.1)
# Load tokenizer & model
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Define tokenization function
def tokenize_function(examples):
    # Alpaca rows carry "instruction", "input" (often empty), and "output" fields;
    # join instruction and input so the model sees the full prompt
    inputs = [i if not x else i + "\n" + x for i, x in zip(examples["instruction"], examples["input"])]
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    # Replace label padding with -100 so padded positions are ignored by the loss
    model_inputs["labels"] = [[t if t != tokenizer.pad_token_id else -100 for t in seq] for seq in labels["input_ids"]]
    return model_inputs
# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Assign train & eval datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
print("Dataset successfully split and tokenized.")
# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5-finetuned",
    per_device_train_batch_size=2,  # Lowered to avoid memory issues
    per_device_eval_batch_size=2,
    num_train_epochs=1,  # Test run (increase for full fine-tuning)
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,  # Change to True to upload the model to HF Hub
)
# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Start fine-tuning
trainer.train()
print("Fine-tuning complete!")
# Save model locally
trainer.save_model("./t5-finetuned")
print("Model saved successfully!")