"""Fine-tune T5 on the tatsu-lab/alpaca instruction dataset with the HF Trainer."""

import os

# Cache env vars must be set BEFORE importing datasets/transformers —
# both libraries resolve their cache locations at import time, so setting
# these afterwards (as the original did) may have no effect.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"  # legacy alias; HF_HOME covers newer versions

from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# One directory for checkpoints AND the final save (the original wrote
# checkpoints to /app/t5-finetuned but saved the model to ./t5-finetuned).
OUTPUT_DIR = "/app/t5-finetuned"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load dataset (Alpaca columns: instruction, input, output, text).
# Change the repo id if using your own dataset.
dataset = load_dataset("tatsu-lab/alpaca")

# Check dataset structure
print("Dataset splits available:", dataset)
print("Sample row:", dataset["train"][0])

# If no 'test' split exists, hold out 10% for evaluation
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)

# Load tokenizer & model
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

MAX_LENGTH = 512


def tokenize_function(examples):
    """Tokenize a batch of Alpaca rows into input_ids/attention_mask/labels.

    The model source combines ``instruction`` with the (frequently empty)
    ``input`` context — tokenizing ``input`` alone, as the original did,
    drops the actual task description for most rows.  Pad positions in the
    labels are replaced with -100 so the loss ignores padding.
    """
    sources = [
        f"{instruction}\n{context}" if context else instruction
        for instruction, context in zip(examples["instruction"], examples["input"])
    ]
    targets = examples["output"]

    model_inputs = tokenizer(
        sources, max_length=MAX_LENGTH, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        targets, max_length=MAX_LENGTH, truncation=True, padding="max_length"
    )
    # -100 is the ignore_index of the cross-entropy loss used by T5;
    # without this masking the model is also trained to emit pad tokens.
    model_inputs["labels"] = [
        [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs


# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Assign train & eval datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
print("Dataset successfully split and tokenized.")

# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,  # small batch to avoid OOM with t5-large
    per_device_eval_batch_size=2,
    num_train_epochs=1,  # test run (increase for full fine-tuning)
    logging_steps=50,
    # NOTE(review): renamed to `eval_strategy` in transformers >= 4.46 —
    # keep `evaluation_strategy` unless the pinned version requires the new name.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,  # change to True to upload the model to HF Hub
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start fine-tuning
trainer.train()
print("Fine-tuning complete!")

# Save model locally, to the same directory the checkpoints use
trainer.save_model(OUTPUT_DIR)
print("Model saved successfully!")