# Import required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import login
import pandas as pd
import os
import torch
import matplotlib.pyplot as plt

# Step 1: Log in to Hugging Face
# Students: Replace with your actual Hugging Face token from https://huggingface.co/settings/tokens
hf_token = "YOUR_HUGGING_FACE_TOKEN"  # Replace YOUR_HUGGING_FACE_TOKEN with your token
if not hf_token or hf_token == "YOUR_HUGGING_FACE_TOKEN":  # Keep the placeholder string in this check unchanged
    raise ValueError("Please replace 'YOUR_HUGGING_FACE_TOKEN' with your actual Hugging Face token")
try:
    login(token=hf_token)
    print("Logged in to Hugging Face successfully")
except Exception as e:
    raise ValueError(f"Failed to log in to Hugging Face: {str(e)}")

# Step 2: Load and convert dataset
# Students: Replace with your dataset file name (CSV or JSON)
# The dataset must contain 'input' (question) and 'response' (answer) fields/columns.
dataset_name = "dataset.json"  # Change to "dataset.csv" if using CSV
dataset_path = dataset_name
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file '{dataset_path}' not found in the project folder")
if dataset_name.endswith('.csv'):
    # Convert CSV to JSON Lines for consistency
    print(f"Converting {dataset_name} to JSON format...")
    try:
        df = pd.read_csv(dataset_path)
        df.to_json('dataset.json', orient='records', lines=True)
        dataset_path = 'dataset.json'
    except Exception as e:
        raise ValueError(f"Failed to convert CSV to JSON: {str(e)}")

# Load dataset
print(f"Loading dataset from {dataset_path}...")
try:
    dataset = load_dataset('json', data_files=dataset_path)
except Exception as e:
    raise ValueError(f"Failed to load dataset: {str(e)}")

# Step 3: Split dataset into training and validation
# 85% training, 15% validation to monitor model performance
print("Splitting dataset into training and validation sets...")
train_test_split = dataset['train'].train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Step 4: Download and load tokenizer and model
print("Downloading T5-small model and tokenizer...")
try:
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    model.save_pretrained('./t5_small_weights')  # Save model weights locally for fine-tuning
    tokenizer.save_pretrained('./t5_small_weights')
    print("Model and tokenizer saved to './t5_small_weights'")
except Exception as e:
    raise ValueError(f"Failed to download or save model/tokenizer: {str(e)}")

# Step 5: Preprocess dataset
# This ensures the input questions and answers are properly tokenized for T5
def preprocess_data(examples):
    # Add "question:" prefix to inputs and clean whitespace
    inputs = ["question: " + q.strip() for q in examples['input']]
    targets = [r.strip() for r in examples['response']]
    # Tokenize inputs (questions)
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    # Tokenize labels (answers)
    labels = tokenizer(targets, max_length=64, truncation=True, padding='max_length')
    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    model_inputs['labels'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels['input_ids']
    ]
    return model_inputs

print("Preprocessing datasets...")
try:
    processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
    processed_eval_dataset = eval_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
except Exception as e:
    raise ValueError(f"Failed to preprocess dataset: {str(e)}")
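# Optional sanity check (a small sketch added for illustration, not part of the original steps):
# decode one preprocessed training example to confirm the "question:" prefix, truncation,
# and -100 label masking look right before committing to a full training run.
sample = processed_train_dataset[0]
print("Sample encoded input :", tokenizer.decode(sample['input_ids'], skip_special_tokens=True))
print("Sample encoded target:", tokenizer.decode(
    [tok for tok in sample['labels'] if tok != -100], skip_special_tokens=True))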
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,  # Increased for better convergence
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=5e-4,  # Increased for faster learning
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
)

# Step 7: Initialize Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
)

# Step 8: Train the model
print("Starting training...")
try:
    trainer.train()
    print("Training finished.")
except Exception as e:
    raise ValueError(f"Training failed: {str(e)}")

# Step 9: Plot training and validation loss
print("Generating training and validation loss plot...")
logs = trainer.state.log_history
# Collect steps and losses separately so training and validation curves stay aligned
train_steps = [log['step'] for log in logs if 'loss' in log]
train_loss = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_loss = [log['eval_loss'] for log in logs if 'eval_loss' in log]

plt.figure(figsize=(10, 5))
if train_loss:
    plt.plot(train_steps, train_loss, label='Training Loss')
if eval_loss:
    plt.plot(eval_steps, eval_loss, label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()
plt.grid(True)
plt.savefig('training_metrics.png')
plt.show()

# Step 10: Save the fine-tuned model
final_model_save_path = './finetuned_t5'
model.save_pretrained(final_model_save_path)
tokenizer.save_pretrained(final_model_save_path)
print(f"Model fine-tuned and saved to '{final_model_save_path}'")
print("Training metrics plot saved as 'training_metrics.png'")
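# Optional inference sketch (added for illustration; the example question and generation
# settings are assumptions, not part of the original script): reload the fine-tuned model
# from './finetuned_t5' and answer a question using the same "question:" prefix as training.
finetuned_tokenizer = T5Tokenizer.from_pretrained(final_model_save_path)
finetuned_model = T5ForConditionalGeneration.from_pretrained(final_model_save_path)
finetuned_model.eval()
sample_question = "question: What does this model do?"  # Hypothetical example prompt
input_ids = finetuned_tokenizer(sample_question, return_tensors='pt').input_ids
with torch.no_grad():
    output_ids = finetuned_model.generate(input_ids, max_length=64, num_beams=4)
print("Model answer:", finetuned_tokenizer.decode(output_ids[0], skip_special_tokens=True))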