remiai3 committed on
Commit b5d3966 · verified · 1 Parent(s): c231015

Upload t5_project_all_in_one.py

Files changed (1)
  1. t5_project_all_in_one.py +58 -45
t5_project_all_in_one.py CHANGED
@@ -8,28 +8,38 @@ import torch
 import matplotlib.pyplot as plt
 
 # Step 1: Log in to Hugging Face
-# Students: Replace "YOUR_HUGGING_FACE_TOKEN" with your actual Hugging Face token from https://huggingface.co/settings/tokens
-hf_token = "YOUR_HUGGING_FACE_TOKEN"
-if not hf_token or hf_token == "YOUR_HUGGING_FACE_TOKEN":
-    raise ValueError("Please replace 'YOUR_HUGGING_FACE_TOKEN' in the code with your actual Hugging Face token")
-login(token=hf_token)
-print("Logged in to Hugging Face successfully")
+# Students: Replace with your actual Hugging Face token from https://huggingface.co/settings/tokens
+hf_token = "YOUR_HUGGING_FACE_TOKEN" #Replace your YOUR_HUGGING_FACE_TOKEN here
+if not hf_token or hf_token == "YOUR_HUGGING_FACE_TOKEN": # Don't replace here
+    raise ValueError("Please replace 'YOUR_HUGGING_FACE_TOKEN' with your actual Hugging Face token")
+try:
+    login(token=hf_token)
+    print("Logged in to Hugging Face successfully")
+except Exception as e:
+    raise ValueError(f"Failed to log in to Hugging Face: {str(e)}")
 
 # Step 2: Load and convert dataset
-# Students: Replace "dataset.csv" or "dataset.json" with your dataset file name
-dataset_name = "dataset.csv" # Change to "dataset.json" if using JSON
+# Students: Replace with your dataset file name (CSV or JSON)
+dataset_name = "dataset.json" # Change to "dataset.csv" if using CSV
 dataset_path = dataset_name
+if not os.path.exists(dataset_path):
+    raise FileNotFoundError(f"Dataset file '{dataset_path}' not found in the project folder")
 if dataset_name.endswith('.csv'):
-    # Convert CSV to JSON for consistency
+    # Convert CSV to JSON for consistency
     print(f"Converting {dataset_name} to JSON format...")
-    df = pd.read_csv(dataset_path)
-    df.to_json('dataset.json', orient='records', lines=True)
-    dataset_path = 'dataset.json'
+    try:
+        df = pd.read_csv(dataset_path)
+        df.to_json('dataset.json', orient='records', lines=True)
+        dataset_path = 'dataset.json'
+    except Exception as e:
+        raise ValueError(f"Failed to convert CSV to JSON: {str(e)}")
 
 # Load dataset
 print(f"Loading dataset from {dataset_path}...")
-dataset = load_dataset('json', data_files=dataset_path)
-
+try:
+    dataset = load_dataset('json', data_files=dataset_path)
+except Exception as e:
+    raise ValueError(f"Failed to load dataset: {str(e)}")
 # Step 3: Split dataset into training and validation
 # 85% training, 15% validation to monitor model performance
 print("Splitting dataset into training and validation sets...")
@@ -39,12 +49,14 @@ eval_dataset = train_test_split['test']
 
 # Step 4: Download and load tokenizer and model
 print("Downloading T5-small model and tokenizer...")
-tokenizer = T5Tokenizer.from_pretrained('t5-small')
-model = T5ForConditionalGeneration.from_pretrained('t5-small')
-# Save model weights locally for fine-tuning
-model.save_pretrained('./t5_small_weights')
-tokenizer.save_pretrained('./t5_small_weights')
-print("Model and tokenizer saved to './t5_small_weights'")
+try:
+    tokenizer = T5Tokenizer.from_pretrained('t5-small')
+    model = T5ForConditionalGeneration.from_pretrained('t5-small')
+    model.save_pretrained('./t5_small_weights') # Save model weights locally for fine-tuning
+    tokenizer.save_pretrained('./t5_small_weights')
+    print("Model and tokenizer saved to './t5_small_weights'")
+except Exception as e:
+    raise ValueError(f"Failed to download or save model/tokenizer: {str(e)}")
 
 # Step 5: Preprocess dataset
 # This ensures the input questions and answers are properly tokenized for T5
@@ -52,43 +64,42 @@ def preprocess_data(examples):
     # Add "question:" prefix to inputs and clean whitespace
    inputs = ["question: " + q.strip() for q in examples['input']]
    targets = [r.strip() for r in examples['response']]
-    # Tokenize inputs (questions)
+    # Tokenize inputs (questions)
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
     # Tokenize labels (answers)
    labels = tokenizer(targets, max_length=64, truncation=True, padding='max_length')
-    # Replace pad token IDs in labels with -100 to ignore them in loss calculation
    model_inputs['labels'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
    ]
    return model_inputs
 
-# Apply preprocessing to training and validation datasets
 print("Preprocessing datasets...")
-processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
-processed_eval_dataset = eval_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
+try:
+    processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
+    processed_eval_dataset = eval_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
+except Exception as e:
+    raise ValueError(f"Failed to preprocess dataset: {str(e)}")
 
 # Step 6: Define training arguments
-# These settings control how the model is fine-tuned
 training_args = TrainingArguments(
-    output_dir='./results', # Directory to save training outputs
-    num_train_epochs=10, # Number of training iterations over the dataset
-    per_device_train_batch_size=2, # Batch size per device (GPU/CPU)
-    gradient_accumulation_steps=2, # Accumulate gradients to simulate larger batch size
-    learning_rate=3e-4, # Learning rate for optimization
-    save_steps=500, # Save model checkpoint every 500 steps
-    save_total_limit=2, # Keep only the last 2 checkpoints
-    logging_steps=50, # Log training metrics every 50 steps
-    eval_strategy="steps", # Evaluate model during training at regular intervals
-    eval_steps=100, # Evaluate every 100 steps
-    load_best_model_at_end=True, # Load the best model based on validation loss
-    metric_for_best_model="eval_loss", # Use validation loss to select best model
-    greater_is_better=False, # Lower validation loss is better
-    gradient_checkpointing=True, # Save memory during training
-    max_grad_norm=1.0, # Clip gradients to prevent exploding gradients
+    output_dir='./results',
+    num_train_epochs=15, # Increased for better convergence
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=2,
+    learning_rate=5e-4, # Increased for faster learning
+    save_steps=500,
+    save_total_limit=2,
+    logging_steps=50,
+    eval_strategy="steps",
+    eval_steps=100,
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_loss",
+    greater_is_better=False,
+    gradient_checkpointing=True,
+    max_grad_norm=1.0,
 )
 
 # Step 7: Initialize Trainer
-# The Trainer handles the fine-tuning process
 print("Initializing Trainer...")
 trainer = Trainer(
     model=model,
@@ -99,11 +110,13 @@ trainer = Trainer(
 
 # Step 8: Train the model
 print("Starting training...")
-trainer.train()
-print("Training finished.")
+try:
+    trainer.train()
+    print("Training finished.")
+except Exception as e:
+    raise ValueError(f"Training failed: {str(e)}")
 
 # Step 9: Plot training and validation loss
-# This helps students visualize model performance
 print("Generating training and validation loss plot...")
 logs = trainer.state.log_history
 steps = [log['step'] for log in logs if 'loss' in log or 'eval_loss' in log]
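
One thing to note about the new Step 2 guard: it calls os.path.exists, but none of the visible hunks add an import for os, and the script's import block (lines 1-7) lies outside the diff context. If os is not already imported there, the check would need something like the following sketch (the import line is an assumption, not part of this commit):

import os  # assumed: required for the existence check below unless already imported at the top of the script

dataset_path = dataset_name
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file '{dataset_path}' not found in the project folder")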
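The Trainer(...) call in Step 7 is cut off at model=model, by the hunk boundary, so its remaining arguments are not shown in this commit. A minimal sketch of how the objects defined in Steps 4-6 are typically wired together (the exact argument list here is an assumption, not the author's code):

trainer = Trainer(
    model=model,                            # T5ForConditionalGeneration loaded in Step 4
    args=training_args,                     # TrainingArguments from Step 6
    train_dataset=processed_train_dataset,  # tokenized 85% split from Step 5
    eval_dataset=processed_eval_dataset,    # tokenized 15% split, needed for eval_strategy="steps"
)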
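Step 9 is likewise truncated by the last hunk, right after the steps list comprehension. A minimal sketch of how trainer.state.log_history is commonly split into training and validation curves with matplotlib (variable and file names beyond those shown are assumptions, not the script's actual continuation):

logs = trainer.state.log_history
# Trainer logs training loss under the 'loss' key and validation loss under 'eval_loss'
train_steps = [log['step'] for log in logs if 'loss' in log]
train_losses = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_losses = [log['eval_loss'] for log in logs if 'eval_loss' in log]

plt.plot(train_steps, train_losses, label='Training loss')
plt.plot(eval_steps, eval_losses, label='Validation loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.legend()
plt.savefig('loss_plot.png')  # assumed output file name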