cconsti committed · verified
Commit: 6e565e2 · Parent(s): 9810d0f

Update train.py

Files changed (1):
  1. train.py +32 -53
train.py CHANGED
@@ -1,40 +1,29 @@
- import torch
- from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
- from datasets import load_dataset
  import os
+ from datasets import load_dataset
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

- # Set Hugging Face cache environment variables
+ # Ensure Hugging Face cache directory is writable
  os.environ["HF_HOME"] = "/app/hf_cache"
  os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
  os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

- # Load dataset (Replace this with your dataset)
- dataset = load_dataset("tatsu-lab/alpaca")  # Example alternative dataset
+ # Load dataset
+ dataset = load_dataset("tatsu-lab/alpaca")  # Change if using your dataset

- # Check available dataset splits
+ # Check dataset structure
  print("Dataset splits available:", dataset)
+ print("Sample row:", dataset["train"][0])

- # If "test" split is missing, use a portion of "train" split
+ # If no 'test' split exists, create one
  if "test" not in dataset:
-     dataset = dataset["train"].train_test_split(test_size=0.1)  # Split 10% for testing
-
- # Tokenize dataset
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
- # Assign datasets
- train_dataset = tokenized_datasets["train"]
- eval_dataset = tokenized_datasets["test"]  # This is now safely created
-
- # Debug output
- print("Dataset successfully split into train and test sets")
+     dataset = dataset["train"].train_test_split(test_size=0.1)

- # Load model and tokenizer
+ # Load tokenizer & model
  model_name = "t5-large"
  tokenizer = T5Tokenizer.from_pretrained(model_name)
  model = T5ForConditionalGeneration.from_pretrained(model_name)

- # Tokenization function
- # Define tokenization function before mapping
+ # Define tokenization function
  def tokenize_function(examples):
      inputs = examples["input"]  # Ensure this matches dataset key
      targets = examples["output"]
@@ -45,51 +34,41 @@ def tokenize_function(examples):
      model_inputs["labels"] = labels["input_ids"]
      return model_inputs

- # Check dataset structure
- print("Dataset splits available:", dataset)
-
- # If "test" split is missing, create one
- if "test" not in dataset:
-     dataset = dataset["train"].train_test_split(test_size=0.1)
-
- # Tokenize dataset
+ # Tokenize dataset
  tokenized_datasets = dataset.map(tokenize_function, batched=True)

- # Assign train and eval datasets
+ # Assign train & eval datasets
  train_dataset = tokenized_datasets["train"]
  eval_dataset = tokenized_datasets["test"]

- # Debug output
- print("Dataset successfully split and tokenized")
-
+ print("Dataset successfully split and tokenized.")

- # Apply tokenization
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
- # Training arguments
+ # Define training arguments
  training_args = TrainingArguments(
      output_dir="./t5-finetuned",
-     per_device_train_batch_size=2,  # Smaller batch to avoid memory errors
-     per_device_eval_batch_size=2,  # Smaller eval batch
-     save_total_limit=1,  # Keep only 1 checkpoint
-     num_train_epochs=1,  # Quick test with 1 epoch
-     logging_steps=50,  # More frequent logging
-     evaluation_strategy="epoch",  # Evaluate only at the end of the epoch
-     save_strategy="epoch",  # Save only at the end of the epoch
-     push_to_hub=False  # Avoid pushing test model to Hugging Face Hub
+     per_device_train_batch_size=2,  # Lowered to avoid memory issues
+     per_device_eval_batch_size=2,
+     num_train_epochs=1,  # Test run (increase for full fine-tuning)
+     logging_steps=50,
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     push_to_hub=False  # Change to True to upload the model to HF Hub
  )

- # Trainer setup
+ # Set up Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
-     train_dataset=train_dataset,  # Now correctly assigned
-     eval_dataset=eval_dataset,  # No more KeyError
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
  )

-
- # Train the model
+ # Start fine-tuning
  trainer.train()

- # Save and push model to Hugging Face Hub
- trainer.push_to_hub("your-hf-username/t5-cover-letter")
+ print("Fine-tuning complete!")
+
+ # Save model locally
+ trainer.save_model("./t5-finetuned")
+
+ print("Model saved successfully!")