cconsti committed · verified
Commit: 6e565e2 · Parent(s): 9810d0f

Update train.py

Files changed (1):
  1. train.py +32 -53
train.py CHANGED
@@ -1,40 +1,29 @@
- import torch
- from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
- from datasets import load_dataset
  import os
+ from datasets import load_dataset
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

- # Set Hugging Face cache environment variables
+ # Ensure Hugging Face cache directory is writable
  os.environ["HF_HOME"] = "/app/hf_cache"
  os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
  os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

- # Load dataset (Replace this with your dataset)
- dataset = load_dataset("tatsu-lab/alpaca")  # Example alternative dataset
+ # Load dataset
+ dataset = load_dataset("tatsu-lab/alpaca")  # Change if using your dataset

- # Check available dataset splits
+ # Check dataset structure
  print("Dataset splits available:", dataset)
+ print("Sample row:", dataset["train"][0])

- # If "test" split is missing, use a portion of "train" split
+ # If no 'test' split exists, create one
  if "test" not in dataset:
-     dataset = dataset["train"].train_test_split(test_size=0.1)  # Split 10% for testing
-
- # Tokenize dataset
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
- # Assign datasets
- train_dataset = tokenized_datasets["train"]
- eval_dataset = tokenized_datasets["test"]  # This is now safely created
-
- # Debug output
- print("Dataset successfully split into train and test sets")
+     dataset = dataset["train"].train_test_split(test_size=0.1)

- # Load model and tokenizer
+ # Load tokenizer & model
  model_name = "t5-large"
  tokenizer = T5Tokenizer.from_pretrained(model_name)
  model = T5ForConditionalGeneration.from_pretrained(model_name)

- # Tokenization function
- # Define tokenization function before mapping
+ # Define tokenization function
  def tokenize_function(examples):
      inputs = examples["input"]  # Ensure this matches dataset key
      targets = examples["output"]
@@ -45,51 +34,41 @@ def tokenize_function(examples):
      model_inputs["labels"] = labels["input_ids"]
      return model_inputs

- # Check dataset structure
- print("Dataset splits available:", dataset)
-
- # If "test" split is missing, create one
- if "test" not in dataset:
-     dataset = dataset["train"].train_test_split(test_size=0.1)
-
- # Tokenize dataset
+ # Tokenize dataset
  tokenized_datasets = dataset.map(tokenize_function, batched=True)

- # Assign train and eval datasets
+ # Assign train & eval datasets
  train_dataset = tokenized_datasets["train"]
  eval_dataset = tokenized_datasets["test"]

- # Debug output
- print("Dataset successfully split and tokenized")
-
+ print("Dataset successfully split and tokenized.")

- # Apply tokenization
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
- # Training arguments
+ # Define training arguments
  training_args = TrainingArguments(
      output_dir="./t5-finetuned",
-     per_device_train_batch_size=2,  # Smaller batch to avoid memory errors
-     per_device_eval_batch_size=2,  # Smaller eval batch
-     save_total_limit=1,  # Keep only 1 checkpoint
-     num_train_epochs=1,  # Quick test with 1 epoch
-     logging_steps=50,  # More frequent logging
-     evaluation_strategy="epoch",  # Evaluate only at the end of the epoch
-     save_strategy="epoch",  # Save only at the end of the epoch
-     push_to_hub=False  # Avoid pushing test model to Hugging Face Hub
+     per_device_train_batch_size=2,  # Lowered to avoid memory issues
+     per_device_eval_batch_size=2,
+     num_train_epochs=1,  # Test run (increase for full fine-tuning)
+     logging_steps=50,
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     push_to_hub=False  # Change to True to upload the model to HF Hub
  )

- # Trainer setup
+ # Set up Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
-     train_dataset=train_dataset,  # Now correctly assigned
-     eval_dataset=eval_dataset,  # No more KeyError
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
  )

-
- # Train the model
+ # Start fine-tuning
  trainer.train()

- # Save and push model to Hugging Face Hub
- trainer.push_to_hub("your-hf-username/t5-cover-letter")
+ print("Fine-tuning complete!")
+
+ # Save model locally
+ trainer.save_model("./t5-finetuned")
+
+ print("Model saved successfully!")