import os

# Set Hugging Face cache environment variables before importing the libraries
# so the cache location is picked up when they initialize
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load dataset (Replace this with your dataset)
dataset = load_dataset("tatsu-lab/alpaca")  # Example alternative dataset
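
# Note: tatsu-lab/alpaca ships only a "train" split; each record has the columns
# "instruction", "input" (often empty), "output", and a pre-formatted "text" field.
# The tokenization function below builds the model input from "instruction" + "input".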

# Check available dataset splits
print("Dataset splits available:", dataset)

# If "test" split is missing, use a portion of "train" split
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)  # Split 10% for testing

# Load model and tokenizer
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
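# t5-large has roughly 770M parameters; a smaller checkpoint such as "t5-base"
# can be swapped in here for faster smoke tests of the pipeline.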

# Tokenization function: build the prompt from the Alpaca-style fields
# ("instruction" plus the optional "input") and tokenize prompts and targets
def tokenize_function(examples):
    inputs = [
        instruction + ("\n" + extra if extra else "")
        for instruction, extra in zip(examples["instruction"], examples["input"])
    ]
    targets = examples["output"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Apply tokenization to every split
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Assign datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]  # Created by the train_test_split above

# Debug output
print("Dataset successfully split and tokenized")

# Training arguments
training_args = TrainingArguments(
    output_dir="./t5-finetuned",
    per_device_train_batch_size=2,  # Smaller batch to avoid memory errors
    per_device_eval_batch_size=2,  # Smaller eval batch
    save_total_limit=1,  # Keep only 1 checkpoint
    num_train_epochs=1,  # Quick test with 1 epoch
    logging_steps=50,  # More frequent logging
    evaluation_strategy="epoch",  # Evaluate only at the end of the epoch
    save_strategy="epoch",  # Save only at the end of the epoch
    push_to_hub=False  # Avoid pushing test model to Hugging Face Hub
)
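
# Rough scale check (assumes a single GPU, the 90/10 split above, and no gradient
# accumulation): Alpaca's ~52k examples give ~46,800 training rows, so one epoch at
# per_device_train_batch_size=2 is on the order of 23,000 optimizer steps.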

# Trainer setup; the tokenizer is passed so it is saved alongside the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)


# Train the model
trainer.train()

# Save the fine-tuned model locally (push_to_hub=False above keeps this test run off the Hub).
# To publish instead, set push_to_hub=True and hub_model_id="your-hf-username/t5-cover-letter"
# in TrainingArguments, then call trainer.push_to_hub() after training.
trainer.save_model("./t5-finetuned")
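
# Optional sanity check (illustrative sketch, not part of the original script):
# reload the saved checkpoint and generate from a hypothetical example prompt to
# confirm the fine-tuned weights load and produce text.
finetuned_model = T5ForConditionalGeneration.from_pretrained("./t5-finetuned")
sample_prompt = "Write a short cover letter for a junior data analyst position."  # hypothetical prompt
sample_inputs = tokenizer(sample_prompt, return_tensors="pt")
generated_ids = finetuned_model.generate(**sample_inputs, max_new_tokens=200)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))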