import os
# Ensure the Hugging Face cache directory is writable.
# These variables must be set before importing datasets/transformers,
# since both libraries read them at import time.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
# Load dataset
dataset = load_dataset("tatsu-lab/alpaca") # Change if using your dataset
# Check dataset structure
print("Dataset splits available:", dataset)
print("Sample row:", dataset["train"][0])
# If no 'test' split exists, create one
if "test" not in dataset:
dataset = dataset["train"].train_test_split(test_size=0.1)
# Load tokenizer & model
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Define tokenization function
def tokenize_function(examples):
    # Alpaca rows carry "instruction", "input" (often empty), and "output" fields;
    # join instruction and input so the model sees the full prompt
    inputs = [i if not x else i + "\n" + x for i, x in zip(examples["instruction"], examples["input"])]
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    # Replace label padding with -100 so padded positions are ignored by the loss
    model_inputs["labels"] = [[t if t != tokenizer.pad_token_id else -100 for t in seq] for seq in labels["input_ids"]]
    return model_inputs
# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Assign train & eval datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
print("Dataset successfully split and tokenized.")
# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5-finetuned",
    per_device_train_batch_size=2,  # Lowered to avoid memory issues
    per_device_eval_batch_size=2,
    num_train_epochs=1,  # Test run (increase for full fine-tuning)
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,  # Change to True to upload the model to HF Hub
)
# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Start fine-tuning
trainer.train()
print("Fine-tuning complete!")
# Save model locally
trainer.save_model("./t5-finetuned")
print("Model saved successfully!")