import os

# Hugging Face cache locations must be set before transformers/datasets are
# imported, since both libraries read these variables at import time.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# tatsu-lab/alpaca ships only a "train" split, so carve out a held-out
# evaluation set for the Trainer below.
dataset = load_dataset("tatsu-lab/alpaca")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# t5-large has roughly 770M parameters; the small per-device batch size
# below reflects that.
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def tokenize_function(examples):
    # With batched=True, `examples` is a dict of column lists, not a list of
    # rows. Build each source text from the instruction plus optional input.
    inputs = [
        instruction + ("\n" + context if context else "")
        for instruction, context in zip(examples["instruction"], examples["input"])
    ]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["output"], max_length=512, truncation=True, padding="max_length")
    # Mask label padding with -100 so the loss ignores those positions.
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs


tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,  # drop the raw text columns
)
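
# Optional sanity check, purely illustrative: decode one tokenized training
# example to confirm the instruction/input concatenation looks as intended.
sample = tokenized_datasets["train"][0]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=True))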

training_args = TrainingArguments(
    output_dir="./t5-finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_total_limit=1,
    num_train_epochs=1,
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # push_to_hub must be enabled here, and hub_model_id names the target
    # repo, for trainer.push_to_hub() at the end of the script.
    push_to_hub=True,
    hub_model_id="your-hf-username/t5-cover-letter",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # held-out split created above
    tokenizer=tokenizer,  # saved and pushed alongside the model
)

trainer.train()

# push_to_hub() takes a commit message as its positional argument, not a repo
# name; the destination repo comes from hub_model_id in TrainingArguments.
trainer.push_to_hub()
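
# Illustrative inference sketch (assumes the run above finished, so the model
# was saved to output_dir; the prompt is a made-up example).
finetuned = T5ForConditionalGeneration.from_pretrained("./t5-finetuned")
prompt = "Write a short cover letter for a data analyst position."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
output_ids = finetuned.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))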