import os

# T5Tokenizer needs the sentencepiece package for its vocabulary model
os.system("pip install sentencepiece")

from datasets import load_dataset
from transformers import T5Tokenizer

# Load dataset
dataset = load_dataset('json', data_files='dataset.json')

# Load T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Preprocess function
def preprocess_function(examples):
    inputs = ["question: " + q for q in examples['input']]
    targets = examples['response']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=32, truncation=True, padding='max_length')
    # Mask padding token ids in the labels with -100 so the
    # cross-entropy loss ignores them during training
    model_inputs['labels'] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels['input_ids']
    ]
    return model_inputs

# Apply preprocessing
processed_dataset = dataset.map(preprocess_function, batched=True)

# Save processed dataset
processed_dataset.save_to_disk('processed_dataset')
print("Dataset preprocessed and saved to 'processed_dataset'")
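# Assumed input format (a sketch, not confirmed by the script itself):
# 'dataset.json' is expected to carry the 'input' and 'response' string
# fields read by preprocess_function, e.g. as JSON Lines records:
#   {"input": "What is the capital of France?", "response": "Paris"}
#   {"input": "Who wrote Hamlet?", "response": "William Shakespeare"}

# Usage sketch: the saved dataset can later be reloaded for training
# with datasets.load_from_disk.
from datasets import load_from_disk

reloaded = load_from_disk('processed_dataset')
print(reloaded)                              # DatasetDict with a 'train' split
print(reloaded['train'][0]['labels'][:10])   # first few label ids of example 0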