from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Load a small pre-trained model and tokenizer
model_name = "distilgpt2"  # or choose another small model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a pad token (setting it to eos_token is one common approach for GPT-based models)
tokenizer.pad_token = tokenizer.eos_token  # Or you can choose to add a new pad token, e.g., '[PAD]'

# Load the dataset (make sure data.json is in the correct location)
train_data = load_dataset("json", data_files={"train": "data.json"})

# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples["input"]
    outputs = examples["output"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Preprocess the train dataset using the map function
train_dataset = train_data["train"].map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir="./logs",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./lockin_model")
tokenizer.save_pretrained("./lockin_model")
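
# ----------------------------------------------------------------------
# Illustrative follow-up (not part of the original script): a minimal
# sketch of how the saved model in ./lockin_model could be reloaded for
# inference. The prompt string below is a hypothetical placeholder.
# ----------------------------------------------------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer

fine_tuned_model = AutoModelForCausalLM.from_pretrained("./lockin_model")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./lockin_model")

prompt = "Example prompt:"  # hypothetical prompt for demonstration
inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
generated = fine_tuned_model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=fine_tuned_tokenizer.eos_token_id,  # pad == eos, as set during training
)
print(fine_tuned_tokenizer.decode(generated[0], skip_special_tokens=True))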