| """ | |
| MT564 TinyLlama Training Script | |
| This script fine-tunes a TinyLlama model on MT564 format specifications data. | |
| we are using the model model_name = "sshleifer/tiny-gpt2" # Or "distilgpt2" | |
| for better meory | |
| """ | |
import os
import json
import argparse
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(description="Fine-tune TinyLlama on MT564 format specifications")
    parser.add_argument(
        "--model_name",
        type=str,
        # default="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        default="sshleifer/tiny-gpt2",
        help="Hugging Face model ID or path to local model"
    )
    parser.add_argument(
        "--training_data",
        type=str,
        default="mt564_training_data.json",
        help="Path to the MT564 training data JSON file"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./mt564_tinyllama_model",
        help="Directory to save the fine-tuned model"
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=3,
        help="Number of training epochs"
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=2,
        help="Training batch size"
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=4,
        help="Number of update steps to accumulate before performing a backward/update pass"
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=2e-5,
        help="Learning rate"
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=512,
        help="Maximum sequence length"
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use fp16 (mixed) precision training"
    )
    parser.add_argument(
        "--eval_ratio",
        type=float,
        default=0.1,
        help="Ratio of data to use for evaluation"
    )
    return parser.parse_args()


def format_training_data(data):
    """Format training data for the model."""
    formatted_data = []
    for item in data:
        # Format as a chat-like conversation for TinyLlama
        formatted_text = f"<|im_start|>user\n{item['instruction']}<|im_end|>\n<|im_start|>assistant\n{item['response']}<|im_end|>"
        formatted_data.append({"text": formatted_text})
    return formatted_data
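
# Expected structure of the training data file: a JSON list of objects with
# "instruction" and "response" keys, e.g. (placeholder values only):
#
#   [
#     {
#       "instruction": "<question about the MT564 format>",
#       "response": "<answer text>"
#     }
#   ]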


def main():
    args = parse_args()

    try:
        # Import necessary libraries
        import torch
        from transformers import (
            AutoModelForCausalLM,
            AutoTokenizer,
            Trainer,
            TrainingArguments,
            DataCollatorForLanguageModeling
        )
        from datasets import Dataset
    except ImportError:
        logger.error("Required libraries not installed. Please install torch, transformers, and datasets.")
        return

    # Load training data
    logger.info(f"Loading training data from {args.training_data}")
    with open(args.training_data, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Format data for training
    formatted_data = format_training_data(data)
    logger.info(f"Formatted {len(formatted_data)} training examples")

    # Create dataset
    dataset = Dataset.from_list(formatted_data)

    # Split into training and evaluation sets
    dataset = dataset.train_test_split(test_size=args.eval_ratio)
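    # Note: train_test_split returns a DatasetDict with "train" and "test"
    # splits, which are used below as dataset["train"] / dataset["test"].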

    # Load tokenizer
    logger.info(f"Loading tokenizer for {args.model_name}")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Ensure the tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
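        # GPT-2-style tokenizers ship without a pad token, so fall back to EOS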

    # Tokenize datasets
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=args.max_length
        )

    logger.info("Tokenizing datasets")
    tokenized_train = dataset["train"].map(tokenize_function, batched=True)
    tokenized_eval = dataset["test"].map(tokenize_function, batched=True)

    # Create label column for causal language modeling
    tokenized_train = tokenized_train.map(
        lambda examples: {"labels": examples["input_ids"]},
        batched=True
    )
    tokenized_eval = tokenized_eval.map(
        lambda examples: {"labels": examples["input_ids"]},
        batched=True
    )

    # Remove the text column as it's no longer needed
    tokenized_train = tokenized_train.remove_columns(["text"])
    tokenized_eval = tokenized_eval.remove_columns(["text"])

    # Load model
    logger.info(f"Loading model {args.model_name}")
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        torch_dtype=torch.float16 if args.fp16 and torch.cuda.is_available() else torch.float32
    )
    # Keep the model on CPU to limit memory use on Mac machines without CUDA
    device = torch.device("cpu")
    model.to(device)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        weight_decay=0.01,
        warmup_steps=100,
        logging_dir=f"{args.output_dir}/logs",
        logging_steps=10,
        eval_steps=100,
        save_steps=100,
        evaluation_strategy="steps",
        save_total_limit=2,
        fp16=args.fp16 and torch.cuda.is_available(),
        report_to="none"
    )
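    # The effective batch size is per_device_train_batch_size multiplied by
    # gradient_accumulation_steps (2 * 4 = 8 per device with the defaults).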

    # Set up data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
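    # With mlm=False the collator rebuilds labels from input_ids at batch time
    # and masks padding tokens with -100, so padded positions do not contribute
    # to the loss (this effectively supersedes the manual labels column above).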

    # Set up trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        data_collator=data_collator,
    )

    # Start training
    logger.info("Starting fine-tuning")
    trainer.train()

    # Save the model
    logger.info(f"Saving model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    logger.info("Fine-tuning complete!")
| if __name__ == "__main__": | |
| main() |
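
# Example invocation (the script filename and paths are illustrative; adjust
# them to your environment):
#
#   python mt564_train.py \
#       --training_data mt564_training_data.json \
#       --output_dir ./mt564_tinyllama_model \
#       --epochs 3 --batch_size 2 --fp16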