In [None]:
# Environment setup: every command below installs packages into the runtime.
!pip install tqdm
!pip install transformers==4.40.1
!pip install sentencepiece
!pip install datasets
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install trl
!pip install triton
!pip install bitsandbytes
!pip install --no-deps trl peft accelerate bitsandbytes
!pip install xformers
# `pytorch-cuda` is a conda meta-package and has no PyPI distribution, so asking pip
# for it makes the whole command fail. CUDA 12.1 builds are requested from the
# PyTorch wheel index instead.
!pip install torch xformers --index-url https://download.pytorch.org/whl/cu121
#!pip install --no-deps xformers trl peft accelerate bitsandbytes
#!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install hyperopt
!pip install optuna

In [None]:
# Environment diagnostics: run the xformers and bitsandbytes self-report modules,
# then show the GPU status reported by the NVIDIA driver tool.
!python -m xformers.info
!python -m bitsandbytes
!nvidia-smi


In [None]:
import json
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel
print(torch.__version__)
print(torch.version.cuda)

In [None]:
# Defining the configuration for the base model, LoRA and training.
# The three nested sections are read by the model-loading, PEFT-setup and trainer cells below.
config = {
 "hugging_face_username":"ruslanmv",
 "model_config": {
 "base_model":"meta-llama/Meta-Llama-3-8B-Instruct", # The base model
 "finetuned_model":"ruslanmv/Medical-Mind-Llama-3-8b", # The finetuned model
 "max_seq_length": 2048, # The maximum sequence length
 # "dtype":torch.float16, # The data type
 # "dtype": torch.float32, # Use float32 instead of half CUDA capability < 8
 "dtype" : None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

 "load_in_4bit": True, # Load the model in 4-bit
 },
 "lora_config": {
 "r": 16, # The LoRA rank (commonly 8, 16, 32, 64)
 "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
 "gate_proj", "up_proj", "down_proj"], # The target modules
 "lora_alpha":16, # The alpha value for LoRA
 #"lora_alpha":15, # The alpha value for LoRA by search grid
 "lora_dropout":0, # The dropout value for LoRA
 "bias":"none", # The bias for LoRA
 "use_gradient_checkpointing":True, # Use gradient checkpointing
 "use_rslora":False, # Use RSLora
 "use_dora":False, # Use DoRa
 "loftq_config":None # The LoFTQ configuration
 },

 "training_config": {
 "per_device_train_batch_size": 2, # The batch size
 #"per_device_train_batch_size": 6, # The batch size by search grid
 "gradient_accumulation_steps": 4, # The gradient accumulation steps
 #"gradient_accumulation_steps": 7, # The gradient accumulation steps by search grid
 "warmup_steps": 5, # The warmup steps
 "max_steps":0, # The maximum steps (0 if the epochs are defined)
 "num_train_epochs": 1, # The number of training epochs(0 if the maximum steps are defined)
 "learning_rate": 2e-4, # The learning rate
 #"learning_rate": 9.5e-05, # The learning rate by search grid
 "fp16": not torch.cuda.is_bf16_supported(), # Use fp16 only when bf16 is not supported
 "bf16": torch.cuda.is_bf16_supported(), # Use bf16 whenever the GPU supports it
 "logging_steps": 1, # The logging steps
 "optim" :"adamw_8bit", # The optimizer
 "weight_decay" : 0.01, # The weight decay
 "lr_scheduler_type": "linear", # The learning rate scheduler
 "seed" : 42, # The seed
 "output_dir" : "outputs", # The output directory
 }
}

In [None]:
# Dataset settings: which Hugging Face dataset and split to load, the raw
# columns that feed the prompt, and the column holding the rendered prompt text.
config_dataset = {
    "training_dataset": dict(
        name="ruslanmv/ai-medical-dataset",    # dataset name (huggingface/datasets)
        split="train",                         # dataset split
        input_fields=["question", "context"],  # raw input columns
        input_field="text",                    # column passed to the trainer
    ),
}

In [None]:
# Loading the model and the tokenizer for the model.
# All four arguments are read from config["model_config"].
model, tokenizer = FastLanguageModel.from_pretrained(
 model_name = config.get("model_config").get("base_model"),
 max_seq_length = config.get("model_config").get("max_seq_length"),
 dtype = config.get("model_config").get("dtype"),
 load_in_4bit = config.get("model_config").get("load_in_4bit"),

)

In [None]:
# Setup for QLoRA/LoRA peft of the base model.
# Rebinds `model` to the PEFT-wrapped model; every setting comes from
# config["lora_config"] except random_state, which is hard-coded to 42.
model = FastLanguageModel.get_peft_model(
 model,
 r = config.get("lora_config").get("r"),
 target_modules = config.get("lora_config").get("target_modules"),
 lora_alpha = config.get("lora_config").get("lora_alpha"),
 lora_dropout = config.get("lora_config").get("lora_dropout"),
 bias = config.get("lora_config").get("bias"),
 use_gradient_checkpointing = config.get("lora_config").get("use_gradient_checkpointing"),
 random_state = 42,
 use_rslora = config.get("lora_config").get("use_rslora"),
 use_dora = config.get("lora_config").get("use_dora"),
 loftq_config = config.get("lora_config").get("loftq_config"),
)


In [None]:
# NOTE(review): AutoModelForCausalLM and BitsAndBytesConfig are imported here but
# not used anywhere in the visible part of this notebook.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
# Rebinds `tokenizer`, replacing the one returned by FastLanguageModel.from_pretrained above.
tokenizer = AutoTokenizer.from_pretrained(config.get("model_config").get("base_model"))


# NOTE(review): the prompt-formatting cell below also appends EOS_TOKEN by hand;
# confirm whether add_eos_token has any effect for this tokenizer class.
tokenizer.add_eos_token = True
# NOTE(review): id 0 is presumably an ordinary vocabulary entry for the Llama-3
# tokenizer rather than a dedicated padding token — confirm before relying on it.
tokenizer.pad_token_id = 0
# NOTE(review): left padding is the usual choice for generation; confirm it is intended for training.
tokenizer.padding_side = "left"

# Loading the training dataset (name and split come from config_dataset)
train_dataset = load_dataset(config_dataset.get("training_dataset").get("name"), split = config_dataset.get("training_dataset").get("split"))



In [None]:
# Select the first 100 rows of the dataset.
# Despite its name, `test_dataset` is used as the (small) training set for the
# trainer and the hyperparameter-search trials below.
test_dataset = train_dataset.select(range(100))

In [None]:
# Prompt template with two positional `{}` slots: the first receives the
# instruction (question), the second the response (context).
medical_prompt = """You are an AI Medical Assistant Chatbot, trained to answer medical questions. Below is an instruction that describes a task, paired with an response context. Write a response that appropriately completes the request.

### Instruction:
{}


### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; appended to every rendered prompt.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of question/context pairs into training prompts.

    Each entry of ``examples["question"]`` is paired positionally with the
    matching entry of ``examples["context"]``, substituted into
    ``medical_prompt``, and suffixed with ``EOS_TOKEN``.

    Args:
        examples: Batch mapping with parallel "question" and "context" sequences.

    Returns:
        A dict with the single key "text" holding the list of rendered prompts.
    """
    # The EOS suffix is required, otherwise generation will go on forever.
    rendered = [
        medical_prompt.format(question, answer) + EOS_TOKEN
        for question, answer in zip(examples["question"], examples["context"])
    ]
    return {"text": rendered}

# Add the rendered "text" column to the 100-row subset (batched map).
# Note: `test_dataset` is rebound to the mapped dataset.
test_dataset= test_dataset.map(formatting_prompts_func, batched = True,)



# Preview the second rendered prompt.
test_dataset['text'][1]

In [None]:
test_dataset

In [None]:
# Setting up the trainer for the model.
# The TrainingArguments values are read from `config` once, here, when the
# trainer is constructed; `seed` is hard-coded to 42 rather than read from
# config["training_config"]["seed"].
trainer_test = SFTTrainer(
 model = model,
 tokenizer = tokenizer,
 train_dataset = test_dataset,
 dataset_text_field = config_dataset.get("training_dataset").get("input_field"),
 max_seq_length = config.get("model_config").get("max_seq_length"),
 dataset_num_proc = 2,
 packing = False,
 args = TrainingArguments(
 per_device_train_batch_size = config.get("training_config").get("per_device_train_batch_size"),
 gradient_accumulation_steps = config.get("training_config").get("gradient_accumulation_steps"),
 warmup_steps = config.get("training_config").get("warmup_steps"),
 max_steps = config.get("training_config").get("max_steps"),
 num_train_epochs= config.get("training_config").get("num_train_epochs"),
 learning_rate = config.get("training_config").get("learning_rate"),
 fp16 = config.get("training_config").get("fp16"),
 bf16 = config.get("training_config").get("bf16"),
 logging_steps = config.get("training_config").get("logging_steps"),
 optim = config.get("training_config").get("optim"),
 weight_decay = config.get("training_config").get("weight_decay"),
 lr_scheduler_type = config.get("training_config").get("lr_scheduler_type"),
 seed = 42,
 output_dir = config.get("training_config").get("output_dir"),
 ),
)

## Method 1: Optuna

In [None]:
from optuna import create_study, Trial

# Candidate values per hyperparameter. The objective in this cell reads only the
# first and last entry of each list, as the lower and upper sampling bound.
search_space = dict(
    learning_rate=[1e-5, 5e-5, 1e-4, 2e-4],
    per_device_train_batch_size=[2, 4, 8],
    lora_alpha=[8, 16, 32],
)

def objective(trial):
    """Optuna objective: fine-tune with the trial's hyperparameters and return the training loss.

    Samples ``learning_rate``, ``per_device_train_batch_size`` and ``lora_alpha``
    between the first and last entries of the matching ``search_space`` lists and
    stores them in the global ``config``. A fresh LoRA model and ``SFTTrainer``
    are then built from ``config`` so the sampled values are actually used.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        The ``training_loss`` field of the ``TrainOutput`` returned by
        ``train()``, or ``float("inf")`` if loading or training raised.
    """
    import gc  # function-scope import: gc is only imported in a later cell

    model_cfg = config["model_config"]
    lora_cfg = config["lora_config"]
    training_cfg = config["training_config"]

    def bounds(name):
        # First and last entries of the search-space list act as (low, high).
        values = search_space[name]
        return values[0], values[-1]

    # Learning rates span more than an order of magnitude, so sample them on a log scale.
    training_cfg["learning_rate"] = trial.suggest_float(
        "learning_rate", *bounds("learning_rate"), log=True
    )
    training_cfg["per_device_train_batch_size"] = trial.suggest_int(
        "per_device_train_batch_size", *bounds("per_device_train_batch_size")
    )
    lora_cfg["lora_alpha"] = trial.suggest_int("lora_alpha", *bounds("lora_alpha"))

    trial_model = trial_trainer = None
    try:
        # Build a fresh adapter and trainer for every trial. The module-level
        # `trainer_test` captured its TrainingArguments when it was constructed,
        # so later edits to `config` never reach it, and reusing it would keep
        # training the same adapter weights from one trial to the next.
        trial_model, _ = FastLanguageModel.from_pretrained(
            model_name=model_cfg["base_model"],
            max_seq_length=model_cfg["max_seq_length"],
            dtype=model_cfg["dtype"],
            load_in_4bit=model_cfg["load_in_4bit"],
        )
        trial_model = FastLanguageModel.get_peft_model(
            trial_model,
            r=lora_cfg["r"],
            target_modules=lora_cfg["target_modules"],
            lora_alpha=lora_cfg["lora_alpha"],
            lora_dropout=lora_cfg["lora_dropout"],
            bias=lora_cfg["bias"],
            use_gradient_checkpointing=lora_cfg["use_gradient_checkpointing"],
            random_state=42,
            use_rslora=lora_cfg["use_rslora"],
            use_dora=lora_cfg["use_dora"],
            loftq_config=lora_cfg["loftq_config"],
        )
        trial_trainer = SFTTrainer(
            model=trial_model,
            tokenizer=tokenizer,
            train_dataset=test_dataset,
            dataset_text_field=config_dataset["training_dataset"]["input_field"],
            max_seq_length=model_cfg["max_seq_length"],
            dataset_num_proc=2,
            packing=False,
            args=TrainingArguments(
                per_device_train_batch_size=int(training_cfg["per_device_train_batch_size"]),
                gradient_accumulation_steps=int(training_cfg["gradient_accumulation_steps"]),
                warmup_steps=training_cfg["warmup_steps"],
                max_steps=training_cfg["max_steps"],
                num_train_epochs=training_cfg["num_train_epochs"],
                learning_rate=training_cfg["learning_rate"],
                fp16=training_cfg["fp16"],
                bf16=training_cfg["bf16"],
                logging_steps=training_cfg["logging_steps"],
                optim=training_cfg["optim"],
                weight_decay=training_cfg["weight_decay"],
                lr_scheduler_type=training_cfg["lr_scheduler_type"],
                seed=42,
                output_dir=training_cfg["output_dir"],
            ),
        )
        train_output = trial_trainer.train()
        # train() returns a TrainOutput named tuple; it cannot be indexed by a
        # string key, the loss lives in its `training_loss` field.
        return train_output.training_loss
    except Exception as exc:
        # Keep the study running, but make the failure visible instead of hiding it.
        print(f"Trial {trial.number} failed: {exc!r}")
        return float("inf")  # Assign a high value if training fails
    finally:
        # Drop this trial's model before the next one is loaded.
        trial_trainer = trial_model = None
        gc.collect()
        torch.cuda.empty_cache()

# Create a study that minimizes the value returned by `objective` and run it.
study = create_study(direction="minimize")
study.optimize(objective, n_trials=2) # Adjust the number of trials

# Access the best trial and its hyperparameters after optimization
best_trial = study.best_trial
best_params = best_trial.params

# Report the winning trial: its index, sampled parameters and objective value.
print("Best Trial:", best_trial.number)
print("Best Hyperparameters:", best_params)
print("Best Training Loss:", best_trial.value)


## Analyzing Hyperparameters:

* **Batch Size**: Generally, increasing the batch size can improve training speed by utilizing hardware resources more efficiently. However, there's a limit beyond which performance degrades. You can tune the batch size within a reasonable range (e.g., 2, 4, 8, 16) to see its impact.
* **Learning Rate**: A higher learning rate can accelerate training initially. But, a too high value can lead to unstable training and potentially slower convergence. Consider a range of learning rates (e.g., log-uniform distribution between 1e-5 and 1e-3) for exploration.
* **Gradient Accumulation Steps**: This technique accumulates gradients over multiple batches before updating model weights. It can help reduce memory requirements but might slow down training per epoch. Experiment with different accumulation steps (e.g., 1, 2, 4) to find a balance.
* **Optimizer Choice**: Some optimizers like Adam or SGD with momentum can be faster than others depending on the model and dataset. Explore different optimizers and their hyperparameters (e.g., momentum coefficient) to see if they lead to faster convergence.
## Additional Considerations:

* **Early Stopping**: Implement early stopping to automatically terminate training if the validation loss doesn't improve for a certain number of epochs. This can save training time if the model starts overfitting.
* **Warmup Steps**: A gradual increase in the learning rate during the initial training phase (warmup steps) can improve stability and potentially accelerate convergence compared to a fixed learning rate from the beginning.


* Experimentation and Profiling:

The best hyperparameters for faster training depend on your specific model, dataset, and hardware. You'll need to experiment with different configurations using tools like Hyperopt to find the optimal settings.
Consider using profiling tools to identify bottlenecks in your training pipeline. This can help you focus on optimizing specific parts of the training process that are most time-consuming.
By analyzing these hyperparameters and implementing techniques like early stopping and warmup steps, you can potentially achieve faster fine-tuning while maintaining good model performance.

In [None]:
## Method 1b Speed

In [None]:
from optuna import create_study, Trial
import time # Assuming you can use time.time() to measure training time

# Candidate values per hyperparameter, now including gradient accumulation.
# The objective in this cell reads only the first and last entry of each list,
# as the lower and upper sampling bound.
search_space = dict(
    learning_rate=[1e-5, 5e-5, 1e-4, 2e-4],
    per_device_train_batch_size=[2, 4, 8],
    lora_alpha=[8, 16, 32],
    gradient_accumulation_steps=[1, 2, 4, 8],  # Added gradient accumulation steps
)

def objective(trial):
    """Optuna objective: fine-tune with the trial's hyperparameters and return the training time.

    Samples ``learning_rate``, ``per_device_train_batch_size``,
    ``gradient_accumulation_steps`` and ``lora_alpha`` between the first and last
    entries of the matching ``search_space`` lists and stores them in the global
    ``config``. A fresh LoRA model and ``SFTTrainer`` are then built from
    ``config`` so the sampled values are actually used.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        Wall-clock seconds spent inside ``train()``, or ``float("inf")`` if
        loading or training raised.
    """
    import gc  # function-scope import: gc is only imported in a later cell

    model_cfg = config["model_config"]
    lora_cfg = config["lora_config"]
    training_cfg = config["training_config"]

    def bounds(name):
        # First and last entries of the search-space list act as (low, high).
        values = search_space[name]
        return values[0], values[-1]

    # Learning rates span more than an order of magnitude, so sample them on a log scale.
    training_cfg["learning_rate"] = trial.suggest_float(
        "learning_rate", *bounds("learning_rate"), log=True
    )
    training_cfg["per_device_train_batch_size"] = trial.suggest_int(
        "per_device_train_batch_size", *bounds("per_device_train_batch_size")
    )
    training_cfg["gradient_accumulation_steps"] = trial.suggest_int(
        "gradient_accumulation_steps", *bounds("gradient_accumulation_steps")
    )
    lora_cfg["lora_alpha"] = trial.suggest_int("lora_alpha", *bounds("lora_alpha"))

    trial_model = trial_trainer = None
    try:
        # Build a fresh adapter and trainer for every trial. The module-level
        # `trainer_test` captured its TrainingArguments when it was constructed,
        # so later edits to `config` never reach it and every trial would time
        # the same configuration.
        trial_model, _ = FastLanguageModel.from_pretrained(
            model_name=model_cfg["base_model"],
            max_seq_length=model_cfg["max_seq_length"],
            dtype=model_cfg["dtype"],
            load_in_4bit=model_cfg["load_in_4bit"],
        )
        trial_model = FastLanguageModel.get_peft_model(
            trial_model,
            r=lora_cfg["r"],
            target_modules=lora_cfg["target_modules"],
            lora_alpha=lora_cfg["lora_alpha"],
            lora_dropout=lora_cfg["lora_dropout"],
            bias=lora_cfg["bias"],
            use_gradient_checkpointing=lora_cfg["use_gradient_checkpointing"],
            random_state=42,
            use_rslora=lora_cfg["use_rslora"],
            use_dora=lora_cfg["use_dora"],
            loftq_config=lora_cfg["loftq_config"],
        )
        trial_trainer = SFTTrainer(
            model=trial_model,
            tokenizer=tokenizer,
            train_dataset=test_dataset,
            dataset_text_field=config_dataset["training_dataset"]["input_field"],
            max_seq_length=model_cfg["max_seq_length"],
            dataset_num_proc=2,
            packing=False,
            args=TrainingArguments(
                per_device_train_batch_size=int(training_cfg["per_device_train_batch_size"]),
                gradient_accumulation_steps=int(training_cfg["gradient_accumulation_steps"]),
                warmup_steps=training_cfg["warmup_steps"],
                max_steps=training_cfg["max_steps"],
                num_train_epochs=training_cfg["num_train_epochs"],
                learning_rate=training_cfg["learning_rate"],
                fp16=training_cfg["fp16"],
                bf16=training_cfg["bf16"],
                logging_steps=training_cfg["logging_steps"],
                optim=training_cfg["optim"],
                weight_decay=training_cfg["weight_decay"],
                lr_scheduler_type=training_cfg["lr_scheduler_type"],
                seed=42,
                output_dir=training_cfg["output_dir"],
            ),
        )
        # Time only the training call, not model loading or trainer construction.
        start_time = time.time()
        trial_trainer.train()
        return time.time() - start_time  # Minimize training time
    except Exception as exc:
        # Keep the study running, but make the failure visible instead of hiding it.
        print(f"Trial {trial.number} failed: {exc!r}")
        return float("inf")  # Assign a high value if training fails
    finally:
        # Drop this trial's model before the next one is loaded.
        trial_trainer = trial_model = None
        gc.collect()
        torch.cuda.empty_cache()

# Create a study that minimizes the value returned by `objective` (training time) and run it.
study = create_study(direction="minimize")
study.optimize(objective, n_trials=2) # Adjust the number of trials

# Access the best trial and its hyperparameters after optimization
best_trial = study.best_trial
best_params = best_trial.params

# Report the winning trial: its index, sampled parameters and objective value.
print("Best Trial:", best_trial.number)
print("Best Hyperparameters (Likely Fastest):", best_params)
print("Best Training Time:", best_trial.value, "seconds")

In [None]:
import hyperopt
from hyperopt import hp
from hyperopt import Trials
from hyperopt import fmin, tpe, Trials
# Define the search space for hyperparameters
import math  # used only to express the learning-rate bounds below

space = {
    # hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
    # must be natural logarithms of the intended range. The previous (-5, -1)
    # produced learning rates of roughly 0.0067 to 0.37; this covers 1e-5 to 1e-3.
    'learning_rate': hp.loguniform('learning_rate', math.log(1e-5), math.log(1e-3)),
    'lora_alpha': hp.quniform('lora_alpha', 1, 32, 1),  # LoRA alpha with quantized steps (float-valued)
    'lora_dropout': hp.uniform('lora_dropout', 0, 0.5),  # LoRA dropout rate
    # Uncomment these if you want to tune them
    # 'per_device_train_batch_size': hp.quniform('per_device_train_batch_size', 2, 16, 1),
    # 'gradient_accumulation_steps': hp.quniform('gradient_accumulation_steps', 1, 8, 1),
    # 'warmup_steps': hp.quniform('warmup_steps', 0, 1000, 1),
    # 'num_train_epochs': hp.quniform('num_train_epochs', 1, 5, 1),
}
def objective(params):
    """Hyperopt objective: fine-tune a fresh LoRA model with ``params`` and return the training loss.

    ``params`` must provide ``learning_rate``, ``lora_alpha`` and ``lora_dropout``.
    ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
    ``warmup_steps`` and ``num_train_epochs`` are optional: when the search space
    does not tune them, the values in ``config["training_config"]`` are used.

    Args:
        params: Mapping of hyperparameter name to the value sampled by hyperopt.

    Returns:
        The ``training_loss`` field of the ``TrainOutput`` returned by
        ``trainer.train()``, or ``float("inf")`` if model loading, LoRA setup or
        training raised.
    """
    import gc  # function-scope import: gc is only imported in a later cell

    model_cfg = config["model_config"]
    lora_cfg = config["lora_config"]
    training_cfg = config["training_config"]

    def int_setting(name):
        # hp.quniform yields floats such as 13.0, but these settings must be
        # integers; settings absent from the search space come from config.
        return int(round(params.get(name, training_cfg[name])))

    learning_rate = params["learning_rate"]
    lora_alpha = int(round(params["lora_alpha"]))
    lora_dropout = params["lora_dropout"]
    batch_size = int_setting("per_device_train_batch_size")
    grad_accum_steps = int_setting("gradient_accumulation_steps")
    warmup_steps = int_setting("warmup_steps")
    num_train_epochs = int_setting("num_train_epochs")

    # Set hyperparameters in the config dictionary
    training_cfg["learning_rate"] = learning_rate
    lora_cfg["lora_alpha"] = lora_alpha
    lora_cfg["lora_dropout"] = lora_dropout

    trial_model = trainer = None
    try:
        # Load the model and tokenizer
        try:
            trial_model, trial_tokenizer = FastLanguageModel.from_pretrained(
                model_name=model_cfg["base_model"],
                max_seq_length=model_cfg["max_seq_length"],
                dtype=model_cfg["dtype"],
                load_in_4bit=model_cfg["load_in_4bit"],
            )
        except Exception as e:
            print(f"Error loading model and tokenizer: {e}")
            return float("inf")  # Return high value for errors

        # Setup LoRA for the model
        try:
            trial_model = FastLanguageModel.get_peft_model(
                trial_model,
                r=lora_cfg["r"],
                target_modules=lora_cfg["target_modules"],
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                bias=lora_cfg["bias"],
                use_gradient_checkpointing=lora_cfg["use_gradient_checkpointing"],
                random_state=42,
                use_rslora=lora_cfg["use_rslora"],
                use_dora=lora_cfg["use_dora"],
                loftq_config=lora_cfg["loftq_config"],
            )
        except Exception as e:
            print(f"Error setting up LoRA: {e}")
            return float("inf")  # Return high value for errors

        # Train the model on the 100-row subset
        try:
            trainer = SFTTrainer(
                model=trial_model,
                tokenizer=trial_tokenizer,
                train_dataset=test_dataset,
                dataset_text_field=config_dataset["training_dataset"]["input_field"],
                max_seq_length=model_cfg["max_seq_length"],
                dataset_num_proc=2,
                packing=False,
                args=TrainingArguments(
                    per_device_train_batch_size=batch_size,
                    gradient_accumulation_steps=grad_accum_steps,
                    warmup_steps=warmup_steps,
                    max_steps=training_cfg["max_steps"],
                    num_train_epochs=num_train_epochs,
                    learning_rate=learning_rate,
                    fp16=training_cfg["fp16"],
                    bf16=training_cfg["bf16"],
                    logging_steps=training_cfg["logging_steps"],
                    optim=training_cfg["optim"],
                    weight_decay=training_cfg["weight_decay"],
                    lr_scheduler_type=training_cfg["lr_scheduler_type"],
                    seed=42,
                    output_dir=training_cfg["output_dir"],
                ),
            )
            trainer_stats = trainer.train()
            # TrainOutput has no `loss` attribute; the value is `training_loss`.
            return trainer_stats.training_loss
        except Exception as e:
            print(f"Error during training: {e}")
            return float("inf")  # Return high value for failed trials
    finally:
        # Drop this trial's model before the next one is loaded.
        trainer = trial_model = None
        gc.collect()
        torch.cuda.empty_cache()

# Create a Trials object to track hyperparameter evaluations
trials = Trials()

# Run hyperparameter optimization using TPE algorithm (max_evals=2: only two trials are evaluated)
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=2)

# Print the best hyperparameters found during optimization
print("Best Hyperparameters:", best)


In [None]:
import hyperopt
from hyperopt import hp
from hyperopt import Trials
from hyperopt import fmin, tpe, Trials

# Define the search space for hyperparameters with uncommented additions
import math  # used only to express the learning-rate bounds below

space = {
    # hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
    # must be natural logarithms of the intended range. The previous (-5, -1)
    # produced learning rates of roughly 0.0067 to 0.37; this covers 1e-5 to 1e-3.
    'learning_rate': hp.loguniform('learning_rate', math.log(1e-5), math.log(1e-3)),
    'lora_alpha': hp.quniform('lora_alpha', 1, 32, 1),  # LoRA alpha with quantized steps (float-valued)
    'lora_dropout': hp.uniform('lora_dropout', 0, 0.5),  # LoRA dropout rate
    'per_device_train_batch_size': hp.quniform('per_device_train_batch_size', 2, 16, 1),  # Added for exploration
    'gradient_accumulation_steps': hp.quniform('gradient_accumulation_steps', 1, 8, 1),  # Added for exploration
    # Uncomment these if you want to tune other hyperparameters
    # 'warmup_steps': hp.quniform('warmup_steps', 0, 1000, 1),
    # 'num_train_epochs': hp.quniform('num_train_epochs', 1, 5, 1),
}


def objective(params):
    """Hyperopt objective: fine-tune a fresh LoRA model with ``params`` and return the training loss.

    ``params`` must provide ``learning_rate``, ``lora_alpha`` and ``lora_dropout``.
    ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
    ``warmup_steps`` and ``num_train_epochs`` are optional: when the search space
    does not tune them, the values in ``config["training_config"]`` are used.

    Args:
        params: Mapping of hyperparameter name to the value sampled by hyperopt.

    Returns:
        The ``training_loss`` field of the ``TrainOutput`` returned by
        ``trainer.train()``, or ``float("inf")`` if model loading, LoRA setup or
        training raised.
    """
    import gc  # function-scope import: gc is only imported in a later cell

    model_cfg = config["model_config"]
    lora_cfg = config["lora_config"]
    training_cfg = config["training_config"]

    def int_setting(name):
        # hp.quniform yields floats such as 13.0, but these settings must be
        # integers; settings absent from the search space come from config.
        return int(round(params.get(name, training_cfg[name])))

    learning_rate = params["learning_rate"]
    lora_alpha = int(round(params["lora_alpha"]))
    lora_dropout = params["lora_dropout"]
    batch_size = int_setting("per_device_train_batch_size")
    grad_accum_steps = int_setting("gradient_accumulation_steps")
    warmup_steps = int_setting("warmup_steps")
    num_train_epochs = int_setting("num_train_epochs")

    # Set hyperparameters in the config dictionary
    training_cfg["learning_rate"] = learning_rate
    lora_cfg["lora_alpha"] = lora_alpha
    lora_cfg["lora_dropout"] = lora_dropout
    training_cfg["per_device_train_batch_size"] = batch_size
    training_cfg["gradient_accumulation_steps"] = grad_accum_steps

    trial_model = trainer = None
    try:
        # Load the model and tokenizer
        try:
            trial_model, trial_tokenizer = FastLanguageModel.from_pretrained(
                model_name=model_cfg["base_model"],
                max_seq_length=model_cfg["max_seq_length"],
                dtype=model_cfg["dtype"],
                load_in_4bit=model_cfg["load_in_4bit"],
            )
        except Exception as e:
            print(f"Error loading model and tokenizer: {e}")
            return float("inf")  # Return high value for errors

        # Setup LoRA for the model
        try:
            trial_model = FastLanguageModel.get_peft_model(
                trial_model,
                r=lora_cfg["r"],
                target_modules=lora_cfg["target_modules"],
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                bias=lora_cfg["bias"],
                use_gradient_checkpointing=lora_cfg["use_gradient_checkpointing"],
                random_state=42,
                use_rslora=lora_cfg["use_rslora"],
                use_dora=lora_cfg["use_dora"],
                loftq_config=lora_cfg["loftq_config"],
            )
        except Exception as e:
            print(f"Error setting up LoRA: {e}")
            return float("inf")  # Return high value for errors

        # Train the model on the 100-row subset
        try:
            trainer = SFTTrainer(
                model=trial_model,
                tokenizer=trial_tokenizer,
                train_dataset=test_dataset,
                dataset_text_field=config_dataset["training_dataset"]["input_field"],
                max_seq_length=model_cfg["max_seq_length"],
                dataset_num_proc=2,
                packing=False,
                args=TrainingArguments(
                    per_device_train_batch_size=batch_size,
                    gradient_accumulation_steps=grad_accum_steps,
                    warmup_steps=warmup_steps,
                    max_steps=training_cfg["max_steps"],
                    num_train_epochs=num_train_epochs,
                    learning_rate=learning_rate,
                    fp16=training_cfg["fp16"],
                    bf16=training_cfg["bf16"],
                    logging_steps=training_cfg["logging_steps"],
                    optim=training_cfg["optim"],
                    weight_decay=training_cfg["weight_decay"],
                    lr_scheduler_type=training_cfg["lr_scheduler_type"],
                    seed=42,
                    output_dir=training_cfg["output_dir"],
                ),
            )
            trainer_stats = trainer.train()
            # TrainOutput has no `loss` attribute; the value is `training_loss`.
            return trainer_stats.training_loss
        except Exception as e:
            print(f"Error during training: {e}")
            return float("inf")  # Return high value for failed trials
    finally:
        # Drop this trial's model before the next one is loaded.
        trainer = trial_model = None
        gc.collect()
        torch.cuda.empty_cache()

# Create a Trials object to track hyperparameter evaluations
trials = Trials()

# Run hyperparameter optimization using TPE algorithm (max_evals=2: only two trials are evaluated)
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=2)

# Print the best hyperparameters found during optimization
print("Best Hyperparameters:", best)


In [None]:
## Method

In [None]:
import hyperopt
from hyperopt import hp
from hyperopt import Trials
from hyperopt import fmin, tpe, Trials
import time # Import time for measuring training duration

# Define the search space for hyperparameters with uncommented additions
import math  # used only to express the learning-rate bounds below

space = {
    # hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
    # must be natural logarithms of the intended range. The previous (-5, -1)
    # produced learning rates of roughly 0.0067 to 0.37; this covers 1e-5 to 1e-3.
    'learning_rate': hp.loguniform('learning_rate', math.log(1e-5), math.log(1e-3)),
    'lora_alpha': hp.quniform('lora_alpha', 1, 32, 1),  # LoRA alpha with quantized steps (float-valued)
    'lora_dropout': hp.uniform('lora_dropout', 0, 0.5),  # LoRA dropout rate
    'per_device_train_batch_size': hp.quniform('per_device_train_batch_size', 2, 16, 1),  # Added for exploration
    'gradient_accumulation_steps': hp.quniform('gradient_accumulation_steps', 1, 8, 1),  # Added for exploration
    # Uncomment these if you want to tune other hyperparameters
    # 'warmup_steps': hp.quniform('warmup_steps', 0, 1000, 1),
    # 'num_train_epochs': hp.quniform('num_train_epochs', 1, 5, 1),
}


def objective(params):
    """Hyperopt objective: fine-tune a fresh LoRA model with ``params`` and return the training time.

    ``params`` must provide ``learning_rate``, ``lora_alpha`` and ``lora_dropout``.
    ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
    ``warmup_steps`` and ``num_train_epochs`` are optional: when the search space
    does not tune them, the values in ``config["training_config"]`` are used.

    Args:
        params: Mapping of hyperparameter name to the value sampled by hyperopt.

    Returns:
        Wall-clock seconds from just before the ``SFTTrainer`` is constructed
        until ``trainer.train()`` returns, or ``float("inf")`` if model loading,
        LoRA setup or training raised.
    """
    import gc  # function-scope import: gc is only imported in a later cell

    model_cfg = config["model_config"]
    lora_cfg = config["lora_config"]
    training_cfg = config["training_config"]

    def int_setting(name):
        # hp.quniform yields floats such as 13.0, but these settings must be
        # integers; settings absent from the search space come from config.
        return int(round(params.get(name, training_cfg[name])))

    learning_rate = params["learning_rate"]
    lora_alpha = int(round(params["lora_alpha"]))
    lora_dropout = params["lora_dropout"]
    batch_size = int_setting("per_device_train_batch_size")
    grad_accum_steps = int_setting("gradient_accumulation_steps")
    warmup_steps = int_setting("warmup_steps")
    num_train_epochs = int_setting("num_train_epochs")

    # Set hyperparameters in the config dictionary
    training_cfg["learning_rate"] = learning_rate
    lora_cfg["lora_alpha"] = lora_alpha
    lora_cfg["lora_dropout"] = lora_dropout
    training_cfg["per_device_train_batch_size"] = batch_size
    training_cfg["gradient_accumulation_steps"] = grad_accum_steps

    trial_model = trainer = None
    try:
        # Load the model and tokenizer
        try:
            trial_model, trial_tokenizer = FastLanguageModel.from_pretrained(
                model_name=model_cfg["base_model"],
                max_seq_length=model_cfg["max_seq_length"],
                dtype=model_cfg["dtype"],
                load_in_4bit=model_cfg["load_in_4bit"],
            )
        except Exception as e:
            print(f"Error loading model and tokenizer: {e}")
            return float("inf")  # Return high value for errors

        # Setup LoRA for the model
        try:
            trial_model = FastLanguageModel.get_peft_model(
                trial_model,
                r=lora_cfg["r"],
                target_modules=lora_cfg["target_modules"],
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                bias=lora_cfg["bias"],
                use_gradient_checkpointing=lora_cfg["use_gradient_checkpointing"],
                random_state=42,
                use_rslora=lora_cfg["use_rslora"],
                use_dora=lora_cfg["use_dora"],
                loftq_config=lora_cfg["loftq_config"],
            )
        except Exception as e:
            print(f"Error setting up LoRA: {e}")
            return float("inf")  # Return high value for errors

        # Train the model on the 100-row subset
        try:
            start_time = time.time()  # Measure training start time
            trainer = SFTTrainer(
                model=trial_model,
                tokenizer=trial_tokenizer,
                train_dataset=test_dataset,
                dataset_text_field=config_dataset["training_dataset"]["input_field"],
                max_seq_length=model_cfg["max_seq_length"],
                dataset_num_proc=2,
                packing=False,
                args=TrainingArguments(
                    per_device_train_batch_size=batch_size,
                    gradient_accumulation_steps=grad_accum_steps,
                    warmup_steps=warmup_steps,
                    max_steps=training_cfg["max_steps"],
                    num_train_epochs=num_train_epochs,
                    learning_rate=learning_rate,
                    fp16=training_cfg["fp16"],
                    bf16=training_cfg["bf16"],
                    logging_steps=training_cfg["logging_steps"],
                    optim=training_cfg["optim"],
                    weight_decay=training_cfg["weight_decay"],
                    lr_scheduler_type=training_cfg["lr_scheduler_type"],
                    seed=42,
                    output_dir=training_cfg["output_dir"],
                ),
            )
            trainer.train()
            training_time = time.time() - start_time  # Calculate training time

            return training_time  # Return training time for minimization
        except Exception as e:
            print(f"Error during training: {e}")
            return float("inf")  # Return high value for failed trials
    finally:
        # Drop this trial's model before the next one is loaded.
        trainer = trial_model = None
        gc.collect()
        torch.cuda.empty_cache()

# Create a Trials object to track hyperparameter evaluations
trials = Trials()

# Run hyperparameter optimization using TPE algorithm (max_evals=2: only two trials are evaluated)
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=2)




In [None]:
# Print the best hyperparameters found during optimization
# (`best` is the value returned by the fmin call in the previous cell)
print("Best Hyperparameters:", best)

# Hyperparameter search
**Step 1: Define the Hyperparameter Search Space**
We need to define the search space for the hyperparameters we want to tune. For example, let's say we want to tune the following hyperparameters:

* `learning_rate`
* `per_device_train_batch_size`
* `gradient_accumulation_steps`
* `warmup_steps`
* `num_train_epochs`
* `lora_alpha`
* `lora_dropout`

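We can define the search space as follows. The cell below enables only `learning_rate`, `lora_alpha`, and `lora_dropout`; the remaining entries are commented out and can be enabled as needed: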

In [None]:
import hyperopt
from hyperopt import hp
from hyperopt import Trials
from hyperopt import fmin, tpe, Trials
# Define the search space for hyperparameters
import math  # used only to express the learning-rate bounds below

space = {
    # hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
    # must be natural logarithms of the intended range. The previous (-5, -1)
    # produced learning rates of roughly 0.0067 to 0.37; this covers 1e-5 to 1e-3.
    'learning_rate': hp.loguniform('learning_rate', math.log(1e-5), math.log(1e-3)),
    'lora_alpha': hp.quniform('lora_alpha', 1, 32, 1),  # LoRA alpha with quantized steps (float-valued)
    'lora_dropout': hp.uniform('lora_dropout', 0, 0.5),  # LoRA dropout rate
    # Uncomment these if you want to tune them
    # 'per_device_train_batch_size': hp.quniform('per_device_train_batch_size', 2, 16, 1),
    # 'gradient_accumulation_steps': hp.quniform('gradient_accumulation_steps', 1, 8, 1),
    # 'warmup_steps': hp.quniform('warmup_steps', 0, 1000, 1),
    # 'num_train_epochs': hp.quniform('num_train_epochs', 1, 5, 1),
}

**Step 2. Define the Objective Function**

The objective function is a function that takes in the hyperparameters, sets them in the `config` dictionary, trains the model, and returns the loss or metric to minimize. We need to modify the previous fine-tuning code to define the objective function.

In [None]:
def objective(params):
    """Hyperopt objective: fine-tune a fresh LoRA model with ``params`` and return the training loss.

    ``params`` must provide ``learning_rate``, ``lora_alpha`` and ``lora_dropout``.
    ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
    ``warmup_steps`` and ``num_train_epochs`` are optional: when the search space
    does not tune them, the values in ``config["training_config"]`` are used.

    Args:
        params: Mapping of hyperparameter name to the value sampled by hyperopt.

    Returns:
        The ``training_loss`` field of the ``TrainOutput`` returned by
        ``trainer.train()``, or ``float("inf")`` if model loading, LoRA setup or
        training raised.
    """
    import gc  # function-scope import: gc is only imported in a later cell

    model_cfg = config["model_config"]
    lora_cfg = config["lora_config"]
    training_cfg = config["training_config"]

    def int_setting(name):
        # hp.quniform yields floats such as 13.0, but these settings must be
        # integers; settings absent from the search space come from config.
        return int(round(params.get(name, training_cfg[name])))

    learning_rate = params["learning_rate"]
    lora_alpha = int(round(params["lora_alpha"]))
    lora_dropout = params["lora_dropout"]
    batch_size = int_setting("per_device_train_batch_size")
    grad_accum_steps = int_setting("gradient_accumulation_steps")
    warmup_steps = int_setting("warmup_steps")
    num_train_epochs = int_setting("num_train_epochs")

    # Set hyperparameters in the config dictionary
    training_cfg["learning_rate"] = learning_rate
    lora_cfg["lora_alpha"] = lora_alpha
    lora_cfg["lora_dropout"] = lora_dropout

    trial_model = trainer = None
    try:
        # Load the model and tokenizer
        try:
            trial_model, trial_tokenizer = FastLanguageModel.from_pretrained(
                model_name=model_cfg["base_model"],
                max_seq_length=model_cfg["max_seq_length"],
                dtype=model_cfg["dtype"],
                load_in_4bit=model_cfg["load_in_4bit"],
            )
        except Exception as e:
            print(f"Error loading model and tokenizer: {e}")
            return float("inf")  # Return high value for errors

        # Setup LoRA for the model
        try:
            trial_model = FastLanguageModel.get_peft_model(
                trial_model,
                r=lora_cfg["r"],
                target_modules=lora_cfg["target_modules"],
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                bias=lora_cfg["bias"],
                use_gradient_checkpointing=lora_cfg["use_gradient_checkpointing"],
                random_state=42,
                use_rslora=lora_cfg["use_rslora"],
                use_dora=lora_cfg["use_dora"],
                loftq_config=lora_cfg["loftq_config"],
            )
        except Exception as e:
            print(f"Error setting up LoRA: {e}")
            return float("inf")  # Return high value for errors

        # Train the model on the 100-row subset
        try:
            trainer = SFTTrainer(
                model=trial_model,
                tokenizer=trial_tokenizer,
                train_dataset=test_dataset,
                dataset_text_field=config_dataset["training_dataset"]["input_field"],
                max_seq_length=model_cfg["max_seq_length"],
                dataset_num_proc=2,
                packing=False,
                args=TrainingArguments(
                    per_device_train_batch_size=batch_size,
                    gradient_accumulation_steps=grad_accum_steps,
                    warmup_steps=warmup_steps,
                    max_steps=training_cfg["max_steps"],
                    num_train_epochs=num_train_epochs,
                    learning_rate=learning_rate,
                    fp16=training_cfg["fp16"],
                    bf16=training_cfg["bf16"],
                    logging_steps=training_cfg["logging_steps"],
                    optim=training_cfg["optim"],
                    weight_decay=training_cfg["weight_decay"],
                    lr_scheduler_type=training_cfg["lr_scheduler_type"],
                    seed=42,
                    output_dir=training_cfg["output_dir"],
                ),
            )
            trainer_stats = trainer.train()
            # TrainOutput has no `loss` attribute; the value is `training_loss`.
            return trainer_stats.training_loss
        except Exception as e:
            print(f"Error during training: {e}")
            return float("inf")  # Return high value for failed trials
    finally:
        # Drop this trial's model before the next one is loaded.
        trainer = trial_model = None
        gc.collect()
        torch.cuda.empty_cache()



**Step 3: Perform Hyperparameter Search**

Now that we have defined the objective function, we can perform the hyperparameter search using Hyperopt's `fmin` function. We need to specify the objective function, the search space, and the maximum number of evaluations.

In [None]:

# Create a Trials object to track hyperparameter evaluations
trials = Trials()
# Run hyperparameter optimization using TPE algorithm (max_evals=2: only two trials are evaluated)
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=2)
# Print the best hyperparameters found during optimization
print("Best Hyperparameters:", best)

In [None]:
from huggingface_hub import login, logout

In [None]:
#login(token) # non-blocking login

In [None]:
import torch
import gc
def reset_gpu_memory():
    """Release GPU memory that PyTorch's caching allocator no longer needs.

    The garbage collector runs first so that tensors kept alive only by
    unreachable Python objects are freed before the cache is emptied.
    Emptying the cache first would leave the memory of those tensors cached.

    Returns:
        None. A confirmation message is printed.
    """
    gc.collect()
    torch.cuda.empty_cache()
    print("GPU memory cleared!")


# Example usage:
reset_gpu_memory()

Best Hyperparameters: {'learning_rate': 0.03347123299210303, 'lora_alpha': 19.0, 'lora_dropout': 0.4819141472093197}

Best Hyperparameters: {'gradient_accumulation_steps': 8.0, 'learning_rate': 0.23274337759179295, 'lora_alpha': 8.0, 'lora_dropout': 0.0491660925212421, 'per_device_train_batch_size': 13.0}

Best Hyperparameters: {'gradient_accumulation_steps': 4.0, 'learning_rate': 0.186066529001672, 'lora_alpha': 32.0, 'lora_dropout': 0.24368804023352264, 'per_device_train_batch_size': 10.0}

Best Hyperparameters: {'learning_rate': 0.011846192509972951, 'lora_alpha': 8.0, 'lora_dropout': 0.2087248476879589}



Best Hyperparameters (Likely Fastest): {'learning_rate': 1.881999040862022e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 2, 'lora_alpha': 29}
Best Training Time: 48.178661584854126 seconds


In [None]:
# Configuration for the base model, the LoRA adapters and the training run.
# Entries marked "from the hyperparameter search" use the values reported in the
# "Best Hyperparameters (Likely Fastest)" result recorded earlier in this notebook.
config = {
 "hugging_face_username":"ruslanmv",
 "model_config": {
 "base_model":"meta-llama/Meta-Llama-3-8B-Instruct", # The base model to fine-tune
 "finetuned_model":"ruslanmv/Medical-Mind-Llama-3-8b", # Name under which the fine-tuned model is saved and pushed
 "max_seq_length": 2048, # The maximum sequence length
 # "dtype":torch.float16, # Alternative: force float16
 # "dtype": torch.float32, # Alternative: use float32 instead of half precision for CUDA capability < 8
 "dtype" : None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

 "load_in_4bit": True, # Load the model in 4-bit
 },
 "lora_config": {
 "r": 16, # The LoRA rank (typical values: 8, 16, 32, 64)
 "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
 "gate_proj", "up_proj", "down_proj"], # The modules that receive LoRA adapters
 #"lora_alpha":16, # Previous value of the LoRA alpha
 "lora_alpha":29, # The alpha value for LoRA (from the hyperparameter search)
 "lora_dropout":0, # The dropout value for LoRA
 "bias":"none", # The bias for LoRA
 "use_gradient_checkpointing":True, # Use gradient checkpointing
 "use_rslora":False, # Use RSLora
 "use_dora":False, # Use DoRa
 "loftq_config":None # The LoFTQ configuration
 },

 "training_config": {
 #"per_device_train_batch_size": 2, # Previous value of the batch size
 "per_device_train_batch_size": 2, # The batch size (from the hyperparameter search)
 #"gradient_accumulation_steps": 4, # Previous value of the gradient accumulation steps
 "gradient_accumulation_steps": 2, # The gradient accumulation steps (from the hyperparameter search)
 "warmup_steps": 5, # The warmup steps
 "max_steps":0, # The maximum steps (0 if the epochs are defined)
 "num_train_epochs": 1, # The number of training epochs (0 if the maximum steps are defined)
 #"learning_rate": 2e-4, # Previous value of the learning rate
 "learning_rate": 1.88e-05, # The learning rate (from the hyperparameter search)
 "fp16": not torch.cuda.is_bf16_supported(), # Use fp16 only when bf16 is not supported
 "bf16": torch.cuda.is_bf16_supported(), # Use bf16 whenever the GPU supports it
 "logging_steps": 1, # The logging steps
 "optim" :"adamw_8bit", # The optimizer
 "weight_decay" : 0.01, # The weight decay
 "lr_scheduler_type": "linear", # The learning rate scheduler
 "seed" : 42, # The seed
 "output_dir" : "outputs", # The output directory
 }
}

In [None]:
# Load the base model together with its tokenizer.
# Apart from the model name, every loader option is read from config["model_config"].
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=config.get("model_config").get("base_model"),
    **{
        option: config.get("model_config").get(option)
        for option in ("max_seq_length", "dtype", "load_in_4bit")
    },
)

In [None]:
# Report whether more than one GPU is visible to PyTorch.
# NOTE: the DataParallel wrapper is created and unwrapped again right away, so
# `model` refers to the same underlying module afterwards; training below does
# not go through `model_parallel`.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Multiple GPUs enabled")
    devices = [f"cuda:{i}" for i in range(gpu_count)]
    # Cover every detected GPU rather than a fixed pair of device ids
    model_parallel = torch.nn.DataParallel(model, device_ids=list(range(gpu_count)))
    # Access the original model from the DataParallel object
    model = model_parallel.module
else:
    print("No DataParallel ")


In [None]:
#model = model.half() # the model to half precision (float16)

In [None]:
# Attach the LoRA/QLoRA adapters to the base model.
# All adapter options are read from config["lora_config"]; only the random state is fixed here.
model = FastLanguageModel.get_peft_model(
    model,
    random_state=42,
    **{
        option: config.get("lora_config").get(option)
        for option in (
            "r",
            "target_modules",
            "lora_alpha",
            "lora_dropout",
            "bias",
            "use_gradient_checkpointing",
            "use_rslora",
            "use_dora",
            "loftq_config",
        )
    },
)


In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(config.get("model_config").get("base_model"))

In [None]:
# Tokenizer settings used for fine-tuning.
tokenizer.add_eos_token = True

# Pad with a dedicated reserved token instead of token id 0.
# Id 0 is an ordinary vocabulary entry in the Llama 3 tokenizer, and the default
# language-modeling collator replaces every label equal to pad_token_id with -100.
# Padding with id 0 would therefore also drop real occurrences of that token from the loss.
LLAMA3_PAD_TOKEN = "<|reserved_special_token_250|>"
if LLAMA3_PAD_TOKEN in tokenizer.get_vocab():
    tokenizer.pad_token = LLAMA3_PAD_TOKEN
else:
    # Fallback for tokenizers that do not define the reserved token
    tokenizer.pad_token_id = 0

tokenizer.padding_side = "left"

In [None]:
# Description of the training dataset and of the columns used in this notebook.
config_dataset={ "training_dataset": {
 "name": "ruslanmv/ai-medical-dataset", # The dataset name (Hugging Face datasets)
 "split": "train", # The dataset split
 "input_fields": ["question", "context"] ,# The raw columns; formatting_prompts_func reads columns with these names
 "input_field": "text",# The column with the formatted prompt, passed to SFTTrainer as dataset_text_field
 },
 }

In [None]:
config_dataset.get("training_dataset")

In [None]:
# Loading the training dataset
train_dataset = load_dataset(config_dataset.get("training_dataset").get("name"), split = config_dataset.get("training_dataset").get("split"))

In [None]:
train_dataset

In [None]:
# Select the first 100 rows of the dataset as a small subset
test_dataset = train_dataset.select(range(100))

In [None]:
test_dataset

In [None]:
test_dataset[1]

In [None]:
# Prompt template for the training text. It has two "{}" slots: formatting_prompts_func
# fills the first with the question and the second with the context (the answer).
medical_prompt = """You are an AI Medical Assistant Chatbot, trained to answer medical questions. Below is an instruction that describes a task, paired with an response context. Write a response that appropriately completes the request.

### Instruction:
{}


### Response:
{}"""

# End-of-sequence token of the current tokenizer; it is appended to every formatted example
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Build the "text" column for a batch of examples.

    Args:
        examples: A batch mapping with the parallel sequences "question" and
            "context".

    Returns:
        A dict with the single key "text": one string per example, made of
        `medical_prompt` filled with the question and the context, followed by
        `EOS_TOKEN`.
    """
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    return {
        "text": [
            medical_prompt.format(question, answer) + EOS_TOKEN
            for question, answer in zip(examples["question"], examples["context"])
        ],
    }

In [None]:
test_dataset= test_dataset.map(formatting_prompts_func, batched = True,)

In [None]:
test_dataset

In [None]:
test_dataset['text'][1]

In [None]:
# Choose the training data: in test mode reuse the already formatted subset,
# otherwise format the full training dataset now.
is_test = True
train_dataset = (
    test_dataset
    if is_test
    else train_dataset.map(formatting_prompts_func, batched=True)
)

In [None]:
train_dataset['text'][1]

In [None]:
# Set up the supervised fine-tuning trainer for the model.
# Model, tokenizer and dataset come from the cells above; all training
# hyperparameters are read from config["training_config"].
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    # Column that holds the fully formatted prompt text
    dataset_text_field=config_dataset.get("training_dataset").get("input_field"),
    max_seq_length=config.get("model_config").get("max_seq_length"),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=config.get("training_config").get("per_device_train_batch_size"),
        gradient_accumulation_steps=config.get("training_config").get("gradient_accumulation_steps"),
        warmup_steps=config.get("training_config").get("warmup_steps"),
        max_steps=config.get("training_config").get("max_steps"),
        num_train_epochs=config.get("training_config").get("num_train_epochs"),
        learning_rate=config.get("training_config").get("learning_rate"),
        fp16=config.get("training_config").get("fp16"),
        bf16=config.get("training_config").get("bf16"),
        logging_steps=config.get("training_config").get("logging_steps"),
        optim=config.get("training_config").get("optim"),
        weight_decay=config.get("training_config").get("weight_decay"),
        lr_scheduler_type=config.get("training_config").get("lr_scheduler_type"),
        # Take the seed from the config so its "seed" entry takes effect;
        # 42 remains the fallback when the entry is missing.
        seed=config.get("training_config").get("seed", 42),
        output_dir=config.get("training_config").get("output_dir"),
    ),
)

In [None]:
# GPU memory baseline, captured before training starts (values in GB, rounded to 2 decimals)
gpu_statistics = torch.cuda.get_device_properties(0)
# Total memory of GPU 0
max_memory = round(gpu_statistics.total_memory / 1024**3, 2)
# Peak memory reserved by PyTorch so far
reserved_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 2)
print(
    f"Reserved Memory: {reserved_memory}GB",
    f"Max Memory: {max_memory}GB",
    sep="\n",
)

In [None]:
## [ 1038/2651250 53:49 < 2295:10:28, 0.32 it/s, Epoch 0.00/1] old

In [None]:
# Training the model
trainer_stats = trainer.train()

In [None]:
# Memory statistics after training (values in GB, rounded to 2 decimals).
# The peak is read with max_memory_reserved(), the same counter that produced
# `reserved_memory` before training, so the difference compares like with like.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 2)
# Growth of the peak relative to the pre-training baseline
used_memory_lora = round(used_memory - reserved_memory, 2)
# Both figures as a share of the total GPU memory
used_memory_persentage = round((used_memory / max_memory) * 100, 2)
used_memory_lora_persentage = round((used_memory_lora / max_memory) * 100, 2)
print(f"Used Memory: {used_memory}GB ({used_memory_persentage}%)")
print(f"Used Memory for training(fine-tuning) LoRA: {used_memory_lora}GB ({used_memory_lora_persentage}%)")

In [None]:
# Write the result of trainer.train() to trainer_stats.json (4-space indented JSON)
with open("trainer_stats.json", "w") as stats_file:
    stats_file.write(json.dumps(trainer_stats, indent=4))

In [None]:
# Locally saving the model and pushing it to the Hugging Face Hub (only LoRA adapters).
# Both calls receive config["model_config"]["finetuned_model"], so the same string is
# used for the local save location and for the Hub repository.
# NOTE(review): the login(token) call earlier in this notebook is commented out —
# confirm that Hub credentials are configured before running the push.
model.save_pretrained(config.get("model_config").get("finetuned_model"))
model.push_to_hub(config.get("model_config").get("finetuned_model"), tokenizer = tokenizer)

In [None]:
# Export the model as merged weights (16-bit and 4-bit) and as GGUF files
# (default quantization, f16 and q4_k_m). Each variant is saved locally first
# and then pushed to the Hub.
# NOTE(review): every variant uses the same target, config["model_config"]["finetuned_model"];
# later exports presumably overwrite files written by earlier ones — confirm this
# is intended, or keep only the variant that is needed.
for merge_method in ("merged_16bit", "merged_4bit"):
    model.save_pretrained_merged(
        config.get("model_config").get("finetuned_model"),
        tokenizer,
        save_method=merge_method,
    )
    model.push_to_hub_merged(
        config.get("model_config").get("finetuned_model"),
        tokenizer,
        save_method=merge_method,
    )

# An empty option dict keeps the library's default quantization method
for gguf_options in ({}, {"quantization_method": "f16"}, {"quantization_method": "q4_k_m"}):
    model.save_pretrained_gguf(
        config.get("model_config").get("finetuned_model"),
        tokenizer,
        **gguf_options,
    )
    model.push_to_hub_gguf(
        config.get("model_config").get("finetuned_model"),
        tokenizer,
        **gguf_options,
    )

In [None]:
# Loading the fine-tuned model and the tokenizer for inference
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=config.get("model_config").get("finetuned_model"),
    max_seq_length=config.get("model_config").get("max_seq_length"),
    dtype=config.get("model_config").get("dtype"),
    load_in_4bit=config.get("model_config").get("load_in_4bit"),
)

# Using FastLanguageModel for fast inference
FastLanguageModel.for_inference(model)

# Build the prompt from the same template the training text was built with.
# The response slot is left empty so that the model generates the answer.
question = "What was the main cause of the inflammatory CD4+ T cells?"
inputs = tokenizer(
    [medical_prompt.format(question, "")],
    return_tensors="pt",
).to("cuda")

# Generate up to 256 new tokens and decode them without special tokens
outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)
tokenizer.batch_decode(outputs, skip_special_tokens=True)