I'm getting this runtime error on Colab:
RuntimeError Traceback (most recent call last)
/tmp/ipython-input-2302535289.py in <cell line: 0>()
6
7 base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
----> 8 model = PeftModel.from_pretrained(base_model, "polyglots/SinLlama_v01")
3 frames
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in load_state_dict(self, state_dict, strict, assign)
2622
2623 if len(error_msgs) > 0:
-> 2624 raise RuntimeError(
2625 "Error(s) in loading state_dict for {}:\n\t{}".format(
2626 self.__class__.__name__, "\n\t".join(error_msgs)
RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
size mismatch for base_model.model.model.embed_tokens.original_module.weight: copying a param with shape torch.Size([139336, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
size mismatch for base_model.model.model.embed_tokens.modules_to_save.default.weight: copying a param with shape torch.Size([139336, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
size mismatch for base_model.model.lm_head.original_module.weight: copying a param with shape torch.Size([139336, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
size mismatch for base_model.model.lm_head.modules_to_save.default.weight: copying a param with shape torch.Size([139336, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
This is the full code I'm using to load the model for further experiments:
# Install dependencies
!pip install unsloth # @ git+https://github.com/unslothai/unsloth.git
!pip install datasets==2.21.0
!pip install pandas==2.1.4
# Import dependencies
from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import TextStreamer, AutoTokenizer
import torch
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
from collections import Counter, defaultdict
import os
import sys
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
import pandas as pd
# Load the base model
model_config = {"model_name": "unsloth/llama-3-8b", "load_in_4bit": False}
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
model_name = "polyglots/SinLlama_v01" # Change the model name
# Load the model
model, _ = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    resize_model_vocab = 139336,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
# Load our extended tokenizer
tokenizer = AutoTokenizer.from_pretrained("polyglots/Extended-Sinhala-LLaMA")
model.resize_token_embeddings(len(tokenizer))
The error occurs because the checkpoint you’re trying to load (polyglots/SinLlama_v01) was trained with an extended vocabulary of 139,336 tokens, while the base model (meta-llama/Meta-Llama-3-8B) has only 128,256 tokens. When you try to load it directly with PeftModel.from_pretrained, the embedding layers and the LM head do not match in size, resulting in the size mismatch error.
The solution is to resize the model’s token embeddings to match the checkpoint’s vocabulary before loading the PEFT model. In the code you shared, this is handled by:
model, _ = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    resize_model_vocab = 139336, # resize embeddings to match checkpoint
)
tokenizer = AutoTokenizer.from_pretrained("polyglots/Extended-Sinhala-LLaMA")
model.resize_token_embeddings(len(tokenizer)) # align embeddings with tokenizer
This ensures the model’s embeddings and LM head are the correct size, avoiding the mismatch.
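If you want to stay with plain transformers + peft (the path shown in your traceback) instead of Unsloth, the same idea applies: resize the base model to the extended vocabulary before attaching the adapter. A minimal sketch, assuming the extended tokenizer at polyglots/Extended-Sinhala-LLaMA carries the full 139,336-token vocabulary:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model and the extended tokenizer
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained("polyglots/Extended-Sinhala-LLaMA")

# Resize the embeddings and LM head to the extended vocabulary BEFORE loading the adapter
base_model.resize_token_embeddings(len(tokenizer))  # should match the checkpoint's 139336

# Now the checkpoint's embed_tokens / lm_head shapes line up with the model
model = PeftModel.from_pretrained(base_model, "polyglots/SinLlama_v01")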
Can you please update the instructions to run this on Apple Silicon without using unsloth? I get the following error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.11/site-packages/unsloth/__init__.py", line 79, in <module>
DEVICE_TYPE : str = get_device_type()
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/unsloth/__init__.py", line 77, in get_device_type
raise NotImplementedError("Unsloth currently only works on NVIDIA GPUs and Intel GPUs.")
NotImplementedError: Unsloth currently only works on NVIDIA GPUs and Intel GPUs.
I also ran into this issue with unsloth on Apple Silicon. The following workaround helped, though I'm not sure it is the best option. Here are the steps:
- First, merge the base Llama model with the SinLlama adapter and resize the token embeddings.
- Then convert the merged model to GGUF format using llama.cpp.
- Finally, load and run the GGUF model.
Step 1: Merge the model and resize the token embeddings
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Define model names
base_model_name = "./Meta-Llama-3-8B" # I used the downloaded model files,
# you can give Hugging Face path.
lora_model_name = "./SinLlama_v01"
merged_model_directory = "./merged_SinLlama"
# --- Step 1: Load the Base Model onto the CPU FIRST ---
print("Loading base model onto CPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16, # Still use bfloat16 to reduce memory footprint
    #
    # IMPORTANT: Do NOT use device_map="auto" here. Force to CPU.
    # Python crashed when using "auto"
    #
)
# --- Step 2: Load the LoRA's Tokenizer ---
print("Loading LoRA tokenizer...")
lora_tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
# --- Step 3: Resize the Base Model's Vocabulary (on the CPU) ---
print("Resizing base model token embeddings...")
base_model.resize_token_embeddings(len(lora_tokenizer))
print("Resize successful on CPU.")
# --- Step 4: Load the LoRA Adapter (still on the CPU) ---
print("Loading LoRA adapter...")
# Load the adapter while the full model is still on the CPU
model = PeftModel.from_pretrained(base_model, lora_model_name)
print("LoRA model loaded successfully!")
# --- Step 5: Merge and Save ---
print("Merging the model and adapter...")
model = model.merge_and_unload()
print("Merge complete.")
print(f"Saving merged model to {merged_model_directory}...")
model.save_pretrained(merged_model_directory)
lora_tokenizer.save_pretrained(merged_model_directory)
print("Merged model saved successfully!")
Step 2: Convert the merged model to GGUF using llama.cpp
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
pip install -r requirements.txt
Note that you need a Python version >= 3.7 and < 3.11 for this.
Then convert the merged model with the command below:
python convert_hf_to_gguf.py /pathtomergedfile/merged_SinLlama --outfile SinLlama_v01.gguf --outtype f16
Step 3: Load and run the GGUF model
from llama_cpp import Llama
# Path to your GGUF model file
gguf_model_path = "/path_to_gguf/SinLlama_v01.gguf"
# Load the model
llm = Llama(
    model_path=gguf_model_path,
    n_ctx=2048, # Context window size (equivalent to max_seq_length)
    n_gpu_layers=-1, # Offload all possible layers to the M2 GPU for max performance
    verbose=False
)
# --- Run Inference ---
prompt = "ශ්රී ලංකාවේ අගනුවර කුමක්ද?"
output = llm(prompt, max_tokens=256) # Generate up to 256 tokens
print(output['choices'][0]['text'])
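If you prefer to see tokens as they are generated (similar to TextStreamer in the Colab snippet), llama-cpp-python can also stream the completion; a small sketch reusing the same llm object and prompt as above:
# Stream the completion chunk by chunk instead of waiting for the full text
for chunk in llm(prompt, max_tokens=256, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()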