# fastapi-rag-qa/app/rag_pipeline/model_loader.py
import logging
import os

import torch
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
# from langchain.llms import LlamaCpp  # old import path, superseded by langchain_community
from langchain_community.llms import HuggingFacePipeline, LlamaCpp
from langchain.callbacks.manager import CallbackManager
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

from app.settings import Config

conf = Config()
logger = logging.getLogger(__name__)

MODELS_PATH = conf.MODELS_PATH
CONTEXT_WINDOW_SIZE = 2048
MAX_NEW_TOKENS = 2048
N_BATCH = 512
N_GPU_LAYERS = 1
CACHE_DIR = conf.CACHE_DIR  # "./models/"

# Do not hard-code the Hugging Face token in source; read it from the environment
# (set HUGGINGFACEHUB_API_TOKEN before starting the app).
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
    """Download a GGUF/GGML quantized checkpoint from the Hub and wrap it in LlamaCpp."""
    try:
        logging.info("Using LlamaCpp for GGUF/GGML quantized models")
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            # force_download=True,
            cache_dir=MODELS_PATH,
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": CONTEXT_WINDOW_SIZE,
            "max_tokens": MAX_NEW_TOKENS,
            "n_batch": N_BATCH,  # set this based on your GPU & CPU RAM
        }
        if device_type.lower() == "mps":
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = N_GPU_LAYERS  # set this based on your GPU
        return LlamaCpp(**kwargs)
    except Exception:
        if "ggml" in model_basename:
            logging.info("If you were using a GGML model, llama-cpp dropped support; use GGUF instead")
        return None
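

# Illustrative call only; the repo id and filename below are examples, not part of this
# project's configuration. Substitute whatever quantized checkpoint your app actually uses.
#
#   llm = load_quantized_model_gguf_ggml(
#       model_id="TheBloke/Llama-2-7B-Chat-GGUF",       # example Hub repo
#       model_basename="llama-2-7b-chat.Q4_K_M.gguf",   # single quantized file in that repo
#       device_type="cuda",
#       logging=logger,
#   )

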
def load_quantized_model_qptq(model_id, model_basename, device_type, logging):
    """Load a GPTQ-quantized model and its tokenizer via AutoGPTQForCausalLM."""
    logging.info("Using AutoGPTQForCausalLM for quantized models")
    if ".safetensors" in model_basename:
        # Remove the ".safetensors" ending if present
        model_basename = model_basename.replace(".safetensors", "")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    logging.info("Tokenizer loaded")
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )
    return model, tokenizer
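

# Sketch of a typical call (assumed values, not taken from this repo): GPTQ repos on the
# Hub usually ship a "model.safetensors" shard, and the extension is stripped above.
#
#   model, tokenizer = load_quantized_model_qptq(
#       model_id="TheBloke/Llama-2-7B-Chat-GPTQ",   # example GPTQ repo
#       model_basename="model.safetensors",
#       device_type="cuda",
#       logging=logger,
#   )

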
def load_full_model(model_id, model_basename, device_type, logging):
    """Load an unquantized model and tokenizer, using the Llama classes on MPS/CPU."""
    if device_type.lower() in ["mps", "cpu"]:
        logging.info("Using LlamaTokenizer")
        tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR, use_auth_token=HF_TOKEN)
        model = LlamaForCausalLM.from_pretrained(model_id, cache_dir=CACHE_DIR, use_auth_token=HF_TOKEN)
    else:
        logging.info("Using AutoModelForCausalLM for full models")
        tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR, use_auth_token=HF_TOKEN)
        logging.info("Tokenizer loaded")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            cache_dir=MODELS_PATH,
            use_auth_token=HF_TOKEN,
            # trust_remote_code=True,  # set these if you are using an NVIDIA GPU
            # load_in_4bit=True,
            # bnb_4bit_quant_type="nf4",
            # bnb_4bit_compute_dtype=torch.float16,
            # max_memory={0: "15GB"},  # uncomment this line if you encounter CUDA out-of-memory errors
        )
        model.tie_weights()
    return model, tokenizer

def load_model(device_type, model_id, model_basename=None, LOGGING=logger):
    """Pick the right loader from the model basename and wrap the result for LangChain.

    GGUF models are returned directly as a LlamaCpp LLM; everything else is wrapped
    in a transformers text-generation pipeline via HuggingFacePipeline.
    """
    logger.info(f"Loading Model: {model_id}, on: {device_type}")
    logger.info("This action can take a few minutes!")
    if model_basename is not None:
        if ".gguf" in model_basename.lower():
            llm = load_quantized_model_gguf_ggml(
                model_id, model_basename, device_type, LOGGING)
            return llm
        elif ".ggml" in model_basename.lower():
            model, tokenizer = load_quantized_model_gguf_ggml(
                model_id, model_basename, device_type, LOGGING)
        else:
            model, tokenizer = load_quantized_model_qptq(
                model_id, model_basename, device_type, LOGGING)
    else:
        model, tokenizer = load_full_model(
            model_id, model_basename, device_type, LOGGING)

    # Load the generation configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=MAX_NEW_TOKENS,
        temperature=0.1,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = HuggingFacePipeline(pipeline=pipe)
    logger.info("Local LLM Loaded")
    return local_llm
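

# Minimal manual smoke test, assuming the package is importable from the project root;
# the model id and filename are examples, not this app's configured defaults.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    llm = load_model(
        device_type=device,
        model_id="TheBloke/Llama-2-7B-Chat-GGUF",       # example GGUF repo
        model_basename="llama-2-7b-chat.Q4_K_M.gguf",   # example quantized file
    )
    print(llm)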