import logging
import os

import torch
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from langchain.callbacks.manager import CallbackManager
from langchain_community.llms import HuggingFacePipeline, LlamaCpp
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

from app.settings import Config

conf = Config()
logger = logging.getLogger(__name__)

MODELS_PATH = conf.MODELS_PATH
CONTEXT_WINDOW_SIZE = 2048
MAX_NEW_TOKENS = 2048
N_BATCH = 512
N_GPU_LAYERS = 1
CACHE_DIR = conf.CACHE_DIR  # "./models/"

os.environ["HUGGINGFACEHUB_API_TOKEN"] = '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'


def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
    """Load a GGUF/GGML quantized model through llama-cpp via LangChain's LlamaCpp wrapper."""
    try:
        logging.info("Using Llamacpp for GGUF/GGML quantized models")
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            # force_download=True,
            cache_dir=MODELS_PATH,
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": CONTEXT_WINDOW_SIZE,
            "max_tokens": MAX_NEW_TOKENS,
            "n_batch": N_BATCH,  # set this based on your GPU & CPU RAM
        }
        if device_type.lower() == "mps":
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = N_GPU_LAYERS  # set this based on your GPU
        return LlamaCpp(**kwargs)
    except Exception:
        if "ggml" in model_basename:
            logging.info("If you were using a GGML model, llama-cpp dropped support for it; use GGUF instead")
        return None


def load_quantized_model_qptq(model_id, model_basename, device_type, logging):
    """Load a GPTQ-quantized model with AutoGPTQ and its matching tokenizer."""
    logging.info("Using AutoGPTQForCausalLM for quantized models")

    if ".safetensors" in model_basename:
        # Remove the ".safetensors" ending if present
        model_basename = model_basename.replace(".safetensors", "")

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    logging.info("Tokenizer loaded")

    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )
    return model, tokenizer


def load_full_model(model_id, model_basename, device_type, logging):
    """Load an unquantized Hugging Face model and tokenizer for the given device."""
    if device_type.lower() in ["mps", "cpu"]:
        logging.info("Using LlamaTokenizer")
        tokenizer = LlamaTokenizer.from_pretrained(
            model_id,
            cache_dir=CACHE_DIR,
            use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        )
        model = LlamaForCausalLM.from_pretrained(
            model_id,
            cache_dir=CACHE_DIR,
            use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        )
    else:
        logging.info("Using AutoModelForCausalLM for full models")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            cache_dir=CACHE_DIR,
            use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        )
        logging.info("Tokenizer loaded")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            cache_dir=MODELS_PATH,
            use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
            # trust_remote_code=True,
            # set these if you are using an NVIDIA GPU
            # load_in_4bit=True,
            # bnb_4bit_quant_type="nf4",
            # bnb_4bit_compute_dtype=torch.float16,
            # max_memory={0: "15GB"},  # Uncomment this line if you encounter CUDA out-of-memory errors
        )
        model.tie_weights()
    return model, tokenizer


def load_model(device_type, model_id, model_basename=None, LOGGING=logger):
    """Select the right loader based on the model basename and wrap the result as a LangChain LLM."""
    logger.info(f"Loading Model: {model_id}, on: {device_type}")
    logger.info("This action can take a few minutes!")

    if model_basename is not None:
        if ".gguf" in model_basename.lower() or ".ggml" in model_basename.lower():
            # LlamaCpp wraps the model directly; no separate tokenizer or pipeline is needed
            llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
            return llm
        else:
            model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
    else:
        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)

    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=MAX_NEW_TOKENS,
        temperature=0.1,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)
    logger.info("Local LLM Loaded")

    return local_llm
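

# --- Usage sketch (illustrative, not part of the module's public API) ---
# A minimal example of how load_model might be called. The repo id and GGUF
# basename below are assumptions chosen for illustration; substitute the model
# you actually serve. A .gguf basename returns a LlamaCpp instance, while the
# GPTQ and full-model paths return a HuggingFacePipeline wrapper; both expose
# the LangChain .invoke() interface.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    llm = load_model(
        device_type="cuda" if torch.cuda.is_available() else "cpu",
        model_id="TheBloke/Llama-2-7B-Chat-GGUF",      # assumed example repo
        model_basename="llama-2-7b-chat.Q4_K_M.gguf",  # assumed example file
    )
    print(llm.invoke("Briefly introduce yourself."))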