Spaces:
Runtime error
Runtime error
File size: 5,270 Bytes
abb6f94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import logging
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
# from langchain.llms import LlamaCpp
from langchain_community.llms import LlamaCpp
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
LlamaForCausalLM,
LlamaTokenizer,
)
from langchain_community.llms import HuggingFacePipeline
from langchain.callbacks.manager import CallbackManager
from transformers import GenerationConfig, pipeline
import torch
import os
from app.settings import Config
conf = Config()
logger = logging.getLogger(__name__)
MODELS_PATH = conf.MODELS_PATH
CONTEXT_WINDOW_SIZE = 2048
MAX_NEW_TOKENS = 2048
N_BATCH= 512
N_GPU_LAYERS = 1
CACHE_DIR = conf.CACHE_DIR #"./models/"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = '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'
def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
try:
logging.info("Using Llamacpp for GGUF/GGML quantized models")
model_path = hf_hub_download(
repo_id=model_id,
filename=model_basename,
resume_download=True,
# force_download=True,
cache_dir=MODELS_PATH,
)
kwargs = {
"model_path": model_path,
"n_ctx": CONTEXT_WINDOW_SIZE,
"max_tokens": MAX_NEW_TOKENS,
"n_batch": N_BATCH, # set this based on your GPU & CPU RAM
}
if device_type.lower() == "mps":
kwargs["n_gpu_layers"] = 1
if device_type.lower() == "cuda":
kwargs["n_gpu_layers"] = N_GPU_LAYERS # set this based on your GPU
return LlamaCpp(**kwargs)
except:
if "ggml" in model_basename:
logging.INFO("If you were using GGML model, LLAMA-CPP Dropped Support, Use GGUF Instead")
return None
def load_quantized_model_qptq(model_id, model_basename, device_type, logging):
logging.info("Using AutoGPTQForCausalLM for quantized models")
if ".safetensors" in model_basename:
# Remove the ".safetensors" ending if present
model_basename = model_basename.replace(".safetensors", "")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
logging.info("Tokenizer loaded")
model = AutoGPTQForCausalLM.from_quantized(
model_id,
model_basename=model_basename,
use_safetensors=True,
trust_remote_code=True,
device_map="auto",
use_triton=False,
quantize_config=None,
)
return model, tokenizer
def load_full_model(model_id, model_basename, device_type, logging):
if device_type.lower() in ["mps", "cpu"]:
logging.info("Using LlamaTokenizer")
tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR, use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]) #
model = LlamaForCausalLM.from_pretrained(model_id, cache_dir=CACHE_DIR, use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]) #, cache_dir=CACHE_DIR
else:
logging.info("Using AutoModelForCausalLM for full models")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR, use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]) #, cache_dir=CACHE_DIR
logging.info("Tokenizer loaded")
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
cache_dir=MODELS_PATH,
use_auth_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
# trust_remote_code=True, # set these if you are using NVIDIA GPU
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.float16,
# max_memory={0: "15GB"} # Uncomment this line with you encounter CUDA out of memory errors
)
model.tie_weights()
return model, tokenizer
def load_model(device_type, model_id, model_basename=None, LOGGING=logger):
logger.info(f"Loading Model: {model_id}, on: {device_type}")
logger.info("This action can take a few minutes!")
if model_basename is not None:
if ".gguf" in model_basename.lower():
llm = load_quantized_model_gguf_ggml(
model_id, model_basename, device_type, LOGGING)
return llm
elif ".ggml" in model_basename.lower():
model, tokenizer = load_quantized_model_gguf_ggml(
model_id, model_basename, device_type, LOGGING)
else:
model, tokenizer = load_quantized_model_qptq(
model_id, model_basename, device_type, LOGGING)
else:
model, tokenizer = load_full_model(
model_id, model_basename, device_type, LOGGING)
# Load configuration from the model to avoid warnings
generation_config = GenerationConfig.from_pretrained(model_id)
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_length=MAX_NEW_TOKENS,
temperature=0.1,
# top_p=0.95,
repetition_penalty=1.15,
generation_config=generation_config,
)
local_llm = HuggingFacePipeline(pipeline=pipe)
logger.info("Local LLM Loaded")
return local_llm
|