"""LLM setup for the CSV chatbot.

Loads a Hugging Face chat model through llama_index's HuggingFaceLLM
wrapper, with optional 4-bit quantization on CUDA devices.
"""

from typing import Optional

import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import BitsAndBytesConfig


def setup_llm(model_name: str = "microsoft/Phi-3-mini-4k-instruct",
              device: Optional[str] = None,
              context_window: int = 4096,
              max_new_tokens: int = 512) -> HuggingFaceLLM:
""" |
|
Set up the language model for the CSV chatbot. |
|
|
|
Args: |
|
model_name: Name of the Hugging Face model to use |
|
device: Device to run the model on ('cuda', 'cpu', etc.) |
|
context_window: Maximum context window size |
|
max_new_tokens: Maximum number of new tokens to generate |
|
|
|
Returns: |
|
Configured LLM instance |
|
""" |
|
|
|
    # Auto-detect the device if one was not specified.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # 4-bit quantization keeps GPU memory usage low; bitsandbytes
    # quantization is only supported on CUDA devices.
    if device == "cuda":
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
    else:
        quantization_config = None

    # HuggingFaceLLM loads the tokenizer itself from tokenizer_name, so a
    # separate AutoTokenizer.from_pretrained call is unnecessary here.
    # trust_remote_code is needed for models that ship custom model code.
    model_kwargs = {
        "trust_remote_code": True,
        # float16 is poorly supported on CPU; fall back to float32 there.
        "torch_dtype": torch.float16 if device == "cuda" else torch.float32,
    }
    if quantization_config is not None:
        model_kwargs["quantization_config"] = quantization_config

    llm = HuggingFaceLLM(
        model_name=model_name,
        tokenizer_name=model_name,
        context_window=context_window,
        max_new_tokens=max_new_tokens,
        # do_sample must be enabled for temperature/top_p to take effect.
        generate_kwargs={"do_sample": True, "temperature": 0.7, "top_p": 0.95},
        device_map=device,
        tokenizer_kwargs={"trust_remote_code": True},
        model_kwargs=model_kwargs,
        # Cache downloaded weights locally to avoid re-fetching on each run.
        cache_folder="./model_cache",
    )

    return llm
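

# Usage sketch (illustrative, not part of the original module). Assumes the
# llama-index-core package is installed; wires the LLM into llama_index's
# global Settings and runs a single completion. The prompt is just an example.
if __name__ == "__main__":
    from llama_index.core import Settings

    llm = setup_llm()  # auto-detects CUDA vs. CPU
    Settings.llm = llm
    print(llm.complete("In one sentence, what is a CSV file?"))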