Spaces:

Chamin09
/

ChatCSV

Sleeping

App Files Files Community

Chamin09 committed on Apr 22

Commit

89efbe0

verified ·

1 Parent(s): e13d87a

Update models/llm_setup.py

Browse files

Files changed (1) hide show

models/llm_setup.py +65 -64

models/llm_setup.py CHANGED Viewed

@@ -1,64 +1,65 @@
-from typing import Optional
-from llama_index.llms import HuggingFaceLLM
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-def setup_llm(model_name: str = "microsoft/phi-3-mini-4k-instruct",
-              device: str = None,
-              context_window: int = 4096,
-              max_new_tokens: int = 512) -> HuggingFaceLLM:
-    """
-    Set up the language model for the CSV chatbot.
-    Args:
-        model_name: Name of the Hugging Face model to use
-        device: Device to run the model on ('cuda', 'cpu', etc.)
-        context_window: Maximum context window size
-        max_new_tokens: Maximum number of new tokens to generate
-    Returns:
-        Configured LLM instance
-    """
-    # Determine device
-    if device is None:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-    # Configure quantization for memory efficiency
-    if device == "cuda":
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16
-        )
-    else:
-        quantization_config = None
-    # Configure tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        trust_remote_code=True
-    )
-    # Configure model with appropriate parameters for HF Spaces
-    model_kwargs = {
-        "trust_remote_code": True,
-        "torch_dtype": torch.float16,
-    }
-    if quantization_config:
-        model_kwargs["quantization_config"] = quantization_config
-    # Initialize LLM
-    llm = HuggingFaceLLM(
-        model_name=model_name,
-        tokenizer_name=model_name,
-        context_window=context_window,
-        max_new_tokens=max_new_tokens,
-        generate_kwargs={"temperature": 0.7, "top_p": 0.95},
-        device_map=device,
-        tokenizer_kwargs={"trust_remote_code": True},
-        model_kwargs=model_kwargs,
-        # Cache the model to avoid reloading
-        cache_folder="./model_cache"
-    )
-    return llm

+from typing import Optional
+#from llama_index.llms import HuggingFaceLLM
+from llama_index.llms.huggingface import HuggingFaceLLM
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+def setup_llm(model_name: str = "microsoft/phi-3-mini-4k-instruct",
+              device: str = None,
+              context_window: int = 4096,
+              max_new_tokens: int = 512) -> HuggingFaceLLM:
+    """
+    Set up the language model for the CSV chatbot.
+    Args:
+        model_name: Name of the Hugging Face model to use
+        device: Device to run the model on ('cuda', 'cpu', etc.)
+        context_window: Maximum context window size
+        max_new_tokens: Maximum number of new tokens to generate
+    Returns:
+        Configured LLM instance
+    """
+    # Determine device
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    # Configure quantization for memory efficiency
+    if device == "cuda":
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16
+        )
+    else:
+        quantization_config = None
+    # Configure tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name,
+        trust_remote_code=True
+    )
+    # Configure model with appropriate parameters for HF Spaces
+    model_kwargs = {
+        "trust_remote_code": True,
+        "torch_dtype": torch.float16,
+    }
+    if quantization_config:
+        model_kwargs["quantization_config"] = quantization_config
+    # Initialize LLM
+    llm = HuggingFaceLLM(
+        model_name=model_name,
+        tokenizer_name=model_name,
+        context_window=context_window,
+        max_new_tokens=max_new_tokens,
+        generate_kwargs={"temperature": 0.7, "top_p": 0.95},
+        device_map=device,
+        tokenizer_kwargs={"trust_remote_code": True},
+        model_kwargs=model_kwargs,
+        # Cache the model to avoid reloading
+        cache_folder="./model_cache"
+    )
+    return llm