Spaces:

hmrizal
/

CSVBot-OpenSource

Sleeping

App Files Files Community

hmrizal committed on Apr 28

Commit

9e0e548

verified ·

1 Parent(s): b01ddc0

update initialize_model_once and create_llm_pipeline for GGUF model, add llama_cpp, add fallback hierarchy system

Browse files

Files changed (1) hide show

app.py +178 -29

app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import gradio as gr
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force CPU only
 import uuid
 import threading
@@ -10,9 +12,13 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.llms import HuggingFacePipeline
 from langchain.chains import LLMChain
-from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration, pipeline
 from langchain.prompts import PromptTemplate
-import time
 # Global model cache
 MODEL_CACHE = {
@@ -34,7 +40,7 @@ MODEL_CONFIG = {
     },
     "TinyLlama Chat": {
         "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-        "description": "Compact 1.1B parameter model, fast but less powerful",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
     "Mistral Instruct": {
@@ -44,12 +50,12 @@ MODEL_CONFIG = {
     },
     "Phi-4 Mini Instruct": {
         "name": "microsoft/Phi-4-mini-instruct",
-        "description": "Compact Microsoft model with strong instruction following",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
     "DeepSeek Coder Instruct": {
         "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
-        "description": "1.3B model specialized for code understanding",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
     "DeepSeek Lite Chat": {
@@ -75,15 +81,22 @@ MODEL_CONFIG = {
     }
 }
 def initialize_model_once(model_key):
-    """Initialize the model once and cache it"""
     with MODEL_CACHE["init_lock"]:
         current_model = MODEL_CACHE["model_name"]
         if MODEL_CACHE["model"] is None or current_model != model_key:
-            # Clear previous model from memory if any
             if MODEL_CACHE["model"] is not None:
                 del MODEL_CACHE["model"]
-                del MODEL_CACHE["tokenizer"]
                 torch.cuda.empty_cache() if torch.cuda.is_available() else None
             model_info = MODEL_CONFIG[model_key]
@@ -92,8 +105,45 @@ def initialize_model_once(model_key):
             try:
                 print(f"Loading model: {model_name}")
-                # Handle T5 models separately
-                if model_info.get("is_t5", False):
                     MODEL_CACHE["tokenizer"] = T5Tokenizer.from_pretrained(model_name)
                     MODEL_CACHE["model"] = T5ForConditionalGeneration.from_pretrained(
                         model_name,
@@ -101,16 +151,27 @@ def initialize_model_once(model_key):
                         device_map="auto" if torch.cuda.is_available() else None,
                         low_cpu_mem_usage=True
                     )
                 else:
-                    # Load tokenizer and model with appropriate configuration
-                    MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
                     MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                         model_name,
                         torch_dtype=model_info["dtype"],
                         device_map="auto" if torch.cuda.is_available() else None,
                         low_cpu_mem_usage=True,
                         trust_remote_code=True
                     )
                 print(f"Model {model_name} loaded successfully")
             except Exception as e:
                 import traceback
@@ -118,28 +179,39 @@ def initialize_model_once(model_key):
                 print(traceback.format_exc())
                 raise RuntimeError(f"Failed to load model {model_name}: {str(e)}")
-    if MODEL_CACHE["model"] is None or MODEL_CACHE["tokenizer"] is None:
-        raise ValueError(f"Model or tokenizer not initialized properly for {model_key}")
-    return MODEL_CACHE["tokenizer"], MODEL_CACHE["model"], model_info.get("is_t5", False)
 def create_llm_pipeline(model_key):
     """Create a new pipeline using the specified model"""
     try:
         print(f"Creating pipeline for model: {model_key}")
-        tokenizer, model, is_t5 = initialize_model_once(model_key)
-        if model is None or tokenizer is None:
-            raise ValueError(f"Model or tokenizer is None for {model_key}")
-        # Create appropriate pipeline based on model type
-        if is_t5:
             print("Creating T5 pipeline")
             pipe = pipeline(
                 "text2text-generation",
                 model=model,
                 tokenizer=tokenizer,
-                max_new_tokens=128,  # Reduced for better performance
                 temperature=0.3,
                 top_p=0.9,
                 return_full_text=False,
@@ -150,7 +222,7 @@ def create_llm_pipeline(model_key):
                 "text-generation",
                 model=model,
                 tokenizer=tokenizer,
-                max_new_tokens=128,  # Reduced for better performance
                 temperature=0.3,
                 top_p=0.9,
                 top_k=30,
@@ -159,13 +231,73 @@ def create_llm_pipeline(model_key):
             )
         print("Pipeline created successfully")
-        # Wrap pipeline in HuggingFacePipeline for LangChain compatibility
         return HuggingFacePipeline(pipeline=pipe)
     except Exception as e:
         import traceback
         print(f"Error creating pipeline: {str(e)}")
         print(traceback.format_exc())
-        raise RuntimeError(f"Failed to create pipeline: {str(e)}")
 def create_conversational_chain(db, file_path, model_key):
     llm = create_llm_pipeline(model_key)
@@ -523,10 +655,27 @@ def create_gradio_interface():
         def handle_process_file(file, model_key, sess_id):
             if file is None:
                 return None, None, False, "Mohon upload file CSV terlebih dahulu."
-            chatbot = ChatBot(sess_id, model_key)
-            result = chatbot.process_file(file)
-            return chatbot, True, [(None, result)]
         process_button.click(
             fn=handle_process_file,

 import gradio as gr
+import gc
 import os
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
 os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force CPU only
 import uuid
 import threading
 from langchain.vectorstores import FAISS
 from langchain.llms import HuggingFacePipeline
 from langchain.chains import LLMChain
+from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration, BitsAndBytesConfig, pipeline
 from langchain.prompts import PromptTemplate
+from llama_cpp import Llama
+import re
+import datetime
+import warnings
+warnings.filterwarnings('ignore')
 # Global model cache
 MODEL_CACHE = {
     },
     "TinyLlama Chat": {
         "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+        "description": "Model ringan dengan 1.1B parameter, cepat dan ringan",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
     "Mistral Instruct": {
     },
     "Phi-4 Mini Instruct": {
         "name": "microsoft/Phi-4-mini-instruct",
+        "description": "Model yang ringan dari Microsoft cocok untuk tugas instruksional",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
     "DeepSeek Coder Instruct": {
         "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
+        "description": "1.3B model untuk kode dan analisis data",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
     "DeepSeek Lite Chat": {
     }
 }
+# Register the fallback model in MODEL_CONFIG.
+# create_fallback_pipeline() looks this entry up under the key "Fallback Model".
+MODEL_CONFIG["Fallback Model"] = {
+    "name": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+    "description": "Model sangat ringan untuk fallback",
+    "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
+}
 def initialize_model_once(model_key):
     with MODEL_CACHE["init_lock"]:
         current_model = MODEL_CACHE["model_name"]
         if MODEL_CACHE["model"] is None or current_model != model_key:
+            # Clear previous model
             if MODEL_CACHE["model"] is not None:
                 del MODEL_CACHE["model"]
+                if MODEL_CACHE["tokenizer"] is not None:
+                    del MODEL_CACHE["tokenizer"]
                 torch.cuda.empty_cache() if torch.cuda.is_available() else None
             model_info = MODEL_CONFIG[model_key]
             try:
                 print(f"Loading model: {model_name}")
+                # Periksa apakah ini model GGUF
+                if "GGUF" in model_name:
+                    # Download model file terlebih dahulu jika belum ada
+                    from huggingface_hub import hf_hub_download
+                    try:
+                        # Coba temukan file GGUF di repo
+                        repo_id = model_name
+                        model_path = hf_hub_download(
+                            repo_id=repo_id,
+                            filename="model.gguf"  # Nama file dapat berbeda
+                        )
+                    except Exception as e:
+                        print(f"Couldn't find model.gguf, trying other filenames: {str(e)}")
+                        # Coba cari file GGUF dengan nama lain
+                        import requests
+                        from huggingface_hub import list_repo_files
+                        files = list_repo_files(repo_id)
+                        gguf_files = [f for f in files if f.endswith('.gguf')]
+                        if not gguf_files:
+                            raise ValueError(f"No GGUF files found in {repo_id}")
+                        # Gunakan file GGUF pertama yang ditemukan
+                        model_path = hf_hub_download(repo_id=repo_id, filename=gguf_files[0])
+                    # Load model GGUF dengan llama-cpp-python
+                    MODEL_CACHE["model"] = Llama(
+                        model_path=model_path,
+                        n_ctx=2048,  # Konteks yang lebih kecil untuk penghematan memori
+                        n_batch=512,
+                        n_threads=2  # Sesuaikan dengan 2 vCPU
+                    )
+                    MODEL_CACHE["tokenizer"] = None  # GGUF tidak membutuhkan tokenizer terpisah
+                    MODEL_CACHE["is_gguf"] = True
+                # Handle T5 models
+                elif model_info.get("is_t5", False):
                     MODEL_CACHE["tokenizer"] = T5Tokenizer.from_pretrained(model_name)
                     MODEL_CACHE["model"] = T5ForConditionalGeneration.from_pretrained(
                         model_name,
                         device_map="auto" if torch.cuda.is_available() else None,
                         low_cpu_mem_usage=True
                     )
+                    MODEL_CACHE["is_gguf"] = False
+                # Handle standard HF models
                 else:
+                    quantization_config = BitsAndBytesConfig(
+                        load_in_4bit=True,
+                        bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                        bnb_4bit_quant_type="nf4",
+                        bnb_4bit_use_double_quant=True
+                    )
+                    MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                     MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                         model_name,
+                        quantization_config=quantization_config,
                         torch_dtype=model_info["dtype"],
                         device_map="auto" if torch.cuda.is_available() else None,
                         low_cpu_mem_usage=True,
                         trust_remote_code=True
                     )
+                    MODEL_CACHE["is_gguf"] = False
                 print(f"Model {model_name} loaded successfully")
             except Exception as e:
                 import traceback
                 print(traceback.format_exc())
                 raise RuntimeError(f"Failed to load model {model_name}: {str(e)}")
+    return MODEL_CACHE["tokenizer"], MODEL_CACHE["model"], MODEL_CACHE.get("is_gguf", False)
 def create_llm_pipeline(model_key):
     """Create a new pipeline using the specified model"""
     try:
         print(f"Creating pipeline for model: {model_key}")
+        tokenizer, model, is_gguf = initialize_model_once(model_key)
+        if model is None:
+            raise ValueError(f"Model is None for {model_key}")
+        # For GGUF models from llama-cpp-python
+        if is_gguf:
+            # Buat adaptor untuk menggunakan model GGUF seperti HF pipeline
+            from langchain.llms import LlamaCpp
+            llm = LlamaCpp(
+                model_path=model.model_path,
+                temperature=0.3,
+                max_tokens=128,
+                top_p=0.9,
+                n_ctx=2048,
+                streaming=False
+            )
+            return llm
+        # Create appropriate pipeline for HF models
+        elif getattr(model_info, "is_t5", False):
             print("Creating T5 pipeline")
             pipe = pipeline(
                 "text2text-generation",
                 model=model,
                 tokenizer=tokenizer,
+                max_new_tokens=128,
                 temperature=0.3,
                 top_p=0.9,
                 return_full_text=False,
                 "text-generation",
                 model=model,
                 tokenizer=tokenizer,
+                max_new_tokens=128,
                 temperature=0.3,
                 top_p=0.9,
                 top_k=30,
             )
         print("Pipeline created successfully")
         return HuggingFacePipeline(pipeline=pipe)
     except Exception as e:
         import traceback
         print(f"Error creating pipeline: {str(e)}")
         print(traceback.format_exc())
+        # Fallback ke model sederhana jika yang utama gagal
+        if model_key != "Fallback Model":
+            print(f"Trying fallback model")
+            try:
+                return create_fallback_pipeline()
+            except:
+                raise RuntimeError(f"Failed to create pipeline: {str(e)}")
+        else:
+            raise RuntimeError(f"Failed to create pipeline: {str(e)}")
+def create_fallback_pipeline():
+    """Create a fallback pipeline with a very small model"""
+    model_key = "Fallback Model"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG[model_key]["name"])
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_CONFIG[model_key]["name"],
+        torch_dtype=MODEL_CONFIG[model_key]["dtype"],
+        device_map="auto" if torch.cuda.is_available() else None,
+        low_cpu_mem_usage=True
+    )
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        max_new_tokens=128,
+        temperature=0.3,
+        return_full_text=False,
+    )
+    return HuggingFacePipeline(pipeline=pipe)
+def handle_model_loading_error(model_key, session_id):
+    """Handle model loading errors with fallback options"""
+    fallback_hierarchy = [
+        "DeepSeek Coder Instruct",  # 1.3B model
+        "Phi-4",                  # 1.5B model
+        "TinyLlama-Chat",           # 1.1B model
+        "Flan-T5-Small"             # Paling ringan
+    ]
+    # Jika model yang gagal sudah merupakan fallback terakhir, berikan pesan error
+    if model_key == fallback_hierarchy[-1]:
+        return None, f"Tidak dapat memuat model {model_key}. Harap coba lagi nanti."
+    # Temukan posisi model yang gagal dalam hirarki
+    try:
+        current_index = fallback_hierarchy.index(model_key)
+    except ValueError:
+        current_index = -1
+    # Coba model berikutnya dalam hirarki
+    for fallback_model in fallback_hierarchy[current_index+1:]:
+        try:
+            print(f"Trying fallback model: {fallback_model}")
+            chatbot = ChatBot(session_id, fallback_model)
+            return chatbot, f"Model {model_key} tidak tersedia. Menggunakan {fallback_model} sebagai alternatif."
+        except Exception as e:
+            print(f"Fallback model {fallback_model} also failed: {str(e)}")
+    return None, "Semua model gagal dimuat. Harap coba lagi nanti."
 def create_conversational_chain(db, file_path, model_key):
     llm = create_llm_pipeline(model_key)
         def handle_process_file(file, model_key, sess_id):
             if file is None:
                 return None, None, False, "Mohon upload file CSV terlebih dahulu."
+            try:
+                chatbot = ChatBot(sess_id, model_key)
+                result = chatbot.process_file(file)
+                return chatbot, True, [(None, result)]
+            except Exception as e:
+                import traceback
+                print(f"Error processing file with {model_key}: {str(e)}")
+                print(traceback.format_exc())
+                # Coba dengan model fallback
+                try:
+                    chatbot, message = handle_model_loading_error(model_key, sess_id)
+                    if chatbot is not None:
+                        result = chatbot.process_file(file)
+                        return chatbot, True, [(None, message), (None, result)]
+                    else:
+                        return None, False, [(None, message)]
+                except Exception as fb_err:
+                    error_msg = f"Error dengan model {model_key}: {str(e)}\n\nFallback juga gagal: {str(fb_err)}"
+                    return None, False, [(None, error_msg)]
         process_button.click(
             fn=handle_process_file,