sunheycho committed
Commit 3432460 · 1 Parent(s): 85d2710

feat(llama): add llama-cpp-python GGUF Q4 auto-download and set as default base model for LoRA compare; graceful fallback

Files changed (2):
  1. api.py +82 -3
  2. requirements.txt +3 -0
api.py CHANGED
@@ -21,6 +21,7 @@ import sys
 import requests
 import asyncio
 from threading import Thread
+import tempfile
 try:
     from openai import OpenAI
 except Exception as _e:
@@ -431,6 +432,68 @@ except Exception as e:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
 
+# llama.cpp (GGUF) support
+llama_cpp = None
+llama_cpp_model = None
+gguf_model_path = None
+try:
+    import llama_cpp as llama_cpp
+
+    def ensure_q4_gguf_model():
+        """Download a TinyLlama Q4 GGUF model if not present and return the local path."""
+        global gguf_model_path
+        cache_dir = os.path.join(tempfile.gettempdir(), "gguf_models")
+        os.makedirs(cache_dir, exist_ok=True)
+        # Use a small, permissively accessible TinyLlama GGUF
+        filename = "TinyLlama-1.1B-Chat-v1.0.Q4_K_M.gguf"
+        gguf_model_path = os.path.join(cache_dir, filename)
+        if not os.path.exists(gguf_model_path):
+            try:
+                url = (
+                    "https://huggingface.co/TinyLlama/"
+                    "TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/"
+                    + filename
+                )
+                print(f"[GGUF] Downloading model from {url} -> {gguf_model_path}")
+                with requests.get(url, stream=True, timeout=60) as r:
+                    r.raise_for_status()
+                    with open(gguf_model_path, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                print("[GGUF] Download complete")
+            except Exception as e:
+                print(f"[GGUF] Failed to download GGUF model: {e}")
+                return None
+        return gguf_model_path
+
+    def get_llama_cpp_model():
+        """Lazy-load llama.cpp model from local GGUF path."""
+        global llama_cpp_model
+        if llama_cpp_model is not None:
+            return llama_cpp_model
+        model_path = ensure_q4_gguf_model()
+        if not model_path:
+            return None
+        try:
+            print(f"[GGUF] Loading llama.cpp model: {model_path}")
+            llama_cpp_model = llama_cpp.Llama(
+                model_path=model_path,
+                n_ctx=4096,
+                n_threads=max(1, os.cpu_count() or 1),
+                n_gpu_layers=0,  # CPU-friendly default; adjust if GPU offload available
+                verbose=False,
+            )
+            print("[GGUF] llama.cpp model loaded")
+        except Exception as e:
+            print(f"[GGUF] Failed to load llama.cpp model: {e}")
+            llama_cpp_model = None
+        return llama_cpp_model
+except Exception as _e:
+    llama_cpp = None
+    llama_cpp_model = None
+    gguf_model_path = None
+
 # LLM model (using an open-access model instead of Llama 4 which requires authentication)
 llm_model = None
 llm_tokenizer = None
@@ -1277,7 +1340,8 @@ def start_llama_lora_compare():
     """Start a LoRA-vs-Base comparison session (text or image+text prompt)."""
     session_id = request.form.get('session_id') or str(uuid.uuid4())
     prompt = request.form.get('prompt', '')
-    base_model_id = request.form.get('baseModel', 'meta-llama/Llama-3.1-8B-Instruct')
+    # Default to local GGUF TinyLlama Q4 model via llama.cpp
+    base_model_id = request.form.get('baseModel', 'gguf:tinyllama-q4km')
     lora_path = request.form.get('loraPath', '')
     image_b64 = None
     if 'image' in request.files:
@@ -1314,12 +1378,27 @@ def start_llama_lora_compare():
     start_base = time.time()
     base_output = None
     try:
-        if llm_model is not None and llm_tokenizer is not None:
+        # If base_model_id indicates GGUF, use llama.cpp
+        if base_model_id.startswith('gguf:') and llama_cpp is not None:
+            model = get_llama_cpp_model()
+            if model is None:
+                raise RuntimeError('GGUF model unavailable')
+            # Simple chat-style prompt
+            prompt_text = f"You are a helpful assistant.\nUser: {full_prompt}\nAssistant:"
+            res = model(
+                prompt=prompt_text,
+                max_tokens=128,
+                temperature=0.7,
+                top_p=0.9,
+                stop=["User:", "\n\n"],
+            )
+            text = res.get('choices', [{}])[0].get('text', '').strip()
+            base_output = text or ""
+        elif llm_model is not None and llm_tokenizer is not None:
            inputs = llm_tokenizer(full_prompt, return_tensors='pt').to(device)
            with torch.no_grad():
                out = llm_model.generate(**inputs, max_new_tokens=128, temperature=0.7, top_p=0.9)
            text = llm_tokenizer.decode(out[0], skip_special_tokens=True)
-           # strip prompt prefix
            if text.startswith(full_prompt):
                text = text[len(full_prompt):].strip()
            base_output = text
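
How a client exercises the new default (a sketch, not part of the commit): omitting baseModel from the form now selects 'gguf:tinyllama-q4km', so the base side of the comparison runs on the auto-downloaded TinyLlama Q4_K_M GGUF through llama.cpp, while non-GGUF identifiers still fall through to the transformers path. The host and route below are assumptions for illustration; the form fields (session_id, prompt, baseModel, loraPath, image) are the ones read in start_llama_lora_compare.

import requests

# Hypothetical host/route; the Flask route bound to start_llama_lora_compare
# is not shown in this diff.
API_URL = "http://localhost:5000/llama/lora-compare"

form = {
    "prompt": "Summarize the benefits of LoRA fine-tuning in two sentences.",
    # "baseModel" omitted on purpose: the handler now defaults to
    # 'gguf:tinyllama-q4km' (the TinyLlama Q4_K_M GGUF loaded via llama.cpp).
    "loraPath": "/path/to/lora-adapter",  # hypothetical adapter path for the LoRA side
}

resp = requests.post(API_URL, data=form, timeout=300)
resp.raise_for_status()
print(resp.status_code, resp.text)

Passing the previous default explicitly (baseModel=meta-llama/Llama-3.1-8B-Instruct) should still use the in-process transformers model, since only 'gguf:'-prefixed identifiers are routed to llama.cpp.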
requirements.txt CHANGED
@@ -49,3 +49,6 @@ langchain>=0.2.6
 langchain-openai>=0.1.16
 langchain-community>=0.2.6
 langchain-experimental>=0.0.60
+
+# llama.cpp bindings for loading local GGUF (quantized Q4) models
+llama-cpp-python>=0.2.90
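
The new requirement is optional at runtime: api.py wraps import llama_cpp in try/except and falls back to the transformers path when the binding is missing or fails to load, so existing deployments keep working. A quick environment check (a sketch, not part of the commit):

# Mirrors the guarded import in api.py: if llama-cpp-python is unavailable,
# the GGUF branch is skipped and the compare endpoint uses the transformers model.
try:
    import llama_cpp
    print("llama-cpp-python available:", llama_cpp.__version__)
except Exception as exc:
    print("llama-cpp-python unavailable, GGUF base model disabled:", exc)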