sunheycho committed
Commit 3432460 · 1 Parent(s): 85d2710

feat(llama): add llama-cpp-python GGUF Q4 auto-download and set as default base model for LoRA compare; graceful fallback

Files changed (2):
  1. api.py +82 -3
  2. requirements.txt +3 -0
api.py CHANGED
@@ -21,6 +21,7 @@ import sys
 import requests
 import asyncio
 from threading import Thread
+import tempfile
 try:
     from openai import OpenAI
 except Exception as _e:
@@ -431,6 +432,68 @@ except Exception as e:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
 
+# llama.cpp (GGUF) support
+llama_cpp = None
+llama_cpp_model = None
+gguf_model_path = None
+try:
+    import llama_cpp as llama_cpp
+
+    def ensure_q4_gguf_model():
+        """Download a TinyLlama Q4 GGUF model if not present and return the local path."""
+        global gguf_model_path
+        cache_dir = os.path.join(tempfile.gettempdir(), "gguf_models")
+        os.makedirs(cache_dir, exist_ok=True)
+        # Use a small, permissively accessible TinyLlama GGUF
+        filename = "TinyLlama-1.1B-Chat-v1.0.Q4_K_M.gguf"
+        gguf_model_path = os.path.join(cache_dir, filename)
+        if not os.path.exists(gguf_model_path):
+            try:
+                url = (
+                    "https://huggingface.co/TinyLlama/"
+                    "TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/"
+                    + filename
+                )
+                print(f"[GGUF] Downloading model from {url} -> {gguf_model_path}")
+                with requests.get(url, stream=True, timeout=60) as r:
+                    r.raise_for_status()
+                    with open(gguf_model_path, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                print("[GGUF] Download complete")
+            except Exception as e:
+                print(f"[GGUF] Failed to download GGUF model: {e}")
+                return None
+        return gguf_model_path
+
+    def get_llama_cpp_model():
+        """Lazy-load llama.cpp model from local GGUF path."""
+        global llama_cpp_model
+        if llama_cpp_model is not None:
+            return llama_cpp_model
+        model_path = ensure_q4_gguf_model()
+        if not model_path:
+            return None
+        try:
+            print(f"[GGUF] Loading llama.cpp model: {model_path}")
+            llama_cpp_model = llama_cpp.Llama(
+                model_path=model_path,
+                n_ctx=4096,
+                n_threads=max(1, os.cpu_count() or 1),
+                n_gpu_layers=0,  # CPU-friendly default; adjust if GPU offload available
+                verbose=False,
+            )
+            print("[GGUF] llama.cpp model loaded")
+        except Exception as e:
+            print(f"[GGUF] Failed to load llama.cpp model: {e}")
+            llama_cpp_model = None
+        return llama_cpp_model
+except Exception as _e:
+    llama_cpp = None
+    llama_cpp_model = None
+    gguf_model_path = None
+
 # LLM model (using an open-access model instead of Llama 4 which requires authentication)
 llm_model = None
 llm_tokenizer = None
@@ -1277,7 +1340,8 @@ def start_llama_lora_compare():
     """Start a LoRA-vs-Base comparison session (text or image+text prompt)."""
     session_id = request.form.get('session_id') or str(uuid.uuid4())
     prompt = request.form.get('prompt', '')
-    base_model_id = request.form.get('baseModel', 'meta-llama/Llama-3.1-8B-Instruct')
+    # Default to local GGUF TinyLlama Q4 model via llama.cpp
+    base_model_id = request.form.get('baseModel', 'gguf:tinyllama-q4km')
     lora_path = request.form.get('loraPath', '')
     image_b64 = None
     if 'image' in request.files:
@@ -1314,12 +1378,27 @@ def start_llama_lora_compare():
     start_base = time.time()
     base_output = None
     try:
-        if llm_model is not None and llm_tokenizer is not None:
+        # If base_model_id indicates GGUF, use llama.cpp
+        if base_model_id.startswith('gguf:') and llama_cpp is not None:
+            model = get_llama_cpp_model()
+            if model is None:
+                raise RuntimeError('GGUF model unavailable')
+            # Simple chat-style prompt
+            prompt_text = f"You are a helpful assistant.\nUser: {full_prompt}\nAssistant:"
+            res = model(
+                prompt=prompt_text,
+                max_tokens=128,
+                temperature=0.7,
+                top_p=0.9,
+                stop=["User:", "\n\n"],
+            )
+            text = res.get('choices', [{}])[0].get('text', '').strip()
+            base_output = text or ""
+        elif llm_model is not None and llm_tokenizer is not None:
            inputs = llm_tokenizer(full_prompt, return_tensors='pt').to(device)
            with torch.no_grad():
                out = llm_model.generate(**inputs, max_new_tokens=128, temperature=0.7, top_p=0.9)
            text = llm_tokenizer.decode(out[0], skip_special_tokens=True)
-           # strip prompt prefix
            if text.startswith(full_prompt):
                text = text[len(full_prompt):].strip()
            base_output = text
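
How a client exercises the new default (a sketch, not part of the commit): omitting baseModel from the form now selects 'gguf:tinyllama-q4km', so the base side of the comparison runs on the auto-downloaded TinyLlama Q4_K_M GGUF through llama.cpp, while non-GGUF identifiers still fall through to the transformers path. The host and route below are assumptions for illustration; the form fields (session_id, prompt, baseModel, loraPath, image) are the ones read in start_llama_lora_compare.

import requests

# Hypothetical host/route; the Flask route bound to start_llama_lora_compare
# is not shown in this diff.
API_URL = "http://localhost:5000/llama/lora-compare"

form = {
    "prompt": "Summarize the benefits of LoRA fine-tuning in two sentences.",
    # "baseModel" omitted on purpose: the handler now defaults to
    # 'gguf:tinyllama-q4km' (the TinyLlama Q4_K_M GGUF loaded via llama.cpp).
    "loraPath": "/path/to/lora-adapter",  # hypothetical adapter path for the LoRA side
}

resp = requests.post(API_URL, data=form, timeout=300)
resp.raise_for_status()
print(resp.status_code, resp.text)

Passing the previous default explicitly (baseModel=meta-llama/Llama-3.1-8B-Instruct) should still use the in-process transformers model, since only 'gguf:'-prefixed identifiers are routed to llama.cpp.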
requirements.txt CHANGED
@@ -49,3 +49,6 @@ langchain>=0.2.6
 langchain-openai>=0.1.16
 langchain-community>=0.2.6
 langchain-experimental>=0.0.60
+
+# llama.cpp bindings for loading local GGUF (quantized Q4) models
+llama-cpp-python>=0.2.90
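
The new requirement is optional at runtime: api.py wraps import llama_cpp in try/except and falls back to the transformers path when the binding is missing or fails to load, so existing deployments keep working. A quick environment check (a sketch, not part of the commit):

# Mirrors the guarded import in api.py: if llama-cpp-python is unavailable,
# the GGUF branch is skipped and the compare endpoint uses the transformers model.
try:
    import llama_cpp
    print("llama-cpp-python available:", llama_cpp.__version__)
except Exception as exc:
    print("llama-cpp-python unavailable, GGUF base model disabled:", exc)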