sunheycho committed
Commit 3432460 · Parent(s): 85d2710
feat(llama): add llama-cpp-python GGUF Q4 auto-download and set as default base model for LoRA compare; graceful fallback

Files changed:
- api.py (+82, -3)
- requirements.txt (+3, -0)
api.py
CHANGED
@@ -21,6 +21,7 @@ import sys
 import requests
 import asyncio
 from threading import Thread
+import tempfile
 try:
     from openai import OpenAI
 except Exception as _e:
@@ -431,6 +432,68 @@ except Exception as e:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
 
+# llama.cpp (GGUF) support
+llama_cpp = None
+llama_cpp_model = None
+gguf_model_path = None
+try:
+    import llama_cpp as llama_cpp
+
+    def ensure_q4_gguf_model():
+        """Download a TinyLlama Q4 GGUF model if not present and return the local path."""
+        global gguf_model_path
+        cache_dir = os.path.join(tempfile.gettempdir(), "gguf_models")
+        os.makedirs(cache_dir, exist_ok=True)
+        # Use a small, permissively accessible TinyLlama GGUF
+        filename = "TinyLlama-1.1B-Chat-v1.0.Q4_K_M.gguf"
+        gguf_model_path = os.path.join(cache_dir, filename)
+        if not os.path.exists(gguf_model_path):
+            try:
+                url = (
+                    "https://huggingface.co/TinyLlama/"
+                    "TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/"
+                    + filename
+                )
+                print(f"[GGUF] Downloading model from {url} -> {gguf_model_path}")
+                with requests.get(url, stream=True, timeout=60) as r:
+                    r.raise_for_status()
+                    with open(gguf_model_path, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                print("[GGUF] Download complete")
+            except Exception as e:
+                print(f"[GGUF] Failed to download GGUF model: {e}")
+                return None
+        return gguf_model_path
+
+    def get_llama_cpp_model():
+        """Lazy-load llama.cpp model from local GGUF path."""
+        global llama_cpp_model
+        if llama_cpp_model is not None:
+            return llama_cpp_model
+        model_path = ensure_q4_gguf_model()
+        if not model_path:
+            return None
+        try:
+            print(f"[GGUF] Loading llama.cpp model: {model_path}")
+            llama_cpp_model = llama_cpp.Llama(
+                model_path=model_path,
+                n_ctx=4096,
+                n_threads=max(1, os.cpu_count() or 1),
+                n_gpu_layers=0,  # CPU-friendly default; adjust if GPU offload available
+                verbose=False,
+            )
+            print("[GGUF] llama.cpp model loaded")
+        except Exception as e:
+            print(f"[GGUF] Failed to load llama.cpp model: {e}")
+            llama_cpp_model = None
+        return llama_cpp_model
+except Exception as _e:
+    llama_cpp = None
+    llama_cpp_model = None
+    gguf_model_path = None
+
 # LLM model (using an open-access model instead of Llama 4 which requires authentication)
 llm_model = None
 llm_tokenizer = None
@@ -1277,7 +1340,8 @@ def start_llama_lora_compare():
     """Start a LoRA-vs-Base comparison session (text or image+text prompt)."""
     session_id = request.form.get('session_id') or str(uuid.uuid4())
     prompt = request.form.get('prompt', '')
-
+    # Default to local GGUF TinyLlama Q4 model via llama.cpp
+    base_model_id = request.form.get('baseModel', 'gguf:tinyllama-q4km')
     lora_path = request.form.get('loraPath', '')
     image_b64 = None
     if 'image' in request.files:
@@ -1314,12 +1378,27 @@ def start_llama_lora_compare():
     start_base = time.time()
     base_output = None
     try:
-
+        # If base_model_id indicates GGUF, use llama.cpp
+        if base_model_id.startswith('gguf:') and llama_cpp is not None:
+            model = get_llama_cpp_model()
+            if model is None:
+                raise RuntimeError('GGUF model unavailable')
+            # Simple chat-style prompt
+            prompt_text = f"You are a helpful assistant.\nUser: {full_prompt}\nAssistant:"
+            res = model(
+                prompt=prompt_text,
+                max_tokens=128,
+                temperature=0.7,
+                top_p=0.9,
+                stop=["User:", "\n\n"],
+            )
+            text = res.get('choices', [{}])[0].get('text', '').strip()
+            base_output = text or ""
+        elif llm_model is not None and llm_tokenizer is not None:
             inputs = llm_tokenizer(full_prompt, return_tensors='pt').to(device)
             with torch.no_grad():
                 out = llm_model.generate(**inputs, max_new_tokens=128, temperature=0.7, top_p=0.9)
             text = llm_tokenizer.decode(out[0], skip_special_tokens=True)
-            # strip prompt prefix
            if text.startswith(full_prompt):
                 text = text[len(full_prompt):].strip()
             base_output = text
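Usage note (not part of the commit): the same download-load-generate flow can be exercised outside the Flask endpoint. The sketch below reuses the URL, cache location, and generation parameters from the diff above; the standalone script structure and the example prompt are assumptions for illustration only.

# Illustrative standalone sketch mirroring ensure_q4_gguf_model() / get_llama_cpp_model().
import os
import tempfile
import requests
from llama_cpp import Llama

filename = "TinyLlama-1.1B-Chat-v1.0.Q4_K_M.gguf"
url = "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/" + filename
cache_dir = os.path.join(tempfile.gettempdir(), "gguf_models")
os.makedirs(cache_dir, exist_ok=True)
model_path = os.path.join(cache_dir, filename)

if not os.path.exists(model_path):
    # Stream the multi-hundred-megabyte file to disk instead of holding it in memory.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(model_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=max(1, os.cpu_count() or 1),
    n_gpu_layers=0,   # CPU-only default, matching api.py
    verbose=False,
)

res = llm(
    prompt="You are a helpful assistant.\nUser: What is LoRA?\nAssistant:",
    max_tokens=128,
    temperature=0.7,
    top_p=0.9,
    stop=["User:", "\n\n"],
)
print(res["choices"][0]["text"].strip())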
requirements.txt
CHANGED
@@ -49,3 +49,6 @@ langchain>=0.2.6
 langchain-openai>=0.1.16
 langchain-community>=0.2.6
 langchain-experimental>=0.0.60
+
+# llama.cpp bindings for loading local GGUF (quantized Q4) models
+llama-cpp-python>=0.2.90
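Installing llama-cpp-python typically compiles the llama.cpp backend during pip install, so the import can still fail on constrained hosts; that is why api.py guards the import with try/except and falls back to the transformers path. A quick pre-flight check along these lines (illustrative, not part of the commit) confirms the bindings are usable before the Space starts:

# Illustrative check that llama-cpp-python installed and imports cleanly.
from importlib.metadata import version

try:
    import llama_cpp
    print("llama-cpp-python", version("llama-cpp-python"), "imported OK")
except Exception as e:
    # api.py degrades gracefully in this case and uses the transformers base model instead.
    print("llama.cpp bindings unavailable:", e)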