Update app.py
app.py
CHANGED
@@ -403,86 +403,153 @@
-import os
-import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
-MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-# Load once at startup
-print(f"🔧 Loading local model: {MODEL_ID}")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float32,  # CPU-friendly
-)
-model.eval()
-
-def build_prompt(system_message: str, history, user_msg: str) -> str:
-    """Try to use the model's chat template if present; otherwise use a generic prompt."""
-    messages = []
-    if system_message:
-        messages.append({"role": "system", "content": system_message})
-    for u, a in (history or []):
-        if u:
-            messages.append({"role": "user", "content": u})
-        if a:
-            messages.append({"role": "assistant", "content": a})
-    messages.append({"role": "user", "content": user_msg})
-
-    # Use chat template when available
-    try:
-        if getattr(tokenizer, "chat_template", None):
-            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    except Exception:
-        pass
-
-    # Fallback generic formatting
-    parts = []
-    if system_message:
-        parts.append(f"System: {system_message}")
-    for u, a in (history or []):
-        if u:
-            parts.append(f"User: {u}")
-        if a:
-            parts.append(f"Assistant: {a}")
-    parts.append(f"User: {user_msg}")
-    parts.append("Assistant:")
-    return "\n".join(parts)
-
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    prompt = build_prompt(system_message, history, message)
-    inputs = tokenizer(prompt, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=int(max_tokens),
-            do_sample=True,
-            temperature=float(temperature),
-            top_p=float(top_p),
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-    # Decode only the newly generated portion
-    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
-    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
-
-    # Stream the text in chunks so the UI feels live
-    acc = ""
-    for i in range(0, len(text), 40):
-        acc += text[i:i+40]
-        yield acc
-
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(
-            value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures (a) Hindu - e.g. Bhagwadgita, (b) Jewish, e.g. Torah, (c) Christian, e.g., Bible"
-                   ". You will offer all three perspectives. You decline answering other questions that do not relate to spirituality."),
-            label="System message",
-        ),
-        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+# import os
+# import gradio as gr
+# import torch
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
+# MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+# # Load once at startup
+# print(f"🔧 Loading local model: {MODEL_ID}")
+# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+# model = AutoModelForCausalLM.from_pretrained(
+#     MODEL_ID,
+#     torch_dtype=torch.float32,  # CPU-friendly
+# )
+# model.eval()
+
+# def build_prompt(system_message: str, history, user_msg: str) -> str:
+#     """Try to use the model's chat template if present; otherwise use a generic prompt."""
+#     messages = []
+#     if system_message:
+#         messages.append({"role": "system", "content": system_message})
+#     for u, a in (history or []):
+#         if u:
+#             messages.append({"role": "user", "content": u})
+#         if a:
+#             messages.append({"role": "assistant", "content": a})
+#     messages.append({"role": "user", "content": user_msg})
+
+#     # Use chat template when available
+#     try:
+#         if getattr(tokenizer, "chat_template", None):
+#             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+#     except Exception:
+#         pass
+
+#     # Fallback generic formatting
+#     parts = []
+#     if system_message:
+#         parts.append(f"System: {system_message}")
+#     for u, a in (history or []):
+#         if u:
+#             parts.append(f"User: {u}")
+#         if a:
+#             parts.append(f"Assistant: {a}")
+#     parts.append(f"User: {user_msg}")
+#     parts.append("Assistant:")
+#     return "\n".join(parts)
+
+# def respond(message, history, system_message, max_tokens, temperature, top_p):
+#     prompt = build_prompt(system_message, history, message)
+#     inputs = tokenizer(prompt, return_tensors="pt")
+#     with torch.no_grad():
+#         outputs = model.generate(
+#             **inputs,
+#             max_new_tokens=int(max_tokens),
+#             do_sample=True,
+#             temperature=float(temperature),
+#             top_p=float(top_p),
+#             pad_token_id=tokenizer.eos_token_id,
+#             eos_token_id=tokenizer.eos_token_id,
+#         )
+#     # Decode only the newly generated portion
+#     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
+#     text = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+#     # Stream the text in chunks so the UI feels live
+#     acc = ""
+#     for i in range(0, len(text), 40):
+#         acc += text[i:i+40]
+#         yield acc
+
+# demo = gr.ChatInterface(
+#     respond,
+#     additional_inputs=[
+#         gr.Textbox(
+#             value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures (a) Hindu - e.g. Bhagwadgita, (b) Jewish, e.g. Torah, (c) Christian, e.g., Bible"
+#                    ". You will offer all three perspectives. You decline answering other questions that do not relate to spirituality."),
+#             label="System message",
+#         ),
+#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+#     ],
+# )
+
+# if __name__ == "__main__":
+#     # share=True gives you a public link automatically
+#     demo.launch(share=True)
+
+
+import os
+import gradio as gr
+from llama_cpp import Llama
+
+# Small, fast, chat-tuned GGUF (≈0.5B params, 4-bit quant)
+REPO_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
+
+N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+CTX = int(os.getenv("CTX", "2048"))
+
+print(f"🔧 Loading {REPO_ID}/{FILENAME} with {N_THREADS} threads, ctx={CTX}")
+llm = Llama.from_pretrained(
+    repo_id=REPO_ID,
+    filename=FILENAME,
+    n_ctx=CTX,
+    n_threads=N_THREADS,
+    n_gpu_layers=0,  # CPU only
+    logits_all=False,
+    verbose=False,
+)
+
+SYSTEM_DEFAULT = (
+    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+    "and politely declines other questions."
+)
+
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    sysmsg = system_message or SYSTEM_DEFAULT
+    msgs = [{"role": "system", "content": sysmsg}]
+    for u, a in (history or []):
+        if u:
+            msgs.append({"role": "user", "content": u})
+        if a:
+            msgs.append({"role": "assistant", "content": a})
+    msgs.append({"role": "user", "content": message})
+
+    stream = llm.create_chat_completion(
+        messages=msgs,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        max_tokens=int(max_tokens),
+        stream=True,
+    )
+    acc = ""
+    for chunk in stream:
+        delta = chunk["choices"][0]["delta"]
+        tok = delta.get("content", "")
+        if tok:
+            acc += tok
+            yield acc
+
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
+        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
@@ -490,6 +557,7 @@ demo = gr.ChatInterface(
)

if __name__ == "__main__":
-    # share=True gives you a public link automatically
+    print(f"🧵 Threads: {N_THREADS}")
    demo.launch(share=True)

+
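A quick way to sanity-check the new streaming path outside the Space is to drive llama-cpp-python directly with the same call that respond() now makes. The sketch below is not part of this commit; it assumes llama-cpp-python and huggingface_hub are installed locally, and it simply prints each streamed delta as it arrives.

# Minimal local smoke test (hypothetical, not part of app.py).
# Mirrors the create_chat_completion(stream=True) loop introduced above.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="qwen2.5-0.5b-instruct-q4_k_m.gguf",
    n_ctx=2048,
    verbose=False,
)

stream = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You answer spiritual questions only."},
        {"role": "user", "content": "What does the Bhagwadgita say about duty?"},
    ],
    max_tokens=64,
    stream=True,
)

# Chunks are OpenAI-style; the first delta may carry only the role, so empty
# content is skipped here, matching the `if tok:` guard in respond().
for chunk in stream:
    piece = chunk["choices"][0]["delta"].get("content", "")
    if piece:
        print(piece, end="", flush=True)
print()

Since app.py no longer imports torch or transformers, the Space's requirements presumably shrink to gradio, llama-cpp-python, and huggingface_hub (the latter is what Llama.from_pretrained uses to download the GGUF file). requirements.txt is not part of this diff, so treat that list as an assumption.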