Update app.py
app.py
CHANGED
@@ -403,86 +403,153 @@



-import os
-import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
-MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-# Load once at startup
-print(f"🔧 Loading local model: {MODEL_ID}")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float32,  # CPU-friendly
-)
-model.eval()
-
-def build_prompt(system_message: str, history, user_msg: str) -> str:
-    """Try to use the model's chat template if present; otherwise use a generic prompt."""
-    messages = []
-    if system_message:
-        messages.append({"role": "system", "content": system_message})
-    for u, a in (history or []):
-        if u:
-            messages.append({"role": "user", "content": u})
-        if a:
-            messages.append({"role": "assistant", "content": a})
-    messages.append({"role": "user", "content": user_msg})
-
-    # Use chat template when available
-    try:
-        if getattr(tokenizer, "chat_template", None):
-            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    except Exception:
-        pass
-
-    # Fallback generic formatting
-    parts = []
-    if system_message:
-        parts.append(f"System: {system_message}")
-    for u, a in (history or []):
-        if u:
-            parts.append(f"User: {u}")
-        if a:
-            parts.append(f"Assistant: {a}")
-    parts.append(f"User: {user_msg}")
-    parts.append("Assistant:")
-    return "\n".join(parts)
-
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    prompt = build_prompt(system_message, history, message)
-    inputs = tokenizer(prompt, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=int(max_tokens),
-            do_sample=True,
-            temperature=float(temperature),
-            top_p=float(top_p),
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-    # Decode only the newly generated portion
-    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
-    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
-
-    # Stream the text in chunks so the UI feels live
-    acc = ""
-    for i in range(0, len(text), 40):
-        acc += text[i:i+40]
-        yield acc
-
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(
-            value=("You are a Chatbot who only answers spiritual questions based on three religiousscriptures (a) Hindu - e.g.Bhagwadgita, (b) Jewish, e.g. Torah, (c) Christian, e.g., Bible"
-                   ". You will ffer all three perspectives. You decline answering other questions that do not relate to spirituality."),
-            label="System message",
-        ),
-        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+# import os
+# import gradio as gr
+# import torch
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
+# MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+# # Load once at startup
+# print(f"🔧 Loading local model: {MODEL_ID}")
+# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+# model = AutoModelForCausalLM.from_pretrained(
+#     MODEL_ID,
+#     torch_dtype=torch.float32,  # CPU-friendly
+# )
+# model.eval()
+
+# def build_prompt(system_message: str, history, user_msg: str) -> str:
+#     """Try to use the model's chat template if present; otherwise use a generic prompt."""
+#     messages = []
+#     if system_message:
+#         messages.append({"role": "system", "content": system_message})
+#     for u, a in (history or []):
+#         if u:
+#             messages.append({"role": "user", "content": u})
+#         if a:
+#             messages.append({"role": "assistant", "content": a})
+#     messages.append({"role": "user", "content": user_msg})
+
+#     # Use chat template when available
+#     try:
+#         if getattr(tokenizer, "chat_template", None):
+#             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+#     except Exception:
+#         pass
+
+#     # Fallback generic formatting
+#     parts = []
+#     if system_message:
+#         parts.append(f"System: {system_message}")
+#     for u, a in (history or []):
+#         if u:
+#             parts.append(f"User: {u}")
+#         if a:
+#             parts.append(f"Assistant: {a}")
+#     parts.append(f"User: {user_msg}")
+#     parts.append("Assistant:")
+#     return "\n".join(parts)
+
+# def respond(message, history, system_message, max_tokens, temperature, top_p):
+#     prompt = build_prompt(system_message, history, message)
+#     inputs = tokenizer(prompt, return_tensors="pt")
+#     with torch.no_grad():
+#         outputs = model.generate(
+#             **inputs,
+#             max_new_tokens=int(max_tokens),
+#             do_sample=True,
+#             temperature=float(temperature),
+#             top_p=float(top_p),
+#             pad_token_id=tokenizer.eos_token_id,
+#             eos_token_id=tokenizer.eos_token_id,
+#         )
+#     # Decode only the newly generated portion
+#     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
+#     text = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+#     # Stream the text in chunks so the UI feels live
+#     acc = ""
+#     for i in range(0, len(text), 40):
+#         acc += text[i:i+40]
+#         yield acc
+
+# demo = gr.ChatInterface(
+#     respond,
+#     additional_inputs=[
+#         gr.Textbox(
+#             value=("You are a Chatbot who only answers spiritual questions based on three religiousscriptures (a) Hindu - e.g.Bhagwadgita, (b) Jewish, e.g. Torah, (c) Christian, e.g., Bible"
+#                    ". You will ffer all three perspectives. You decline answering other questions that do not relate to spirituality."),
+#             label="System message",
+#         ),
+#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+#     ],
+# )
+
+# if __name__ == "__main__":
+#     # share=True gives you a public link automatically
+#     demo.launch(share=True)
+
+
+import os
+import gradio as gr
+from llama_cpp import Llama
+
+# Small, fast, chat-tuned GGUF (≈0.5B params, 4-bit quant)
+REPO_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
+
+N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+CTX = int(os.getenv("CTX", "2048"))
+
+print(f"🔧 Loading {REPO_ID}/{FILENAME} with {N_THREADS} threads, ctx={CTX}")
+llm = Llama.from_pretrained(
+    repo_id=REPO_ID,
+    filename=FILENAME,
+    n_ctx=CTX,
+    n_threads=N_THREADS,
+    n_gpu_layers=0,  # CPU only
+    logits_all=False,
+    verbose=False,
+)
+
+SYSTEM_DEFAULT = (
+    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+    "and politely decline other questions."
+)
+
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    sysmsg = system_message or SYSTEM_DEFAULT
+    msgs = [{"role": "system", "content": sysmsg}]
+    for u, a in (history or []):
+        if u:
+            msgs.append({"role": "user", "content": u})
+        if a:
+            msgs.append({"role": "assistant", "content": a})
+    msgs.append({"role": "user", "content": message})
+
+    stream = llm.create_chat_completion(
+        messages=msgs,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        max_tokens=int(max_tokens),
+        stream=True,
+    )
+    acc = ""
+    for chunk in stream:
+        delta = chunk["choices"][0]["delta"]
+        tok = delta.get("content", "")
+        if tok:
+            acc += tok
+            yield acc
+
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
+        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
@@ -490,6 +557,7 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    # share=True gives you a public link automatically
+    print(f"🧵 Threads: {N_THREADS}")
     demo.launch(share=True)
 
+
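
For a quick local sanity check of this change, the sketch below (not part of the commit) exercises the same llama-cpp-python calls the new app.py relies on: Llama.from_pretrained to fetch the GGUF file and create_chat_completion with stream=True for token-by-token output. It assumes llama-cpp-python and huggingface_hub are installed and that the default MODEL_ID/MODEL_FILE values from this commit are reachable; the script name and test prompt are illustrative only.

# smoke_test.py - hypothetical helper; mirrors the defaults introduced in this commit
import os

from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id=os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF"),
    filename=os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf"),
    n_ctx=2048,
    n_threads=os.cpu_count() or 4,
    n_gpu_layers=0,  # CPU only, as in the Space
    verbose=False,
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},  # illustrative prompt
    {"role": "user", "content": "Say hello in one sentence."},
]

# Same streaming pattern as respond(): each chunk carries a small content delta.
for chunk in llm.create_chat_completion(messages=messages, max_tokens=64, stream=True):
    piece = chunk["choices"][0]["delta"].get("content", "")
    print(piece, end="", flush=True)
print()

If the Space's requirements.txt does not already list llama-cpp-python (and huggingface_hub, which Llama.from_pretrained uses for the download), it will need to be added alongside gradio for this commit to run.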