rajeshlion committed on
Commit bfed36f · verified · 1 Parent(s): dcf09e5

Update app.py

Files changed (1)
  1. app.py +137 -69
app.py CHANGED
@@ -403,86 +403,153 @@
  import os
  import gradio as gr
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
- MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
- # Load once at startup
- print(f"🔧 Loading local model: {MODEL_ID}")
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
- model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     torch_dtype=torch.float32,  # CPU-friendly
  )
- model.eval()

- def build_prompt(system_message: str, history, user_msg: str) -> str:
-     """Try to use the model's chat template if present; otherwise use a generic prompt."""
-     messages = []
-     if system_message:
-         messages.append({"role": "system", "content": system_message})
-     for u, a in (history or []):
-         if u:
-             messages.append({"role": "user", "content": u})
-         if a:
-             messages.append({"role": "assistant", "content": a})
-     messages.append({"role": "user", "content": user_msg})
-
-     # Use chat template when available
-     try:
-         if getattr(tokenizer, "chat_template", None):
-             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     except Exception:
-         pass
-
-     # Fallback generic formatting
-     parts = []
-     if system_message:
-         parts.append(f"System: {system_message}")
      for u, a in (history or []):
          if u:
-             parts.append(f"User: {u}")
          if a:
-             parts.append(f"Assistant: {a}")
-     parts.append(f"User: {user_msg}")
-     parts.append("Assistant:")
-     return "\n".join(parts)
-
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     prompt = build_prompt(system_message, history, message)
-     inputs = tokenizer(prompt, return_tensors="pt")
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=int(max_tokens),
-             do_sample=True,
-             temperature=float(temperature),
-             top_p=float(top_p),
-             pad_token_id=tokenizer.eos_token_id,
-             eos_token_id=tokenizer.eos_token_id,
-         )
-     # Decode only the newly generated portion
-     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
-     text = tokenizer.decode(gen_ids, skip_special_tokens=True)
-
-     # Stream the text in chunks so the UI feels live
      acc = ""
-     for i in range(0, len(text), 40):
-         acc += text[i:i+40]
-         yield acc

  demo = gr.ChatInterface(
      respond,
      additional_inputs=[
-         gr.Textbox(
-             value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita; (b) Jewish, e.g., the Torah; (c) Christian, e.g., the Bible. "
-                    "You will offer all three perspectives. You decline to answer other questions that do not relate to spirituality."),
-             label="System message",
-         ),
          gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
          gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
@@ -490,6 +557,7 @@ demo = gr.ChatInterface(
  )

  if __name__ == "__main__":
-     # share=True gives you a public link automatically
      demo.launch(share=True)

+ # import os
+ # import gradio as gr
+ # import torch
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
+ # MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+ # # Load once at startup
+ # print(f"🔧 Loading local model: {MODEL_ID}")
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     MODEL_ID,
+ #     torch_dtype=torch.float32,  # CPU-friendly
+ # )
+ # model.eval()
+
+ # def build_prompt(system_message: str, history, user_msg: str) -> str:
+ #     """Try to use the model's chat template if present; otherwise use a generic prompt."""
+ #     messages = []
+ #     if system_message:
+ #         messages.append({"role": "system", "content": system_message})
+ #     for u, a in (history or []):
+ #         if u:
+ #             messages.append({"role": "user", "content": u})
+ #         if a:
+ #             messages.append({"role": "assistant", "content": a})
+ #     messages.append({"role": "user", "content": user_msg})
+
+ #     # Use chat template when available
+ #     try:
+ #         if getattr(tokenizer, "chat_template", None):
+ #             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ #     except Exception:
+ #         pass
+
+ #     # Fallback generic formatting
+ #     parts = []
+ #     if system_message:
+ #         parts.append(f"System: {system_message}")
+ #     for u, a in (history or []):
+ #         if u:
+ #             parts.append(f"User: {u}")
+ #         if a:
+ #             parts.append(f"Assistant: {a}")
+ #     parts.append(f"User: {user_msg}")
+ #     parts.append("Assistant:")
+ #     return "\n".join(parts)
+
+ # def respond(message, history, system_message, max_tokens, temperature, top_p):
+ #     prompt = build_prompt(system_message, history, message)
+ #     inputs = tokenizer(prompt, return_tensors="pt")
+ #     with torch.no_grad():
+ #         outputs = model.generate(
+ #             **inputs,
+ #             max_new_tokens=int(max_tokens),
+ #             do_sample=True,
+ #             temperature=float(temperature),
+ #             top_p=float(top_p),
+ #             pad_token_id=tokenizer.eos_token_id,
+ #             eos_token_id=tokenizer.eos_token_id,
+ #         )
+ #     # Decode only the newly generated portion
+ #     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
+ #     text = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+ #     # Stream the text in chunks so the UI feels live
+ #     acc = ""
+ #     for i in range(0, len(text), 40):
+ #         acc += text[i:i+40]
+ #         yield acc
+
+ # demo = gr.ChatInterface(
+ #     respond,
+ #     additional_inputs=[
+ #         gr.Textbox(
+ #             value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita; (b) Jewish, e.g., the Torah; (c) Christian, e.g., the Bible. "
+ #                    "You will offer all three perspectives. You decline to answer other questions that do not relate to spirituality."),
+ #             label="System message",
+ #         ),
+ #         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+ #         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+ #         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+ #     ],
+ # )
+
+ # if __name__ == "__main__":
+ #     # share=True gives you a public link automatically
+ #     demo.launch(share=True)
+
+
  import os
  import gradio as gr
+ from llama_cpp import Llama
+
+ # Small, fast, chat-tuned GGUF (≈0.5B params, 4-bit quant)
+ REPO_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+ FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
+
+ N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+ CTX = int(os.getenv("CTX", "2048"))
+
+ print(f"🔧 Loading {REPO_ID}/{FILENAME} with {N_THREADS} threads, ctx={CTX}")
+ llm = Llama.from_pretrained(
+     repo_id=REPO_ID,
+     filename=FILENAME,
+     n_ctx=CTX,
+     n_threads=N_THREADS,
+     n_gpu_layers=0,  # CPU only
+     logits_all=False,
+     verbose=False,
  )

+ SYSTEM_DEFAULT = (
+     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+     "and politely declines other questions."
+ )
+
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
+     sysmsg = system_message or SYSTEM_DEFAULT
+     msgs = [{"role": "system", "content": sysmsg}]
      for u, a in (history or []):
          if u:
+             msgs.append({"role": "user", "content": u})
          if a:
+             msgs.append({"role": "assistant", "content": a})
+     msgs.append({"role": "user", "content": message})
+
+     stream = llm.create_chat_completion(
+         messages=msgs,
+         temperature=float(temperature),
+         top_p=float(top_p),
+         max_tokens=int(max_tokens),
+         stream=True,
+     )
      acc = ""
+     for chunk in stream:
+         delta = chunk["choices"][0]["delta"]
+         tok = delta.get("content", "")
+         if tok:
+             acc += tok
+             yield acc

  demo = gr.ChatInterface(
      respond,
      additional_inputs=[
+         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
          gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
          gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),

  )

  if __name__ == "__main__":
+     print(f"🧵 Threads: {N_THREADS}")
      demo.launch(share=True)

+
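
The new respond() consumes llama-cpp-python's streaming chat-completion API and re-yields the accumulated text to Gradio. Below is a minimal standalone sketch of that same pattern outside Gradio, assuming llama-cpp-python and huggingface_hub are installed (e.g., via the Space's requirements.txt); the repo and filename simply mirror the defaults in the new app.py.

# Sketch: consume the streaming chat completion that app.py's respond() relies on.
# Assumes `pip install llama-cpp-python huggingface_hub`; model names mirror app.py defaults.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="qwen2.5-0.5b-instruct-q4_k_m.gguf",
    n_ctx=2048,
    verbose=False,
)

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Say hello in one sentence."},
]

# With stream=True, each chunk's choices[0]["delta"] may carry a "content" piece;
# the Gradio generator accumulates these pieces and yields the growing string.
for chunk in llm.create_chat_completion(messages=messages, max_tokens=64, stream=True):
    piece = chunk["choices"][0]["delta"].get("content", "")
    print(piece, end="", flush=True)
print()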