# import gradio as gr
# from huggingface_hub import InferenceClient
# import spaces


# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]

#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})

#     messages.append({"role": "user", "content": message})

#     response = ""

#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content

#         response += token
#         yield response

# """
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
# """
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a Chatbot who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],

# )


# if __name__ == "__main__":
#     demo.launch()

# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError 

# MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# HF_TOKEN = os.getenv("HF_TOKEN")  # ⚠️ set this in Spaces → Settings → Secrets

# client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)


# def _build_zephyr_prompt(system_message: str, history, user_msg: str) -> str:
#     parts = []
#     if system_message:
#         parts.append(f"<|system|>\n{system_message}\n</s>")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"<|user|>\n{u}\n</s>")
#         if a:
#             parts.append(f"<|assistant|>\n{a}\n</s>")
#     parts.append(f"<|user|>\n{user_msg}\n</s>\n<|assistant|>\n")
#     return "\n".join(parts)


# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     # Early guardrails for missing token
#     if not HF_TOKEN:
#         yield (
#             "⚠️ Missing HF_TOKEN.\n\n"
#             "Set a Hugging Face access token in your Space:\n"
#             "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: <your token>\n"
#             "Token needs at least 'read' scope."
#         )
#         return

#     # Try OpenAI-like chat completion first
#     try:
#         response_text = ""
#         for chunk in client.chat_completion(
#             messages=(
#                 [{"role": "system", "content": system_message}] if system_message else []
#             )
#             + [
#                 msg
#                 for pair in (history or [])
#                 for msg in (
#                     [{"role": "user", "content": pair[0]}] if pair and pair[0] else []
#                 )
#                 + (
#                     [{"role": "assistant", "content": pair[1]}]
#                     if pair and len(pair) > 1 and pair[1]
#                     else []
#                 )
#             ]
#             + [{"role": "user", "content": message}],
#             max_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             token = getattr(chunk.choices[0].delta, "content", None)
#             if token:
#                 response_text += token
#                 yield response_text
#         return
#     except HfHubHTTPError as e:
#         # Handle 401 explicitly with helpful guidance
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from Hugging Face Inference API.\n\n"
#                 "Fix:\n"
#                 "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n"
#                 "2) In your Space, go to Settings → Repository secrets → Add secret\n"
#                 "   Name: HF_TOKEN, Value: <your token>\n"
#                 "3) Restart the Space.\n"
#             )
#             return
#         # Otherwise drop to fallback
#     except Exception:
#         pass

#     # Fallback: raw text_generation with Zephyr chat format
#     zephyr_prompt = _build_zephyr_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         # for tok in client.text_generation(
#         #     zephyr_prompt,
#         #     max_new_tokens=max_tokens,
#         #     temperature=temperature,
#         #     top_p=top_p,
#         #     stream=True,
#         #     stop=["</s>", "<|user|>", "<|assistant|>", "<|system|>"],
#         # ):

#         for tok in client.text_generation(
#             zephyr_prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):

#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text_generation fallback).\n\n"
#                 "Set HF_TOKEN in Space secrets (Settings → Repository secrets)."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"


# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=(
#                 "You are a Chatbot who only answers spiritual questions based "
#                 "on Indian scriptures and declines answering other questions."
#             ),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()

# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError  # correct import for 0.22.x

# # You can override with a Space secret: MODEL_ID=<your preferred model>
# PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta")

# # Accept either token name (matches your other Spaces)
# HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

# # If your preferred endpoint is down, we’ll try these in order:
# CANDIDATES = [
#     PREFERRED,
#     "google/gemma-2-2b-it",
#     "Qwen/Qwen2.5-1.5B-Instruct",
#     "tiiuae/falcon-7b-instruct",
# ]

# def _build_generic_prompt(system_message, history, user_msg):
#     """
#     Simple, model-agnostic chat prompt (works across many instruct models).
#     """
#     parts = []
#     if system_message:
#         parts.append(f"System: {system_message}")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"User: {u}")
#         if a:
#             parts.append(f"Assistant: {a}")
#     parts.append(f"User: {user_msg}")
#     parts.append("Assistant:")
#     return "\n".join(parts)

# def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#     """
#     Try streaming via chat_completions; on failure, fall back to text_generation.
#     Returns a generator that yields text chunks.
#     Raises ValueError('NEXT') to indicate “try next model”.
#     """
#     client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id)

#     # 1) Try chat-completions (if supported by the backend)
#     try:
#         msgs = (
#             [{"role": "system", "content": system_message}] if system_message else []
#         )
#         for u, a in (history or []):
#             if u:
#                 msgs.append({"role": "user", "content": u})
#             if a:
#                 msgs.append({"role": "assistant", "content": a})
#         msgs.append({"role": "user", "content": message})

#         def gen_chat():
#             response_text = ""
#             for chunk in client.chat_completion(
#                 messages=msgs,
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 top_p=top_p,
#                 stream=True,
#             ):
#                 token = getattr(chunk.choices[0].delta, "content", None)
#                 if token:
#                     response_text += token
#                     yield response_text

#         # sanity probe: start the generator and yield progressively
#         for out in gen_chat():
#             yield out
#         return
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from HF Inference API.\n\n"
#                 "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) "
#                 "in Space secrets, then restart."
#             )
#             return
#         if status == 403:
#             yield (
#                 "❌ 403 Forbidden from HF Inference API.\n\n"
#                 "This model likely requires Inference Providers + billing on your token. "
#                 "Either enable those or switch to a free hosted model using the MODEL_ID secret."
#             )
#             return
#         # fall through to text_generation for other statuses
#     except Exception:
#         # fall through to text_generation
#         pass

#     # 2) Fallback: plain text_generation with a generic prompt
#     prompt = _build_generic_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         for tok in client.text_generation(
#             prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg)
#             if any(s in tok for s in ["</s>", "<|user|>", "<|assistant|>", "<|system|>"]):
#                 break
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text-generation fallback).\n\n"
#                 "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart."
#             )
#         elif status == 403:
#             yield (
#                 "❌ 403 Forbidden (text-generation fallback).\n\n"
#                 "Your token lacks 'Use Inference API/Providers' or billing is not enabled. "
#                 "Enable those or use a free hosted model via MODEL_ID."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"

# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     last_error = None
#     tried = []

#     for model_id in [m for m in CANDIDATES if m]:
#         tried.append(model_id)
#         try:
#             for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#                 yield chunk
#             # If we streamed anything without raising, we’re done
#             return
#         except ValueError as ve:
#             if str(ve) == "NEXT":
#                 last_error = f"Model `{model_id}` endpoint unavailable (404)."
#                 continue
#             else:
#                 last_error = str(ve)
#         except Exception as e:
#             last_error = f"Unexpected error on `{model_id}`: {e}"

#     # If we got here, all candidates failed
#     tried_str = " → ".join(tried) if tried else "(none)"
#     yield (
#         "❌ All candidate models failed.\n\n"
#         f"Tried: {tried_str}\n\n"
#         f"Last error: {last_error or 'unknown'}\n\n"
#         "Fixes:\n"
#         "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n"
#         "• Or enable Inference Providers + billing on your HF token for models served via providers.\n"
#     )

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#                    "and declines answering other questions."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch(share=True)


# import os
# import gradio as gr
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
# MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# # Load once at startup
# print(f"🔧 Loading local model: {MODEL_ID}")
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.float32,     # CPU-friendly
# )
# model.eval()

# def build_prompt(system_message: str, history, user_msg: str) -> str:
#     """Try to use the model's chat template if present; otherwise use a generic prompt."""
#     messages = []
#     if system_message:
#         messages.append({"role": "system", "content": system_message})
#     for u, a in (history or []):
#         if u:
#             messages.append({"role": "user", "content": u})
#         if a:
#             messages.append({"role": "assistant", "content": a})
#     messages.append({"role": "user", "content": user_msg})

#     # Use chat template when available
#     try:
#         if getattr(tokenizer, "chat_template", None):
#             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     except Exception:
#         pass

#     # Fallback generic formatting
#     parts = []
#     if system_message:
#         parts.append(f"System: {system_message}")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"User: {u}")
#         if a:
#             parts.append(f"Assistant: {a}")
#     parts.append(f"User: {user_msg}")
#     parts.append("Assistant:")
#     return "\n".join(parts)

# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     prompt = build_prompt(system_message, history, message)
#     inputs = tokenizer(prompt, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=int(max_tokens),
#             do_sample=True,
#             temperature=float(temperature),
#             top_p=float(top_p),
#             pad_token_id=tokenizer.eos_token_id,
#             eos_token_id=tokenizer.eos_token_id,
#         )
#     # Decode only the newly generated portion
#     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
#     text = tokenizer.decode(gen_ids, skip_special_tokens=True)

#     # Stream the text in chunks so the UI feels live
#     acc = ""
#     for i in range(0, len(text), 40):
#         acc += text[i:i+40]
#         yield acc

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures (a) Hindu - e.g. Bhagavad Gita, (b) Jewish, e.g. Torah, (c) Christian, e.g., Bible"
#                    ". You will offer all three perspectives. You decline answering other questions that do not relate to spirituality."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     # share=True gives you a public link automatically
#     demo.launch(share=True)


import os
import gradio as gr
from llama_cpp import Llama

# Small, fast, chat-tuned GGUF (≈0.5B params, 4-bit quant)
REPO_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")

N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
CTX = int(os.getenv("CTX", "2048"))

print(f"🔧 Loading {REPO_ID}/{FILENAME} with {N_THREADS} threads, ctx={CTX}")
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME,
    n_ctx=CTX,
    n_threads=N_THREADS,
    n_gpu_layers=0,          # CPU only
    logits_all=False,
    verbose=False,
)

SYSTEM_DEFAULT = (
    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
    "and politely decline other questions."
)

def _content_to_text(content):
    """Flatten a messages-format ``content`` value into plain text.

    Returns the string itself for ``str`` content. For a list, concatenates
    the string items and the ``"text"`` field of dict items. Anything else
    (for example file or component payloads) yields ``""`` so the caller can
    skip it.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): some Gradio versions appear to deliver content as a
        # list of typed parts like {"type": "text", "text": ...} — confirm
        # against the installed Gradio version.
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return ""


def _history_to_messages(history):
    """Convert chat history into a list of ``{"role", "content"}`` dicts.

    Two history layouts are accepted:

    * pair style: each turn is a 2-item ``(user, assistant)`` sequence; falsy
      entries are skipped and truthy ones are passed through unchanged;
    * messages style: each turn is a dict with ``role`` and ``content`` keys;
      only ``user``/``assistant`` turns with non-empty text are kept.

    ``None`` or empty history produces an empty list.
    """
    converted = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Without this branch, unpacking a dict would hand back its KEYS
            # ("role", "content") as the chat text, or raise when the dict
            # carries extra keys.
            role = turn.get("role")
            text = _content_to_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                converted.append({"role": role, "content": text})
            continue

        user_text, assistant_text = turn
        if user_text:
            converted.append({"role": "user", "content": user_text})
        if assistant_text:
            converted.append({"role": "assistant", "content": assistant_text})
    return converted


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Stream a chat reply from the module-level ``llm`` instance.

    Args:
        message: The new user message.
        history: Previous turns, either ``(user, assistant)`` pairs or
            ``{"role": ..., "content": ...}`` dicts; may be ``None``/empty.
        system_message: System prompt; ``SYSTEM_DEFAULT`` is used when falsy.
        max_tokens: Generation limit, coerced to ``int``.
        temperature: Sampling temperature, coerced to ``float``.
        top_p: Nucleus sampling threshold, coerced to ``float``.

    Yields:
        The reply text accumulated so far, once for every streamed chunk
        that carries non-empty content.
    """
    msgs = [{"role": "system", "content": system_message or SYSTEM_DEFAULT}]
    msgs.extend(_history_to_messages(history))
    msgs.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=msgs,
        temperature=float(temperature),
        top_p=float(top_p),
        max_tokens=int(max_tokens),
        stream=True,
    )

    acc = ""
    for chunk in stream:
        # Chunks without text (missing or empty "content") are skipped.
        tok = chunk["choices"][0]["delta"].get("content")
        if tok:
            acc += tok
            yield acc

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    print(f"🧵 Threads: {N_THREADS}")
    demo.launch(share=True)