# import gradio as gr
# from huggingface_hub import InferenceClient
# import spaces

# """
# For more information on `huggingface_hub` Inference API support, please check the docs:
# https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """

# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})
#     messages.append({"role": "user", "content": message})
#
#     response = ""
#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content
#         response += token
#         yield response

# """
# For information on how to customize the ChatInterface, peruse the gradio docs:
# https://www.gradio.app/docs/chatinterface
# """

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a Chatbot who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()


# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError

# MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# HF_TOKEN = os.getenv("HF_TOKEN")  # ⚠️ set this in Spaces → Settings → Secrets

# client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)


# def _build_zephyr_prompt(system_message: str, history, user_msg: str) -> str:
#     parts = []
#     if system_message:
#         parts.append(f"<|system|>\n{system_message}\n")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"<|user|>\n{u}\n")
#         if a:
#             parts.append(f"<|assistant|>\n{a}\n")
#     parts.append(f"<|user|>\n{user_msg}\n\n<|assistant|>\n")
#     return "\n".join(parts)


# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     # Early guardrails for missing token
#     if not HF_TOKEN:
#         yield (
#             "⚠️ Missing HF_TOKEN.\n\n"
#             "Set a Hugging Face access token in your Space:\n"
#             "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: <your token>\n"
#             "Token needs at least 'read' scope."
#         )
#         return
#
#     # Try OpenAI-like chat completion first
#     try:
#         response_text = ""
#         for chunk in client.chat_completion(
#             messages=(
#                 [{"role": "system", "content": system_message}] if system_message else []
#             )
#             + [
#                 msg
#                 for pair in (history or [])
#                 for msg in (
#                     [{"role": "user", "content": pair[0]}] if pair and pair[0] else []
#                 )
#                 + (
#                     [{"role": "assistant", "content": pair[1]}]
#                     if pair and len(pair) > 1 and pair[1]
#                     else []
#                 )
#             ]
#             + [{"role": "user", "content": message}],
#             max_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             token = getattr(chunk.choices[0].delta, "content", None)
#             if token:
#                 response_text += token
#                 yield response_text
#         return
#     except HfHubHTTPError as e:
#         # Handle 401 explicitly with helpful guidance
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from Hugging Face Inference API.\n\n"
#                 "Fix:\n"
#                 "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n"
#                 "2) In your Space, go to Settings → Repository secrets → Add secret\n"
#                 "   Name: HF_TOKEN, Value: <your token>\n"
#                 "3) Restart the Space.\n"
#             )
#             return
#         # Otherwise drop to fallback
#     except Exception:
#         pass
#
#     # Fallback: raw text_generation with Zephyr chat format
#     zephyr_prompt = _build_zephyr_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         # for tok in client.text_generation(
#         #     zephyr_prompt,
#         #     max_new_tokens=max_tokens,
#         #     temperature=temperature,
#         #     top_p=top_p,
#         #     stream=True,
#         #     stop=["</s>", "<|user|>", "<|assistant|>", "<|system|>"],
#         # ):
#         for tok in client.text_generation(
#             zephyr_prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text_generation fallback).\n\n"
#                 "Set HF_TOKEN in Space secrets (Settings → Repository secrets)."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"


# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=(
#                 "You are a Chatbot who only answers spiritual questions based "
#                 "on Indian scriptures and declines answering other questions."
#             ),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()


# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError  # correct import for 0.22.x

# # You can override with a Space secret: MODEL_ID=<model_id>
# PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta")

# # Accept either token name (matches your other Spaces)
# HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

# # If your preferred endpoint is down, we’ll try these in order:
# CANDIDATES = [
#     PREFERRED,
#     "google/gemma-2-2b-it",
#     "Qwen/Qwen2.5-1.5B-Instruct",
#     "tiiuae/falcon-7b-instruct",
# ]


# def _build_generic_prompt(system_message, history, user_msg):
#     """
#     Simple, model-agnostic chat prompt (works across many instruct models).
#     """
#     parts = []
#     if system_message:
#         parts.append(f"System: {system_message}")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"User: {u}")
#         if a:
#             parts.append(f"Assistant: {a}")
#     parts.append(f"User: {user_msg}")
#     parts.append("Assistant:")
#     return "\n".join(parts)


# def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#     """
#     Try streaming via chat_completions; on failure, fall back to text_generation.
#     Returns a generator that yields text chunks.
#     Raises ValueError('NEXT') to indicate “try next model”.
#     """
#     client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id)
#
#     # 1) Try chat-completions (if supported by the backend)
#     try:
#         msgs = (
#             [{"role": "system", "content": system_message}] if system_message else []
#         )
#         for u, a in (history or []):
#             if u:
#                 msgs.append({"role": "user", "content": u})
#             if a:
#                 msgs.append({"role": "assistant", "content": a})
#         msgs.append({"role": "user", "content": message})
#
#         def gen_chat():
#             response_text = ""
#             for chunk in client.chat_completion(
#                 messages=msgs,
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 top_p=top_p,
#                 stream=True,
#             ):
#                 token = getattr(chunk.choices[0].delta, "content", None)
#                 if token:
#                     response_text += token
#                     yield response_text
#
#         # sanity probe: start the generator and yield progressively
#         for out in gen_chat():
#             yield out
#         return
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from HF Inference API.\n\n"
#                 "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) "
#                 "in Space secrets, then restart."
#             )
#             return
#         if status == 403:
#             yield (
#                 "❌ 403 Forbidden from HF Inference API.\n\n"
#                 "This model likely requires Inference Providers + billing on your token. "
#                 "Either enable those or switch to a free hosted model using the MODEL_ID secret."
#             )
#             return
#         # fall through to text_generation for other statuses
#     except Exception:
#         # fall through to text_generation
#         pass
#
#     # 2) Fallback: plain text_generation with a generic prompt
#     prompt = _build_generic_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         for tok in client.text_generation(
#             prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg)
#             if any(s in tok for s in ["</s>", "<|user|>", "<|assistant|>", "<|system|>"]):
#                 break
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text-generation fallback).\n\n"
#                 "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart."
#             )
#         elif status == 403:
#             yield (
#                 "❌ 403 Forbidden (text-generation fallback).\n\n"
#                 "Your token lacks 'Use Inference API/Providers' or billing is not enabled. "
#                 "Enable those or use a free hosted model via MODEL_ID."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"


# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     last_error = None
#     tried = []
#     for model_id in [m for m in CANDIDATES if m]:
#         tried.append(model_id)
#         try:
#             for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#                 yield chunk
#             # If we streamed anything without raising, we’re done
#             return
#         except ValueError as ve:
#             if str(ve) == "NEXT":
#                 last_error = f"Model `{model_id}` endpoint unavailable (404)."
#                 continue
#             else:
#                 last_error = str(ve)
#         except Exception as e:
#             last_error = f"Unexpected error on `{model_id}`: {e}"
#
#     # If we got here, all candidates failed
#     tried_str = " → ".join(tried) if tried else "(none)"
#     yield (
#         "❌ All candidate models failed.\n\n"
#         f"Tried: {tried_str}\n\n"
#         f"Last error: {last_error or 'unknown'}\n\n"
#         "Fixes:\n"
#         "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n"
#         "• Or enable Inference Providers + billing on your HF token for models served via providers.\n"
#     )


# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#                    "and declines answering other questions."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch(share=True)


import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Load once at startup
print(f"🔧 Loading local model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU-friendly
)
model.eval()


def build_prompt(system_message: str, history, user_msg: str) -> str:
    """Try to use the model's chat template if present; otherwise use a generic prompt."""
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for u, a in (history or []):
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": user_msg})

    # Use chat template when available
    try:
        if getattr(tokenizer, "chat_template", None):
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        pass

    # Fallback generic formatting
    parts = []
    if system_message:
        parts.append(f"System: {system_message}")
    for u, a in (history or []):
        if u:
            parts.append(f"User: {u}")
        if a:
            parts.append(f"Assistant: {a}")
    parts.append(f"User: {user_msg}")
    parts.append("Assistant:")
    return "\n".join(parts)


def respond(message, history, system_message, max_tokens, temperature, top_p):
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated portion
    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)

    # Stream the text back in 40-character chunks so the UI feels live.
    # (For true token-by-token streaming, see the commented TextIteratorStreamer
    # sketch at the end of this file.)
    acc = ""
    for i in range(0, len(text), 40):
        acc += text[i:i+40]
        yield acc


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are a spiritual assistant who only answers spiritual questions based on "
                "Indian Hindu scriptures (e.g., the Bhagavad Gita) and politely declines all "
                "other questions."
            ),
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    # share=True gives you a public link automatically
    demo.launch(share=True)


# import os
# import gradio as gr

# # ---- llama.cpp backend (fast CPU) ----
# from llama_cpp import Llama

# # ---- to list files in a repo and pick a GGUF automatically ----
# from huggingface_hub import list_repo_files

# # ----------------- Config -----------------
# # You can override these via Space "Settings → Variables"
# # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
# MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None

# # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
# CANDIDATE_REPOS = [
#     MODEL_REPO,  # user-preferred first (may be None)
#     "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
#     "Qwen/Qwen2-0.5B-Instruct-GGUF",
#     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
#     "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
# ]

# # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
# PREFERRED_PATTERNS = [
#     "q4_k_m.gguf", "Q4_K_M.gguf",
#     "q4_0.gguf", "Q4_0.gguf",
#     "q5_k_m.gguf", "Q5_K_M.gguf",
#     ".gguf",  # catch-all
# ]

# # Runtime knobs
# N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
# CTX = int(os.getenv("CTX", "2048"))

# SYSTEM_DEFAULT = (
#     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#     "and politely declines other questions."
# )


# # --------------- GGUF Picker ---------------
# def pick_repo_and_file():
#     """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
#     tried = []
#     for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
#         try:
#             files = list_repo_files(repo)
#         except Exception:
#             tried.append(f"{repo} (list failed)")
#             continue
#         ggufs = [f for f in files if f.lower().endswith(".gguf")]
#         if not ggufs:
#             tried.append(f"{repo} (no .gguf)")
#             continue
#         # pick by pattern preference
#         for pat in PREFERRED_PATTERNS:
#             for f in ggufs:
#                 if pat in f:
#                     return repo, f
#     tried_str = " | ".join(tried) if tried else "(none)"
#     raise RuntimeError(
#         "No GGUF file found in any candidate repo.\n"
#         f"Tried: {tried_str}\n"
#         "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
#         "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
#     )


# REPO_ID, FILENAME = pick_repo_and_file()
# print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")

# llm = Llama.from_pretrained(
#     repo_id=REPO_ID,
#     filename=FILENAME,
#     n_ctx=CTX,
#     n_threads=N_THREADS,
#     n_gpu_layers=0,  # CPU only
#     logits_all=False,
#     verbose=False,
# )


# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     sysmsg = system_message or SYSTEM_DEFAULT
#     msgs = [{"role": "system", "content": sysmsg}]
#     for u, a in (history or []):
#         if u:
#             msgs.append({"role": "user", "content": u})
#         if a:
#             msgs.append({"role": "assistant", "content": a})
#     msgs.append({"role": "user", "content": message})
#
#     stream = llm.create_chat_completion(
#         messages=msgs,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         max_tokens=int(max_tokens),
#         stream=True,
#     )
#     acc = ""
#     for chunk in stream:
#         delta = chunk["choices"][0]["delta"]
#         tok = delta.get("content", "")
#         if tok:
#             acc += tok
#             yield acc


# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     print(f"🧵 Threads: {N_THREADS}")
#     demo.launch(share=True)
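# ----------------------------------------------------------------------------
# Optional: true token-by-token streaming for the active transformers app above.
# This is a minimal sketch only (not wired into the UI); it assumes the same
# `tokenizer`, `model`, and `build_prompt` defined above, and the illustrative
# name `respond_streaming` is hypothetical. transformers' TextIteratorStreamer
# yields decoded text pieces while `model.generate` runs in a worker thread,
# which avoids the chunked pseudo-streaming used in `respond()`.
# ----------------------------------------------------------------------------
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
#     prompt = build_prompt(system_message, history, message)
#     inputs = tokenizer(prompt, return_tensors="pt")
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     gen_kwargs = dict(
#         **inputs,
#         max_new_tokens=int(max_tokens),
#         do_sample=True,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         pad_token_id=tokenizer.eos_token_id,
#         streamer=streamer,
#     )
#     # Run generation in a background thread so we can iterate over the streamer here.
#     Thread(target=model.generate, kwargs=gen_kwargs).start()
#     acc = ""
#     for piece in streamer:
#         acc += piece
#         yield acc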