# import gradio as gr
# from huggingface_hub import InferenceClient
# import spaces

# """
# For more information on `huggingface_hub` Inference API support, please check the docs:
# https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """

# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})
#     messages.append({"role": "user", "content": message})
#
#     response = ""
#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content
#         response += token
#         yield response

# """
# For information on how to customize the ChatInterface, peruse the gradio docs:
# https://www.gradio.app/docs/chatinterface
# """

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a Chatbot who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()


# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError

# MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# HF_TOKEN = os.getenv("HF_TOKEN")  # ⚠️ set this in Spaces → Settings → Secrets

# client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)


# def _build_zephyr_prompt(system_message: str, history, user_msg: str) -> str:
#     parts = []
#     if system_message:
#         parts.append(f"<|system|>\n{system_message}\n")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"<|user|>\n{u}\n")
#         if a:
#             parts.append(f"<|assistant|>\n{a}\n")
#     parts.append(f"<|user|>\n{user_msg}\n\n<|assistant|>\n")
#     return "\n".join(parts)


# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     # Early guardrails for missing token
#     if not HF_TOKEN:
#         yield (
#             "⚠️ Missing HF_TOKEN.\n\n"
#             "Set a Hugging Face access token in your Space:\n"
#             "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: <your token>\n"
#             "Token needs at least 'read' scope."
#         )
#         return
#
#     # Try OpenAI-like chat completion first
#     try:
#         response_text = ""
#         for chunk in client.chat_completion(
#             messages=(
#                 [{"role": "system", "content": system_message}] if system_message else []
#             )
#             + [
#                 msg
#                 for pair in (history or [])
#                 for msg in (
#                     [{"role": "user", "content": pair[0]}] if pair and pair[0] else []
#                 )
#                 + (
#                     [{"role": "assistant", "content": pair[1]}]
#                     if pair and len(pair) > 1 and pair[1]
#                     else []
#                 )
#             ]
#             + [{"role": "user", "content": message}],
#             max_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             token = getattr(chunk.choices[0].delta, "content", None)
#             if token:
#                 response_text += token
#                 yield response_text
#         return
#     except HfHubHTTPError as e:
#         # Handle 401 explicitly with helpful guidance
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from Hugging Face Inference API.\n\n"
#                 "Fix:\n"
#                 "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n"
#                 "2) In your Space, go to Settings → Repository secrets → Add secret\n"
#                 "   Name: HF_TOKEN, Value: <your token>\n"
#                 "3) Restart the Space.\n"
#             )
#             return
#         # Otherwise drop to fallback
#     except Exception:
#         pass
#
#     # Fallback: raw text_generation with Zephyr chat format
#     zephyr_prompt = _build_zephyr_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         # for tok in client.text_generation(
#         #     zephyr_prompt,
#         #     max_new_tokens=max_tokens,
#         #     temperature=temperature,
#         #     top_p=top_p,
#         #     stream=True,
#         #     stop=["</s>", "<|user|>", "<|assistant|>", "<|system|>"],
#         # ):
#         for tok in client.text_generation(
#             zephyr_prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text_generation fallback).\n\n"
#                 "Set HF_TOKEN in Space secrets (Settings → Repository secrets)."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"


# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=(
#                 "You are a Chatbot who only answers spiritual questions based "
#                 "on Indian scriptures and declines answering other questions."
#             ),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()


# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError  # correct import for 0.22.x

# # You can override with a Space secret: MODEL_ID=<model_id>
# PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta")

# # Accept either token name (matches your other Spaces)
# HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

# # If your preferred endpoint is down, we’ll try these in order:
# CANDIDATES = [
#     PREFERRED,
#     "google/gemma-2-2b-it",
#     "Qwen/Qwen2.5-1.5B-Instruct",
#     "tiiuae/falcon-7b-instruct",
# ]


# def _build_generic_prompt(system_message, history, user_msg):
#     """
#     Simple, model-agnostic chat prompt (works across many instruct models).
#     """
#     parts = []
#     if system_message:
#         parts.append(f"System: {system_message}")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"User: {u}")
#         if a:
#             parts.append(f"Assistant: {a}")
#     parts.append(f"User: {user_msg}")
#     parts.append("Assistant:")
#     return "\n".join(parts)


# def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#     """
#     Try streaming via chat_completions; on failure, fall back to text_generation.
#     Returns a generator that yields text chunks.
#     Raises ValueError('NEXT') to indicate “try next model”.
#     """
#     client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id)
#
#     # 1) Try chat-completions (if supported by the backend)
#     try:
#         msgs = (
#             [{"role": "system", "content": system_message}] if system_message else []
#         )
#         for u, a in (history or []):
#             if u:
#                 msgs.append({"role": "user", "content": u})
#             if a:
#                 msgs.append({"role": "assistant", "content": a})
#         msgs.append({"role": "user", "content": message})
#
#         def gen_chat():
#             response_text = ""
#             for chunk in client.chat_completion(
#                 messages=msgs,
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 top_p=top_p,
#                 stream=True,
#             ):
#                 token = getattr(chunk.choices[0].delta, "content", None)
#                 if token:
#                     response_text += token
#                     yield response_text
#
#         # sanity probe: start the generator and yield progressively
#         for out in gen_chat():
#             yield out
#         return
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from HF Inference API.\n\n"
#                 "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) "
#                 "in Space secrets, then restart."
#             )
#             return
#         if status == 403:
#             yield (
#                 "❌ 403 Forbidden from HF Inference API.\n\n"
#                 "This model likely requires Inference Providers + billing on your token. "
#                 "Either enable those or switch to a free hosted model using the MODEL_ID secret."
#             )
#             return
#         # fall through to text_generation for other statuses
#     except Exception:
#         # fall through to text_generation
#         pass
#
#     # 2) Fallback: plain text_generation with a generic prompt
#     prompt = _build_generic_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         for tok in client.text_generation(
#             prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg)
#             if any(s in tok for s in ["</s>", "<|user|>", "<|assistant|>", "<|system|>"]):
#                 break
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text-generation fallback).\n\n"
#                 "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart."
#             )
#         elif status == 403:
#             yield (
#                 "❌ 403 Forbidden (text-generation fallback).\n\n"
#                 "Your token lacks 'Use Inference API/Providers' or billing is not enabled. "
#                 "Enable those or use a free hosted model via MODEL_ID."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"


# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     last_error = None
#     tried = []
#     for model_id in [m for m in CANDIDATES if m]:
#         tried.append(model_id)
#         try:
#             for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#                 yield chunk
#             # If we streamed anything without raising, we’re done
#             return
#         except ValueError as ve:
#             if str(ve) == "NEXT":
#                 last_error = f"Model `{model_id}` endpoint unavailable (404)."
#                 continue
#             else:
#                 last_error = str(ve)
#         except Exception as e:
#             last_error = f"Unexpected error on `{model_id}`: {e}"
#
#     # If we got here, all candidates failed
#     tried_str = " → ".join(tried) if tried else "(none)"
#     yield (
#         "❌ All candidate models failed.\n\n"
#         f"Tried: {tried_str}\n\n"
#         f"Last error: {last_error or 'unknown'}\n\n"
#         "Fixes:\n"
#         "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n"
#         "• Or enable Inference Providers + billing on your HF token for models served via providers.\n"
#     )


# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#                    "and declines answering other questions."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch(share=True)


import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Load once at startup
print(f"🔧 Loading local model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU-friendly
)
model.eval()


def build_prompt(system_message: str, history, user_msg: str) -> str:
    """Try to use the model's chat template if present; otherwise use a generic prompt."""
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for u, a in (history or []):
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": user_msg})

    # Use chat template when available
    try:
        if getattr(tokenizer, "chat_template", None):
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        pass

    # Fallback generic formatting
    parts = []
    if system_message:
        parts.append(f"System: {system_message}")
    for u, a in (history or []):
        if u:
            parts.append(f"User: {u}")
        if a:
            parts.append(f"Assistant: {a}")
    parts.append(f"User: {user_msg}")
    parts.append("Assistant:")
    return "\n".join(parts)


def respond(message, history, system_message, max_tokens, temperature, top_p):
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated portion
    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)

    # Stream the text back in 40-character chunks so the UI feels live.
    # (For true token-by-token streaming, see the commented TextIteratorStreamer
    # sketch at the end of this file.)
    acc = ""
    for i in range(0, len(text), 40):
        acc += text[i:i+40]
        yield acc


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are a spiritual assistant who only answers spiritual questions based on "
                "Indian Hindu scriptures (e.g., the Bhagavad Gita) and politely declines all "
                "other questions."
            ),
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    # share=True gives you a public link automatically
    demo.launch(share=True)


# import os
# import gradio as gr

# # ---- llama.cpp backend (fast CPU) ----
# from llama_cpp import Llama

# # ---- to list files in a repo and pick a GGUF automatically ----
# from huggingface_hub import list_repo_files

# # ----------------- Config -----------------
# # You can override these via Space "Settings → Variables"
# # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
# MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None

# # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
# CANDIDATE_REPOS = [
#     MODEL_REPO,  # user-preferred first (may be None)
#     "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
#     "Qwen/Qwen2-0.5B-Instruct-GGUF",
#     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
#     "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
# ]

# # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
# PREFERRED_PATTERNS = [
#     "q4_k_m.gguf", "Q4_K_M.gguf",
#     "q4_0.gguf", "Q4_0.gguf",
#     "q5_k_m.gguf", "Q5_K_M.gguf",
#     ".gguf",  # catch-all
# ]

# # Runtime knobs
# N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
# CTX = int(os.getenv("CTX", "2048"))

# SYSTEM_DEFAULT = (
#     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#     "and politely declines other questions."
# )


# # --------------- GGUF Picker ---------------
# def pick_repo_and_file():
#     """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
#     tried = []
#     for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
#         try:
#             files = list_repo_files(repo)
#         except Exception:
#             tried.append(f"{repo} (list failed)")
#             continue
#         ggufs = [f for f in files if f.lower().endswith(".gguf")]
#         if not ggufs:
#             tried.append(f"{repo} (no .gguf)")
#             continue
#         # pick by pattern preference
#         for pat in PREFERRED_PATTERNS:
#             for f in ggufs:
#                 if pat in f:
#                     return repo, f
#     tried_str = " | ".join(tried) if tried else "(none)"
#     raise RuntimeError(
#         "No GGUF file found in any candidate repo.\n"
#         f"Tried: {tried_str}\n"
#         "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
#         "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
#     )


# REPO_ID, FILENAME = pick_repo_and_file()
# print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")

# llm = Llama.from_pretrained(
#     repo_id=REPO_ID,
#     filename=FILENAME,
#     n_ctx=CTX,
#     n_threads=N_THREADS,
#     n_gpu_layers=0,  # CPU only
#     logits_all=False,
#     verbose=False,
# )


# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     sysmsg = system_message or SYSTEM_DEFAULT
#     msgs = [{"role": "system", "content": sysmsg}]
#     for u, a in (history or []):
#         if u:
#             msgs.append({"role": "user", "content": u})
#         if a:
#             msgs.append({"role": "assistant", "content": a})
#     msgs.append({"role": "user", "content": message})
#
#     stream = llm.create_chat_completion(
#         messages=msgs,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         max_tokens=int(max_tokens),
#         stream=True,
#     )
#     acc = ""
#     for chunk in stream:
#         delta = chunk["choices"][0]["delta"]
#         tok = delta.get("content", "")
#         if tok:
#             acc += tok
#             yield acc


# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     print(f"🧵 Threads: {N_THREADS}")
#     demo.launch(share=True)
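# ----------------------------------------------------------------------------
# Optional: true token-by-token streaming for the active transformers app above.
# This is a minimal sketch only (not wired into the UI); it assumes the same
# `tokenizer`, `model`, and `build_prompt` defined above, and the illustrative
# name `respond_streaming` is hypothetical. transformers' TextIteratorStreamer
# yields decoded text pieces while `model.generate` runs in a worker thread,
# which avoids the chunked pseudo-streaming used in `respond()`.
# ----------------------------------------------------------------------------
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
#     prompt = build_prompt(system_message, history, message)
#     inputs = tokenizer(prompt, return_tensors="pt")
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     gen_kwargs = dict(
#         **inputs,
#         max_new_tokens=int(max_tokens),
#         do_sample=True,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         pad_token_id=tokenizer.eos_token_id,
#         streamer=streamer,
#     )
#     # Run generation in a background thread so we can iterate over the streamer here.
#     Thread(target=model.generate, kwargs=gen_kwargs).start()
#     acc = ""
#     for piece in streamer:
#         acc += piece
#         yield acc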