# import gradio as gr # from huggingface_hub import InferenceClient # import spaces # """ # For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference # """ # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta") # @spaces.GPU # def respond( # message, # history: list[tuple[str, str]], # system_message, # max_tokens, # temperature, # top_p, # ): # messages = [{"role": "system", "content": system_message}] # for val in history: # if val[0]: # messages.append({"role": "user", "content": val[0]}) # if val[1]: # messages.append({"role": "assistant", "content": val[1]}) # messages.append({"role": "user", "content": message}) # response = "" # for message in client.chat_completion( # messages, # max_tokens=max_tokens, # stream=True, # temperature=temperature, # top_p=top_p, # ): # token = message.choices[0].delta.content # response += token # yield response # """ # For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface # """ # demo = gr.ChatInterface( # respond, # additional_inputs=[ # gr.Textbox(value="You are a Chatbot who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"), # gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), # gr.Slider( # minimum=0.1, # maximum=1.0, # value=0.95, # step=0.05, # label="Top-p (nucleus sampling)", # ), # ], # ) # if __name__ == "__main__": # demo.launch() # import os # import gradio as gr # from huggingface_hub import InferenceClient # from huggingface_hub.utils import HfHubHTTPError # MODEL_ID = "HuggingFaceH4/zephyr-7b-beta" # HF_TOKEN = os.getenv("HF_TOKEN") # ⚠️ set this in Spaces → Settings → Secrets # client = InferenceClient(model=MODEL_ID, token=HF_TOKEN) # def 
_build_zephyr_prompt(system_message: str, history, user_msg: str) -> str: # parts = [] # if system_message: # parts.append(f"<|system|>\n{system_message}\n") # for u, a in (history or []): # if u: # parts.append(f"<|user|>\n{u}\n") # if a: # parts.append(f"<|assistant|>\n{a}\n") # parts.append(f"<|user|>\n{user_msg}\n\n<|assistant|>\n") # return "\n".join(parts) # def respond(message, history, system_message, max_tokens, temperature, top_p): # # Early guardrails for missing token # if not HF_TOKEN: # yield ( # "⚠️ Missing HF_TOKEN.\n\n" # "Set a Hugging Face access token in your Space:\n" # "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: \n" # "Token needs at least 'read' scope." # ) # return # # Try OpenAI-like chat completion first # try: # response_text = "" # for chunk in client.chat_completion( # messages=( # [{"role": "system", "content": system_message}] if system_message else [] # ) # + [ # msg # for pair in (history or []) # for msg in ( # [{"role": "user", "content": pair[0]}] if pair and pair[0] else [] # ) # + ( # [{"role": "assistant", "content": pair[1]}] # if pair and len(pair) > 1 and pair[1] # else [] # ) # ] # + [{"role": "user", "content": message}], # max_tokens=max_tokens, # temperature=temperature, # top_p=top_p, # stream=True, # ): # token = getattr(chunk.choices[0].delta, "content", None) # if token: # response_text += token # yield response_text # return # except HfHubHTTPError as e: # # Handle 401 explicitly with helpful guidance # try: # status = e.response.status_code # except Exception: # status = None # if status == 401: # yield ( # "❌ 401 Unauthorized from Hugging Face Inference API.\n\n" # "Fix:\n" # "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n" # "2) In your Space, go to Settings → Repository secrets → Add secret\n" # " Name: HF_TOKEN, Value: \n" # "3) Restart the Space.\n" # ) # return # # Otherwise drop to fallback # except Exception: # pass # # Fallback: raw 
text_generation with Zephyr chat format # zephyr_prompt = _build_zephyr_prompt(system_message, history, message) # try: # response_text = "" # # for tok in client.text_generation( # # zephyr_prompt, # # max_new_tokens=max_tokens, # # temperature=temperature, # # top_p=top_p, # # stream=True, # # stop=["", "<|user|>", "<|assistant|>", "<|system|>"], # # ): # for tok in client.text_generation( # zephyr_prompt, # max_new_tokens=max_tokens, # temperature=temperature, # top_p=top_p, # stream=True, # ): # if tok: # response_text += tok # yield response_text # except HfHubHTTPError as e: # try: # status = e.response.status_code # except Exception: # status = None # if status == 401: # yield ( # "❌ 401 Unauthorized (text_generation fallback).\n\n" # "Set HF_TOKEN in Space secrets (Settings → Repository secrets)." # ) # else: # yield f"[Inference error] {e}" # except Exception as e: # yield f"[Runtime error] {e}" # demo = gr.ChatInterface( # respond, # additional_inputs=[ # gr.Textbox( # value=( # "You are a Chatbot who only answers spiritual questions based " # "on Indian scriptures and declines answering other questions." 
# ), # label="System message", # ), # gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), # gr.Slider( # minimum=0.1, # maximum=1.0, # value=0.95, # step=0.05, # label="Top-p (nucleus sampling)", # ), # ], # ) # if __name__ == "__main__": # demo.launch() # import os # import gradio as gr # from huggingface_hub import InferenceClient # from huggingface_hub.utils import HfHubHTTPError # correct import for 0.22.x # # You can override with a Space secret: MODEL_ID= # PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta") # # Accept either token name (matches your other Spaces) # HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") # # If your preferred endpoint is down, we’ll try these in order: # CANDIDATES = [ # PREFERRED, # "google/gemma-2-2b-it", # "Qwen/Qwen2.5-1.5B-Instruct", # "tiiuae/falcon-7b-instruct", # ] # def _build_generic_prompt(system_message, history, user_msg): # """ # Simple, model-agnostic chat prompt (works across many instruct models). # """ # parts = [] # if system_message: # parts.append(f"System: {system_message}") # for u, a in (history or []): # if u: # parts.append(f"User: {u}") # if a: # parts.append(f"Assistant: {a}") # parts.append(f"User: {user_msg}") # parts.append("Assistant:") # return "\n".join(parts) # def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p): # """ # Try streaming via chat_completions; on failure, fall back to text_generation. # Returns a generator that yields text chunks. # Raises ValueError('NEXT') to indicate “try next model”. 
# """ # client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id) # # 1) Try chat-completions (if supported by the backend) # try: # msgs = ( # [{"role": "system", "content": system_message}] if system_message else [] # ) # for u, a in (history or []): # if u: # msgs.append({"role": "user", "content": u}) # if a: # msgs.append({"role": "assistant", "content": a}) # msgs.append({"role": "user", "content": message}) # def gen_chat(): # response_text = "" # for chunk in client.chat_completion( # messages=msgs, # max_tokens=max_tokens, # temperature=temperature, # top_p=top_p, # stream=True, # ): # token = getattr(chunk.choices[0].delta, "content", None) # if token: # response_text += token # yield response_text # # sanity probe: start the generator and yield progressively # for out in gen_chat(): # yield out # return # except HfHubHTTPError as e: # status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None # if status == 404: # # Endpoint not available for this model → try next candidate # raise ValueError("NEXT") # if status == 401: # yield ( # "❌ 401 Unauthorized from HF Inference API.\n\n" # "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) " # "in Space secrets, then restart." # ) # return # if status == 403: # yield ( # "❌ 403 Forbidden from HF Inference API.\n\n" # "This model likely requires Inference Providers + billing on your token. " # "Either enable those or switch to a free hosted model using the MODEL_ID secret." 
# ) # return # # fall through to text_generation for other statuses # except Exception: # # fall through to text_generation # pass # # 2) Fallback: plain text_generation with a generic prompt # prompt = _build_generic_prompt(system_message, history, message) # try: # response_text = "" # for tok in client.text_generation( # prompt, # max_new_tokens=max_tokens, # temperature=temperature, # top_p=top_p, # stream=True, # ): # # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg) # if any(s in tok for s in ["", "<|user|>", "<|assistant|>", "<|system|>"]): # break # if tok: # response_text += tok # yield response_text # except HfHubHTTPError as e: # status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None # if status == 404: # # Endpoint not available for this model → try next candidate # raise ValueError("NEXT") # if status == 401: # yield ( # "❌ 401 Unauthorized (text-generation fallback).\n\n" # "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart." # ) # elif status == 403: # yield ( # "❌ 403 Forbidden (text-generation fallback).\n\n" # "Your token lacks 'Use Inference API/Providers' or billing is not enabled. " # "Enable those or use a free hosted model via MODEL_ID." # ) # else: # yield f"[Inference error] {e}" # except Exception as e: # yield f"[Runtime error] {e}" # def respond(message, history, system_message, max_tokens, temperature, top_p): # last_error = None # tried = [] # for model_id in [m for m in CANDIDATES if m]: # tried.append(model_id) # try: # for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p): # yield chunk # # If we streamed anything without raising, we’re done # return # except ValueError as ve: # if str(ve) == "NEXT": # last_error = f"Model `{model_id}` endpoint unavailable (404)." 
# continue # else: # last_error = str(ve) # except Exception as e: # last_error = f"Unexpected error on `{model_id}`: {e}" # # If we got here, all candidates failed # tried_str = " → ".join(tried) if tried else "(none)" # yield ( # "❌ All candidate models failed.\n\n" # f"Tried: {tried_str}\n\n" # f"Last error: {last_error or 'unknown'}\n\n" # "Fixes:\n" # "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n" # "• Or enable Inference Providers + billing on your HF token for models served via providers.\n" # ) # demo = gr.ChatInterface( # respond, # additional_inputs=[ # gr.Textbox( # value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures " # "and declines answering other questions."), # label="System message", # ), # gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), # gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"), # ], # ) # if __name__ == "__main__": # demo.launch(share=True) # import os # import gradio as gr # import torch # from transformers import AutoTokenizer, AutoModelForCausalLM # # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.) 
# MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0") # # Load once at startup # print(f"🔧 Loading local model: {MODEL_ID}") # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True) # model = AutoModelForCausalLM.from_pretrained( # MODEL_ID, # torch_dtype=torch.float32, # CPU-friendly # ) # model.eval() # def build_prompt(system_message: str, history, user_msg: str) -> str: # """Try to use the model's chat template if present; otherwise use a generic prompt.""" # messages = [] # if system_message: # messages.append({"role": "system", "content": system_message}) # for u, a in (history or []): # if u: # messages.append({"role": "user", "content": u}) # if a: # messages.append({"role": "assistant", "content": a}) # messages.append({"role": "user", "content": user_msg}) # # Use chat template when available # try: # if getattr(tokenizer, "chat_template", None): # return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) # except Exception: # pass # # Fallback generic formatting # parts = [] # if system_message: # parts.append(f"System: {system_message}") # for u, a in (history or []): # if u: # parts.append(f"User: {u}") # if a: # parts.append(f"Assistant: {a}") # parts.append(f"User: {user_msg}") # parts.append("Assistant:") # return "\n".join(parts) # def respond(message, history, system_message, max_tokens, temperature, top_p): # prompt = build_prompt(system_message, history, message) # inputs = tokenizer(prompt, return_tensors="pt") # with torch.no_grad(): # outputs = model.generate( # **inputs, # max_new_tokens=int(max_tokens), # do_sample=True, # temperature=float(temperature), # top_p=float(top_p), # pad_token_id=tokenizer.eos_token_id, # eos_token_id=tokenizer.eos_token_id, # ) # # Decode only the newly generated portion # gen_ids = outputs[0][inputs["input_ids"].shape[1]:] # text = tokenizer.decode(gen_ids, skip_special_tokens=True) # # Stream the text in chunks so the UI feels live # acc = "" # 
#     for i in range(0, len(text), 40):
#         acc += text[i:i+40]
#         yield acc

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on three religiousscriptures (a) Hindu - e.g.Bhagwadgita, (b) Jewish, e.g. Torah, (c) Christian, e.g., Bible"
#                    ". You will ffer all three perspectives. You decline answering other questions that do not relate to spirituality."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     # share=True gives you a public link automatically
#     demo.launch(share=True)

import os

import gradio as gr
from llama_cpp import Llama

# Small, fast, chat-tuned GGUF (≈0.5B params, 4-bit quant).
# All four settings can be overridden through environment variables.
REPO_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
CTX = int(os.getenv("CTX", "2048"))

# The model is loaded once, at import time, and shared by every request.
print(f"🔧 Loading {REPO_ID}/{FILENAME} with {N_THREADS} threads, ctx={CTX}")
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME,
    n_ctx=CTX,
    n_threads=N_THREADS,
    n_gpu_layers=0,  # CPU only
    logits_all=False,
    verbose=False,
)

# Used both as the initial value of the UI textbox and as the fallback when
# the user clears that textbox.
SYSTEM_DEFAULT = (
    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
    "and politely decline other questions."
)


def _history_to_messages(history):
    """Convert a Gradio chat history into role/content message dicts.

    Two history layouts are accepted:

    * pair format: ``[(user_text, assistant_text), ...]``
    * messages format: ``[{"role": ..., "content": ...}, ...]``

    Args:
        history: The history object handed to the chat callback; may be
            ``None`` or empty.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dicts containing only
        the non-empty user/assistant turns, in their original order.
    """
    msgs = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Messages format. Unpacking a dict as a pair would yield its
            # *keys* ("role", "content") instead of the conversation text.
            role = turn.get("role")
            content = turn.get("content")
            # Only plain-text user/assistant turns are forwarded; non-string
            # content (e.g. file payloads) is skipped.
            if role in ("user", "assistant") and isinstance(content, str) and content:
                msgs.append({"role": role, "content": content})
            continue

        # Pair format: one (user, assistant) exchange per entry.
        user_text, assistant_text = turn
        if user_text:
            msgs.append({"role": "user", "content": user_text})
        if assistant_text:
            msgs.append({"role": "assistant", "content": assistant_text})
    return msgs


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Stream a chat reply from the local llama.cpp model.

    Args:
        message: The new user message.
        history: Previous turns, in either pair or messages format
            (see ``_history_to_messages``).
        system_message: System prompt; ``SYSTEM_DEFAULT`` is used when empty.
        max_tokens: Maximum number of tokens to generate (coerced to ``int``).
        temperature: Sampling temperature (coerced to ``float``).
        top_p: Nucleus-sampling threshold (coerced to ``float``).

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text, so the UI can re-render the growing answer.
    """
    msgs = [{"role": "system", "content": system_message or SYSTEM_DEFAULT}]
    msgs.extend(_history_to_messages(history))
    msgs.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=msgs,
        temperature=float(temperature),
        top_p=float(top_p),
        max_tokens=int(max_tokens),
        stream=True,
    )

    acc = ""
    for chunk in stream:
        # Some chunks carry only a role or a finish marker and no "content";
        # those are skipped.
        tok = chunk["choices"][0]["delta"].get("content")
        if tok:
            acc += tok
            yield acc


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    print(f"🧵 Threads: {N_THREADS}")
    demo.launch(share=True)