# app.py — Corrected for Hugging Face ZeroGPU Spaces # --------------------------------------------------------------- # This version is adapted for the ZeroGPU environment by using # the @spaces.GPU decorator. # --------------------------------------------------------------- import os import torch import gradio as gr from transformers import AutoModelForCausalLM, AutoTokenizer import spaces # 1. Import the spaces library IS_CUDA = torch.cuda.is_available() IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False if IS_ZEROGPU: torch.compiler.set_stance("force_eager") torch.set_float32_matmul_precision("high") torch.backends.cuda.matmul.allow_tf32 = True # ── Configuration ──────────────────────────────────────────────────────────── MODEL_ID = "Reubencf/gemma3-konkani" HF_TOKEN = os.getenv("HF_TOKEN", None) TITLE = "Konkani LLM Fine Tuned on Gemma 3" DESCRIPTION = ( "Version 1 of the Konkani LLM.\n" "This release may contain inconsistencies, but improvements will follow in future updates." ) # ── Loading ────────────────────────────────────────────────────────────────── print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...") def load_model(): try: tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN) kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {} model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32, device_map="auto", token=HF_TOKEN, **kwargs) print("[Init] Model loaded successfully.") return model, tokenizer except Exception as e: # If model loading fails, we can't proceed. print(f"[Fatal] Could not load model: {e}") raise Exception(f"❌ Model failed to load: {e}") model, tokenizer = load_model() DEF_TOKENS = 256 DEF_TEMPERATURE = 0.7 DEF_TOPK = 50 DEF_TOPP = 0.95 DEF_DURATION = 10 def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION): return int(duration if duration is not None else DEF_DURATION) # ── Generation Function ────────────────────────────────────────────────────── @spaces.GPU(duration=get_duration) # 2. Decorate the function that needs the GPU @torch.inference_mode() def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION): """ This function is called for each user message. The @spaces.GPU decorator ensures a GPU is allocated when this runs. """ try: # Format the conversation history conversation = [] if system_message: conversation.append({"role": "system", "content": system_message}) for msg in history: # https://www.gradio.app/docs/gradio/chatbot if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue conversation.append({"role": msg["role"], "content": msg["content"]}) # Add the current user's message conversation.append({"role": "user", "content": message}) # Apply the chat template inputs = tokenizer.apply_chat_template( conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True, ).to(model.device) # Generate the response gen_kwargs = dict( input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_k=top_k, top_p=top_p, #eos_token_id=tokenizer.eos_token_id, #num_beams=1, output_scores=False, cache_implementation="static", # https://github.com/huggingface/transformers/issues/38501 ) outputs = model.generate(**gen_kwargs) # Extract only the newly generated text gen_ids = outputs[0][inputs["input_ids"].shape[-1]:] new_response = tokenizer.decode(gen_ids, skip_special_tokens=True) return new_response except Exception as e: print(f"Error: {e}") gr.Warning(f"Error: {e}") return "" # ── UI ──────────────────────────────────────────────────────────────────────── examples = [ ["Translate From English to Devnagri Konkani: what is color?"], ["घरांत विजेचो वापर उणो करपाची येवजण तयार करप."], ] demo = gr.ChatInterface( fn=generate_response, type="messages", title=TITLE, description=DESCRIPTION, examples=examples, cache_examples=True, theme="soft", additional_inputs=[ gr.Textbox(value="", label="System message"), gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"), gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"), gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"), gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"), gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration"), ], ) # ── Launch ──────────────────────────────────────────────────────────────────── if __name__ == "__main__": print("🚀 Starting Gradio app for ZeroGPU...") demo.queue().launch()