John6666 committed on
Commit f55fb49 Β· verified Β· 1 Parent(s): 4debea8

Upload 3 files

1.5x faster and fixes a small bug.

Files changed (3)
  1. README.md +0 -3
  2. app.py +93 -63
  3. requirements.txt +4 -3
README.md CHANGED
@@ -7,9 +7,6 @@ sdk: gradio
 sdk_version: 5.42.0
 app_file: app.py
 pinned: false
-hf_oauth: true
-hf_oauth_scopes:
-- inference-api
 license: apache-2.0
 short_description: Konkani LLM with Gemma 3
 ---
app.py CHANGED
@@ -6,97 +6,127 @@
 import os
 import torch
 import gradio as gr
-from transformers import pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces  # 1. Import the spaces library
 
-# ── Configuration ──────────────────────────────────────────────────────────────
+IS_CUDA = torch.cuda.is_available()
+IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
+if IS_ZEROGPU: torch.set_float32_matmul_precision("high")
+
+# ── Configuration ────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
-HF_TOKEN = os.getenv("HF_TOKEN")
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 
 TITLE = "Konkani LLM Fine Tuned on Gemma 3"
 DESCRIPTION = (
     "Version 1 of the Konkani LLM.\n"
     "This release may contain inconsistencies, but improvements will follow in future updates."
 )
-# We define the pipeline object globally but initialize it inside the function
-pipe = None
+
+# ── Loading ──────────────────────────────────────────────────────────────────
+print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
+def load_model():
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                     device_map="auto", token=HF_TOKEN)
+        print("[Init] Model loaded successfully.")
+        return model, tokenizer
+    except Exception as e:
+        # If model loading fails, we can't proceed.
+        print(f"[Fatal] Could not load model: {e}")
+        raise Exception(f"❌ Model failed to load: {e}")
+
+model, tokenizer = load_model()
+
+DEF_TOKENS = 256
+DEF_TEMPERATURE = 0.7
+DEF_TOPK = 50
+DEF_TOPP = 0.95
+DEF_DURATION = 59
+
+def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
+    return int(duration if duration is not None else DEF_DURATION)
 
 # ── Generation Function ──────────────────────────────────────────────────────
-@spaces.GPU(duration=120)  # 2. Decorate the function that needs the GPU
-def generate_response(message, history):
+@spaces.GPU(duration=get_duration)  # 2. Decorate the function that needs the GPU
+@torch.inference_mode()
+def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
     """
     This function is called for each user message.
     The @spaces.GPU decorator ensures a GPU is allocated when this runs.
     """
-    global pipe  # Use the global pipe variable
-
-    # 3. Load the model inside the decorated function
-    # This ensures the model is loaded only when a GPU is active.
-    # We check if it's already loaded to avoid reloading on every call.
-    if pipe is None:
-        print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
-        try:
-            pipe = pipeline(
-                "text-generation",
-                model=MODEL_ID,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",  # This will now correctly map to the allocated GPU
-                token=HF_TOKEN,
-            )
-            print("[Init] Model pipeline loaded successfully.")
-        except Exception as e:
-            # If model loading fails, we can't proceed.
-            print(f"[Fatal] Could not load model: {e}")
-            return f"❌ Model failed to load: {e}"
-
-    # Format the conversation history
-    conversation = []
-    for user_msg, assistant_msg in history:
-        conversation.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            conversation.append({"role": "assistant", "content": assistant_msg})
-
-    # Add the current user's message
-    conversation.append({"role": "user", "content": message})
-
-    # Apply the chat template
-    prompt = pipe.tokenizer.apply_chat_template(
-        conversation,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Generate the response
-    outputs = pipe(
-        prompt,
-        max_new_tokens=256,  # It's good practice to set a max token limit
-        do_sample=True,
-        temperature=0.7,
-        top_k=50,
-        top_p=0.95
-    )
-
-    # Extract only the newly generated text
-    response = outputs[0]["generated_text"]
-    new_response = response[len(prompt):].strip()
-
-    return new_response
+    try:
+        # Format the conversation history
+        conversation = []
+        if system_message: conversation.append({"role": "system", "content": system_message})
+        for msg in history:  # https://www.gradio.app/docs/gradio/chatbot
+            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue
+            conversation.append({"role": msg["role"], "content": msg["content"]})
+
+        # Add the current user's message
+        conversation.append({"role": "user", "content": message})
+
+        # Apply the chat template
+        inputs = tokenizer.apply_chat_template(
+            conversation,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            return_dict=True,
+        ).to(model.device)
+
+        # Generate the response
+        gen_kwargs = dict(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            #eos_token_id=tokenizer.eos_token_id,
+            #num_beams=1,
+            output_scores=False,
+            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
+        )
+        outputs = model.generate(**gen_kwargs)
+
+        # Extract only the newly generated text
+        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
+        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+        return new_response
+    except Exception as e:
+        print(f"Error: {e}")
+        gr.Warning(f"Error: {e}")
+        return ""
 
 # ── UI ────────────────────────────────────────────────────────────────────────
 examples = [
-    "Translate From English to Devnagri Konkani: what is color?",
-    "ΰ€˜ΰ€°ΰ€Ύΰ€‚ΰ€€ ΰ€΅ΰ€Ώΰ€œΰ₯‡ΰ€šΰ₯‹ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€‰ΰ€£ΰ₯‹ ΰ€•ΰ€°ΰ€ͺΰ€Ύΰ€šΰ₯€ ΰ€―ΰ₯‡ΰ€΅ΰ€œΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€•ΰ€°ΰ€ͺ.",
+    ["Translate From English to Devnagri Konkani: what is color?"],
+    ["ΰ€˜ΰ€°ΰ€Ύΰ€‚ΰ€€ ΰ€΅ΰ€Ώΰ€œΰ₯‡ΰ€šΰ₯‹ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€‰ΰ€£ΰ₯‹ ΰ€•ΰ€°ΰ€ͺΰ€Ύΰ€šΰ₯€ ΰ€―ΰ₯‡ΰ€΅ΰ€œΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€•ΰ€°ΰ€ͺ."],
 ]
 
 demo = gr.ChatInterface(
     fn=generate_response,
+    type="messages",
     title=TITLE,
     description=DESCRIPTION,
     examples=examples,
+    cache_examples=True,
     theme="soft",
+    additional_inputs=[
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
+        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration"),
+    ],
 )
 
 # ── Launch ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     print("πŸš€ Starting Gradio app for ZeroGPU...")
-    demo.launch()
+    demo.queue().launch()
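The "1.5x faster" claim maps to the changes above: the lazily initialized `pipeline(...)` is replaced by a module-level `AutoModelForCausalLM`/`AutoTokenizer` load, generation runs under `torch.inference_mode()`, and the KV cache is pre-allocated via `cache_implementation="static"`. Below is a minimal local sketch of that new generation path outside Gradio/ZeroGPU; the `smoke_test` helper name and the hard-coded sampling values are illustrative, not part of the commit, and it assumes you can download the checkpoint (add `token=...` if it is gated).

# smoke_test.py -- hypothetical local check, not part of the Space
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Reubencf/gemma3-konkani"  # same checkpoint the Space loads

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

@torch.inference_mode()  # skip autograd bookkeeping while decoding
def smoke_test(prompt: str, max_new_tokens: int = 64) -> str:
    conversation = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        cache_implementation="static",  # pre-allocated KV cache, as in the updated app.py
    )
    new_ids = outputs[0][inputs["input_ids"].shape[-1]:]  # keep only the generated tokens
    return tokenizer.decode(new_ids, skip_special_tokens=True)

if __name__ == "__main__":
    print(smoke_test("Translate From English to Devnagri Konkani: what is color?"))

On a machine without CUDA this falls back to float32, mirroring the `IS_CUDA` check in app.py; the ZeroGPU-specific pieces (`spaces.GPU`, `get_duration`) only matter inside the Space.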
requirements.txt CHANGED
@@ -1,6 +1,7 @@
+torch>=2.2
 transformers>=4.41
 peft>=0.11.0
 accelerate>=0.31.0
-gradio>=4.0,<5.0
-torch>=2.2 ; sys_platform != "darwin"
-bitsandbytes>=0.43.1 ; platform_system == "Linux"
+bitsandbytes>=0.43.1
+gradio>=4.0
+pydantic==2.10.6