Reubencf committed
Commit 6f40086 · verified · 1 Parent(s): b441326

Update app.py

Files changed (1): app.py +209 -87
app.py CHANGED
@@ -1,142 +1,264 @@
- # app.py — Hugging Face Spaces (Gradio) friendly

  import os
  import torch
  import gradio as gr
  from typing import List, Tuple

- from peft import PeftConfig, PeftModel, AutoPeftModelForCausalLM
- from transformers import AutoTokenizer, AutoModelForCausalLM

- # ── Secrets / IDs ──────────────────────────────────────────────────────────────
  HF_TOKEN = os.environ.get("HF_TOKEN")  # set in Space → Settings → Variables & secrets
  ADAPTER_ID = "Reubencf/gemma3-goan-finetuned"  # your LoRA adapter repo

  TITLE = "🌴 Gemma Goan Q&A Bot"
  DESCRIPTION = """
  Gemma-3-4B-Instruct base + LoRA adapter fine-tuned on a Goan Q&A dataset.
  Ask about Goa, Konkani culture, or general topics!

  **Adapter**: https://huggingface.co/Reubencf/gemma3-goan-finetuned
  """

- # ── Load model + tokenizer (robust, Space-safe) ────────────────────────────────
  def load_model_and_tokenizer():
-     # Read the exact base the adapter expects (prevents KeyError on wrong modules)
      peft_cfg = PeftConfig.from_pretrained(ADAPTER_ID, token=HF_TOKEN)
-     base_id = peft_cfg.base_model_name_or_path  # e.g. "google/gemma-3-4b-it"
-     print(f"[Load] Adapter expects base: {base_id}")
-
-     # Try the simple path first: AutoPEFT will fetch the correct base under the hood
      try:
-         model = AutoPeftModelForCausalLM.from_pretrained(
-             ADAPTER_ID,
-             token=HF_TOKEN,
-             trust_remote_code=True,
-             device_map="auto",  # Spaces CPU/GPU both OK
-             low_cpu_mem_usage=True,
-             torch_dtype=(torch.bfloat16 if torch.cuda.is_available() else torch.float32),
-         )
-         print("[Load] AutoPEFT model loaded")
-     except Exception as e:
-         # Fallback: explicit base + attach adapter (also works on CPU Spaces)
-         print(f"[Warn] AutoPEFT failed: {e} — trying explicit base + adapter...")
-         base = AutoModelForCausalLM.from_pretrained(
              base_id,
              token=HF_TOKEN,
              trust_remote_code=True,
-             device_map=None,  # explicit CPU (safer on free tier)
              low_cpu_mem_usage=True,
-             torch_dtype=torch.float32,
-         ).to("cpu")
          model = PeftModel.from_pretrained(
-             base,
              ADAPTER_ID,
              token=HF_TOKEN,
              trust_remote_code=True,
          )
-         print("[Load] Base+adapter attached")
-
-     # Always load tokenizer from the **base model**
-     tok = AutoTokenizer.from_pretrained(
          base_id,
          token=HF_TOKEN,
          use_fast=True,
          trust_remote_code=True,
      )
-     if tok.pad_token is None:
-         tok.pad_token = tok.eos_token
-     tok.padding_side = "right"
-
      model.eval()
-     return model, tok, base_id

- model, tokenizer, BASE_ID = load_model_and_tokenizer()
-
- # ── Generation using chat template ─────────────────────────────────────────────
- def _to_chat(history: List[Tuple[str, str]], message: str):
-     conv = []
-     if history:
-         for u, a in history:
-             if u:
-                 conv.append({"role": "user", "content": u})
-             if a:
-                 conv.append({"role": "assistant", "content": a})
-     conv.append({"role": "user", "content": message})
-     return conv

  def generate_response(
-     message,
-     history,
-     temperature=0.7,
-     max_new_tokens=256,
-     top_p=0.95,
-     repetition_penalty=1.1,
- ):
      try:
-         conv = _to_chat(history, message)
-         # Let Transformers format for Gemma's chat template
-         inputs = tokenizer.apply_chat_template(
-             conv, add_generation_prompt=True, return_tensors="pt"
          )
-         inputs = inputs.to(next(model.parameters()).device)
-
          with torch.no_grad():
-             out = model.generate(
-                 input_ids=inputs,
-                 max_new_tokens=int(max_new_tokens),
                  temperature=float(temperature),
                  top_p=float(top_p),
                  repetition_penalty=float(repetition_penalty),
                  do_sample=True,
                  pad_token_id=tokenizer.pad_token_id,
                  eos_token_id=tokenizer.eos_token_id,
              )
-
-         gen = out[0, inputs.shape[-1]:]
-         return tokenizer.decode(gen, skip_special_tokens=True).strip()
-
      except Exception as e:
-         return f"Error generating response: {e}"

- # ── Gradio app (Spaces auto-detects `demo`) ────────────────────────────────────
  examples = [
      ["What is the capital of Goa?"],
-     ["Tell me about the Konkani language."],
-     ["Famous beaches in Goa?"],
-     ["What is Goan fish curry?"],
-     ["Explain the history of Old Goa."],
  ]

- demo = gr.ChatInterface(
-     fn=generate_response,
-     title=TITLE,
-     description=DESCRIPTION,
-     examples=examples,
-     additional_inputs=[
-         gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="Temperature"),
-         gr.Slider(32, 512, value=256, step=16, label="Max new tokens"),
-         gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
-         gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repetition penalty"),
-     ],
- ).queue(concurrency_count=1, max_size=16)
+ # app.py — Optimized for Hugging Face Spaces Free Tier (CPU-only)

  import os
+ import gc
  import torch
  import gradio as gr
  from typing import List, Tuple

+ from peft import PeftConfig, PeftModel
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

+ # ── Configuration ──────────────────────────────────────────────────────────────
  HF_TOKEN = os.environ.get("HF_TOKEN")  # set in Space → Settings → Variables & secrets
  ADAPTER_ID = "Reubencf/gemma3-goan-finetuned"  # your LoRA adapter repo

+ # Free tier optimization flags
+ USE_8BIT = False     # Set to True if you have access to GPU tier
+ MAX_MEMORY = "15GB"  # Conservative for free tier
+ DEVICE = "cpu"       # Force CPU for free tier
+
  TITLE = "🌴 Gemma Goan Q&A Bot"
  DESCRIPTION = """
  Gemma-3-4B-Instruct base + LoRA adapter fine-tuned on a Goan Q&A dataset.
  Ask about Goa, Konkani culture, or general topics!

  **Adapter**: https://huggingface.co/Reubencf/gemma3-goan-finetuned
+
+ ⚠️ **Note**: Running on free tier (CPU). Responses may be slower. For faster inference, consider upgrading to GPU tier.
  """
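
Note on the USE_8BIT flag above: on a GPU tier, a 4-bit NF4 configuration is another option the same BitsAndBytesConfig import supports. A minimal sketch, with illustrative settings that are not part of this commit:

    # Hypothetical 4-bit alternative to USE_8BIT for a GPU tier (not used in this commit)
    import torch
    from transformers import BitsAndBytesConfig

    bnb_4bit = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    # Passed the same way as the 8-bit config:
    # AutoModelForCausalLM.from_pretrained(base_id, quantization_config=bnb_4bit, device_map="auto")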
 
+ # ── Load model + tokenizer (optimized for free tier) ───────────────────────────
  def load_model_and_tokenizer():
+     """Load model with memory optimizations for free tier"""
+
+     print("[Init] Starting model load for free tier...")
+
+     # Get the base model ID from adapter config
      peft_cfg = PeftConfig.from_pretrained(ADAPTER_ID, token=HF_TOKEN)
+     base_id = peft_cfg.base_model_name_or_path
+     print(f"[Load] Base model: {base_id}")
+
+     # Memory cleanup before loading
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
      try:
+         # Load base model with memory optimizations
+         print("[Load] Loading base model with CPU optimizations...")
+
+         # Quantization config (only if GPU available and enabled)
+         quantization_config = None
+         if USE_8BIT and torch.cuda.is_available():
+             quantization_config = BitsAndBytesConfig(
+                 load_in_8bit=True,
+                 bnb_8bit_compute_dtype=torch.float16
+             )
+
+         # Load base model
+         base_model = AutoModelForCausalLM.from_pretrained(
              base_id,
              token=HF_TOKEN,
              trust_remote_code=True,
+             quantization_config=quantization_config,
              low_cpu_mem_usage=True,
+             torch_dtype=torch.float32 if DEVICE == "cpu" else torch.float16,
+             device_map=None,  # We'll move manually
+             max_memory={0: MAX_MEMORY} if torch.cuda.is_available() else None,
+         )
+
+         # Move to device
+         if DEVICE == "cpu":
+             base_model = base_model.to("cpu")
+             print("[Load] Model moved to CPU")
+
+         # Load and apply LoRA adapter
+         print("[Load] Loading LoRA adapter...")
          model = PeftModel.from_pretrained(
+             base_model,
              ADAPTER_ID,
              token=HF_TOKEN,
              trust_remote_code=True,
+             is_trainable=False,  # Inference only
          )
+
+         # Merge adapter with base (reduces memory overhead during inference)
+         print("[Load] Merging adapter for efficiency...")
+         model = model.merge_and_unload()
+
+         print("[Load] Model loaded successfully!")
+
+     except Exception as e:
+         print(f"[Error] Failed to load model: {e}")
+         raise gr.Error(
+             f"Failed to load model. This may be due to memory constraints on free tier. "
+             f"Consider using a smaller model or upgrading to GPU tier. Error: {str(e)}"
+         )
+
+     # Load tokenizer
+     print("[Load] Loading tokenizer...")
+     tokenizer = AutoTokenizer.from_pretrained(
          base_id,
          token=HF_TOKEN,
          use_fast=True,
          trust_remote_code=True,
      )
+
+     # Set padding token
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+     tokenizer.padding_side = "left"  # Better for generation
+
+     # Set model to eval mode
      model.eval()
+
+     # Memory cleanup
+     gc.collect()
+
+     return model, tokenizer, base_id
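
Since merge_and_unload() re-runs on every Space restart, one option is to merge once offline and load the merged weights directly at startup. A minimal sketch, not part of this commit; the output directory name is hypothetical, and the base ID is the one the adapter config reports (e.g. google/gemma-3-4b-it):

    # merge_once.py — hypothetical offline helper, run locally once (not in this commit)
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Pass token=... here if the base model is gated on the Hub
    base = AutoModelForCausalLM.from_pretrained("google/gemma-3-4b-it", torch_dtype=torch.float32)
    merged = PeftModel.from_pretrained(base, "Reubencf/gemma3-goan-finetuned").merge_and_unload()
    merged.save_pretrained("gemma3-goan-merged")  # hypothetical output directory
    AutoTokenizer.from_pretrained("google/gemma-3-4b-it").save_pretrained("gemma3-goan-merged")
    # The Space could then load this folder (or a pushed copy) with AutoModelForCausalLM
    # and skip the PeftModel / merge step at startup entirely.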
 
+ # Load model globally (done once at startup)
+ try:
+     model, tokenizer, BASE_ID = load_model_and_tokenizer()
+     MODEL_LOADED = True
+ except Exception as e:
+     print(f"[Fatal] Could not load model: {e}")
+     MODEL_LOADED = False
+     model, tokenizer, BASE_ID = None, None, None
 
+ # ── Generation function ─────────────────────────────────────────────────────────
  def generate_response(
+     message: str,
+     history: List[Tuple[str, str]],
+     temperature: float = 0.7,
+     max_new_tokens: int = 256,
+     top_p: float = 0.95,
+     repetition_penalty: float = 1.1,
+ ) -> str:
+     """Generate response using the fine-tuned model"""
+
+     if not MODEL_LOADED:
+         return "⚠️ Model failed to load. This usually happens due to memory constraints on the free tier. Please try again later or contact the space owner."
+
      try:
+         # Build conversation history
+         conversation = []
+         for user_msg, assistant_msg in history:
+             if user_msg:
+                 conversation.append({"role": "user", "content": user_msg})
+             if assistant_msg:
+                 conversation.append({"role": "assistant", "content": assistant_msg})
+         conversation.append({"role": "user", "content": message})
+
+         # Apply chat template
+         prompt = tokenizer.apply_chat_template(
+             conversation,
+             add_generation_prompt=True,
+             return_tensors="pt"
          )
+
+         # Move to model device
+         prompt = prompt.to(model.device)
+
+         # Generate with memory-efficient settings
          with torch.no_grad():
+             # Use cache for faster generation
+             outputs = model.generate(
+                 input_ids=prompt,
+                 max_new_tokens=min(int(max_new_tokens), 256),  # Cap for free tier
                  temperature=float(temperature),
                  top_p=float(top_p),
                  repetition_penalty=float(repetition_penalty),
                  do_sample=True,
                  pad_token_id=tokenizer.pad_token_id,
                  eos_token_id=tokenizer.eos_token_id,
+                 use_cache=True,  # Enable KV cache
              )
+
+         # Decode only the generated tokens
+         generated_tokens = outputs[0][prompt.shape[-1]:]
+         response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+
+         # Memory cleanup after generation
+         del outputs, prompt, generated_tokens
+         gc.collect()
+
+         return response
+
+     except torch.cuda.OutOfMemoryError:
+         gc.collect()
+         torch.cuda.empty_cache()
+         return "⚠️ Out of memory. Try reducing max_new_tokens or restarting the space."
      except Exception as e:
+         return f"⚠️ Error generating response: {str(e)}"
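
On CPU, the reply only appears once the whole sequence has been generated; a streaming variant of generate_response is one way to surface partial output sooner, since gr.ChatInterface accepts a generator that yields the growing reply. A minimal sketch using transformers' TextIteratorStreamer (the function name is hypothetical and prior-turn handling is trimmed for brevity):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def generate_response_streaming(message, history, temperature=0.7, max_new_tokens=256,
                                    top_p=0.95, repetition_penalty=1.1):
        # Hypothetical streaming counterpart to generate_response above
        conversation = [{"role": "user", "content": message}]  # prior turns omitted for brevity
        prompt = tokenizer.apply_chat_template(
            conversation, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = dict(
            input_ids=prompt,
            max_new_tokens=min(int(max_new_tokens), 256),
            temperature=float(temperature),
            top_p=float(top_p),
            repetition_penalty=float(repetition_penalty),
            do_sample=True,
            streamer=streamer,
        )
        Thread(target=model.generate, kwargs=gen_kwargs).start()  # generation runs in the background
        partial = ""
        for chunk in streamer:  # yields decoded text pieces as they are produced
            partial += chunk
            yield partial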
 
+ # ── Gradio Interface ────────────────────────────────────────────────────────────
  examples = [
      ["What is the capital of Goa?"],
+     ["Tell me about Konkani language"],
+     ["What are the famous beaches in Goa?"],
+     ["Describe Goan fish curry"],
+     ["What is the history of Old Goa?"],
  ]

+ # Create the chat interface
+ if MODEL_LOADED:
+     demo = gr.ChatInterface(
+         fn=generate_response,
+         title=TITLE,
+         description=DESCRIPTION,
+         examples=examples,
+         retry_btn=None,  # Disable retry to save memory
+         undo_btn=None,   # Disable undo to save memory
+         additional_inputs=[
+             gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.7,
+                 step=0.05,
+                 label="Temperature (lower = more focused)"
+             ),
+             gr.Slider(
+                 minimum=32,
+                 maximum=256,
+                 value=128,  # Reduced default for free tier
+                 step=16,
+                 label="Max new tokens"
+             ),
+             gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.95,
+                 step=0.05,
+                 label="Top-p (nucleus sampling)"
+             ),
+             gr.Slider(
+                 minimum=1.0,
+                 maximum=2.0,
+                 value=1.1,
+                 step=0.05,
+                 label="Repetition penalty"
+             ),
+         ],
+         theme=gr.themes.Soft(),
+     )
+ else:
+     # Fallback interface if model fails to load
+     demo = gr.Interface(
+         fn=lambda x: "⚠️ Model failed to load. Please check the logs or try restarting the space.",
+         inputs=gr.Textbox(label="Message"),
+         outputs=gr.Textbox(label="Response"),
+         title=TITLE,
+         description="**Error**: Model could not be loaded. This is likely due to memory constraints on the free tier.",
+     )
+
+ # Queue for handling multiple users
+ demo.queue(
+     concurrency_count=1,  # Process one at a time to save memory
+     max_size=10,  # Reduced queue size for free tier
+ )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
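
One compatibility caveat on the queue block above: concurrency_count is a Gradio 3.x argument and was removed from queue() in Gradio 4, and retry_btn/undo_btn were dropped from ChatInterface in later releases. If the Space pins Gradio 4 or newer, the closing block would look roughly like this sketch (parameter names per Gradio 4; verify against the pinned version):

    # Hypothetical Gradio 4+ variant of the queue/launch block above (not in this commit)
    demo.queue(
        default_concurrency_limit=1,  # replaces concurrency_count
        max_size=10,
    )

    if __name__ == "__main__":
        demo.launch()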