Spaces:

Reubencf
/

Gemma3-konkani

Running on Zero

App Files Community

Reubencf committed 18 days ago

Commit

5b16dc7

verified ·

1 Parent(s): 6e8f667

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -13

app.py CHANGED Viewed

@@ -7,14 +7,14 @@ import os
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers import TorchAoConfig # not for Zero GPU
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Float8DynamicActivationFloat8WeightConfig # not for Zero GPU
 import spaces # 1. Import the spaces library
 IS_CUDA = torch.cuda.is_available()
 IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
-if IS_ZEROGPU: torch.set_float32_matmul_precision("high")
-IS_QUANT = True
 # ── Configuration ────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
@@ -31,14 +31,9 @@ print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
 def load_model():
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
-        if IS_QUANT: # not for Zero GPU
-            quant_config = Float8DynamicActivationFloat8WeightConfig() if IS_CUDA else Int8DynamicActivationInt8WeightConfig()
-            quantization_config = TorchAoConfig(quant_type=quant_config)
-            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
-                                                         device_map="auto", quantization_config=quantization_config, token=HF_TOKEN)
-        else:
-            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
-                                                         device_map="auto", token=HF_TOKEN)
         print("[Init] Model loaded successfully.")
         return model, tokenizer
     except Exception as e:
@@ -88,7 +83,7 @@ def generate_response(message, history=[], system_message="", max_tokens=DEF_TOK
         # Generate the response
         gen_kwargs = dict(
             input_ids=inputs["input_ids"],
-            #attention_mask=inputs["attention_mask"],
             max_new_tokens=max_tokens,
             do_sample=True,
             temperature=temperature,

 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces # 1. Import the spaces library
 IS_CUDA = torch.cuda.is_available()
 IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
+if IS_ZEROGPU:
+    torch.compiler.set_stance("force_eager")
+    torch.set_float32_matmul_precision("high")
+    torch.backends.cuda.matmul.allow_tf32 = True
 # ── Configuration ────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
 def load_model():
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                     device_map="auto", token=HF_TOKEN, **kwargs)
         print("[Init] Model loaded successfully.")
         return model, tokenizer
     except Exception as e:
         # Generate the response
         gen_kwargs = dict(
             input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
             max_new_tokens=max_tokens,
             do_sample=True,
             temperature=temperature,