Reubencf committed
Commit 5b16dc7 · verified · 1 parent: 6e8f667

Update app.py

Files changed (1):
  app.py  +8 -13
app.py CHANGED
@@ -7,14 +7,14 @@ import os
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers import TorchAoConfig # not for Zero GPU
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Float8DynamicActivationFloat8WeightConfig # not for Zero GPU
 import spaces # 1. Import the spaces library

 IS_CUDA = torch.cuda.is_available()
 IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
-if IS_ZEROGPU: torch.set_float32_matmul_precision("high")
-IS_QUANT = True
+if IS_ZEROGPU:
+    torch.compiler.set_stance("force_eager")
+    torch.set_float32_matmul_precision("high")
+    torch.backends.cuda.matmul.allow_tf32 = True

 # ── Configuration ────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
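
The new ZeroGPU branch replaces a single matmul-precision tweak with a fuller set of eager-mode settings. A minimal sketch of what that block does at import time (assuming PyTorch 2.6+, where torch.compiler.set_stance is available; the motivation for forcing eager on ZeroGPU is our reading, not stated in the commit):

import os
import torch

# ZeroGPU Spaces expose the SPACES_ZERO_GPU environment variable.
if os.getenv("SPACES_ZERO_GPU"):
    # Force eager execution: torch.compile becomes a no-op, sidestepping
    # Dynamo compilation inside ZeroGPU's short-lived workers (assumption).
    torch.compiler.set_stance("force_eager")
    # Trade a little float32 matmul precision for TF32 throughput on Ampere+ GPUs.
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True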
@@ -31,14 +31,9 @@ print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
 def load_model():
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
-        if IS_QUANT: # not for Zero GPU
-            quant_config = Float8DynamicActivationFloat8WeightConfig() if IS_CUDA else Int8DynamicActivationInt8WeightConfig()
-            quantization_config = TorchAoConfig(quant_type=quant_config)
-            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
-                                                         device_map="auto", quantization_config=quantization_config, token=HF_TOKEN)
-        else:
-            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
-                                                         device_map="auto", token=HF_TOKEN)
+        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                     device_map="auto", token=HF_TOKEN, **kwargs)
         print("[Init] Model loaded successfully.")
         return model, tokenizer
     except Exception as e:
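
This hunk drops the quantized load path entirely: TorchAo quantization is not usable on Zero GPU (as the removed "# not for Zero GPU" comments note), so the model now always loads in bf16/fp32, with SDPA attention (PyTorch's scaled_dot_product_attention kernel) requested on CUDA. For a non-ZeroGPU deployment the removed pattern still applies; a sketch reconstructed from the deleted lines (assuming a transformers build with TorchAoConfig support and a recent torchao):

import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Float8DynamicActivationFloat8WeightConfig

# float8 dynamic activation/weight quantization on CUDA, int8 on CPU
quant_config = (Float8DynamicActivationFloat8WeightConfig() if torch.cuda.is_available()
                else Int8DynamicActivationInt8WeightConfig())
model = AutoModelForCausalLM.from_pretrained(
    "Reubencf/gemma3-konkani",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    quantization_config=TorchAoConfig(quant_type=quant_config),
)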
@@ -88,7 +83,7 @@ def generate_response(message, history=[], system_message="", max_tokens=DEF_TOK
     # Generate the response
     gen_kwargs = dict(
         input_ids=inputs["input_ids"],
-        #attention_mask=inputs["attention_mask"],
+        attention_mask=inputs["attention_mask"],
         max_new_tokens=max_tokens,
         do_sample=True,
         temperature=temperature,
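
Passing the attention mask explicitly (rather than leaving it commented out) stops generate() from inferring the mask from the pad token, which transformers warns about and which can misbehave when the pad and eos tokens coincide. A sketch of how inputs is typically built so both tensors are available, with model and tokenizer from load_model() (the messages value is illustrative; assumes the tokenizer's chat template supports return_dict):

# Tokenize the chat history; return_dict=True yields input_ids and attention_mask.
messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

output_ids = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # explicit mask, no pad-token guessing
    max_new_tokens=512,
)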
 
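The spaces import ("# 1. Import the spaces library") is what ties this app to ZeroGPU: the GPU-bound entry point is decorated so a GPU slice is attached only for the duration of each call. That decorator sits outside this diff; a typical pattern looks like the following (signature defaults are illustrative, not taken from app.py):

import spaces

@spaces.GPU  # attach a ZeroGPU slice for the duration of each call
def generate_response(message, history=[], system_message="", max_tokens=256, temperature=0.7):
    # tokenize, then model.generate(**gen_kwargs) as in the hunk above
    ...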