John6666 committed on
Commit f55fb49 Β· verified Β· 1 Parent(s): 4debea8

Upload 3 files

1.5x faster and fixes a small bug.

Files changed (3)
  1. README.md +0 -3
  2. app.py +93 -63
  3. requirements.txt +4 -3
README.md CHANGED
@@ -7,9 +7,6 @@ sdk: gradio
 sdk_version: 5.42.0
 app_file: app.py
 pinned: false
-hf_oauth: true
-hf_oauth_scopes:
-- inference-api
 license: apache-2.0
 short_description: Konkani LLM with Gemma 3
 ---
app.py CHANGED
@@ -6,97 +6,127 @@
 import os
 import torch
 import gradio as gr
-from transformers import pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces  # 1. Import the spaces library
 
-# ── Configuration ──────────────────────────────────────────────────────────────
+IS_CUDA = torch.cuda.is_available()
+IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
+if IS_ZEROGPU: torch.set_float32_matmul_precision("high")
+
+# ── Configuration ────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
-HF_TOKEN = os.getenv("HF_TOKEN")
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 
 TITLE = "Konkani LLM Fine Tuned on Gemma 3"
 DESCRIPTION = (
     "Version 1 of the Konkani LLM.\n"
     "This release may contain inconsistencies, but improvements will follow in future updates."
 )
-# We define the pipeline object globally but initialize it inside the function
-pipe = None
+
+# ── Loading ──────────────────────────────────────────────────────────────────
+print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
+def load_model():
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                     device_map="auto", token=HF_TOKEN)
+        print("[Init] Model loaded successfully.")
+        return model, tokenizer
+    except Exception as e:
+        # If model loading fails, we can't proceed.
+        print(f"[Fatal] Could not load model: {e}")
+        raise Exception(f"❌ Model failed to load: {e}")
+
+model, tokenizer = load_model()
+
+DEF_TOKENS = 256
+DEF_TEMPERATURE = 0.7
+DEF_TOPK = 50
+DEF_TOPP = 0.95
+DEF_DURATION = 59
+
+def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
+    return int(duration if duration is not None else DEF_DURATION)
 
 # ── Generation Function ──────────────────────────────────────────────────────
-@spaces.GPU(duration=120)  # 2. Decorate the function that needs the GPU
-def generate_response(message, history):
+@spaces.GPU(duration=get_duration)  # 2. Decorate the function that needs the GPU
+@torch.inference_mode()
+def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
     """
     This function is called for each user message.
     The @spaces.GPU decorator ensures a GPU is allocated when this runs.
     """
-    global pipe  # Use the global pipe variable
-
-    # 3. Load the model inside the decorated function
-    # This ensures the model is loaded only when a GPU is active.
-    # We check if it's already loaded to avoid reloading on every call.
-    if pipe is None:
-        print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
-        try:
-            pipe = pipeline(
-                "text-generation",
-                model=MODEL_ID,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",  # This will now correctly map to the allocated GPU
-                token=HF_TOKEN,
-            )
-            print("[Init] Model pipeline loaded successfully.")
-        except Exception as e:
-            # If model loading fails, we can't proceed.
-            print(f"[Fatal] Could not load model: {e}")
-            return f"❌ Model failed to load: {e}"
-
-    # Format the conversation history
-    conversation = []
-    for user_msg, assistant_msg in history:
-        conversation.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            conversation.append({"role": "assistant", "content": assistant_msg})
-
-    # Add the current user's message
-    conversation.append({"role": "user", "content": message})
-
-    # Apply the chat template
-    prompt = pipe.tokenizer.apply_chat_template(
-        conversation,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Generate the response
-    outputs = pipe(
-        prompt,
-        max_new_tokens=256,  # It's good practice to set a max token limit
-        do_sample=True,
-        temperature=0.7,
-        top_k=50,
-        top_p=0.95
-    )
-
-    # Extract only the newly generated text
-    response = outputs[0]["generated_text"]
-    new_response = response[len(prompt):].strip()
-
-    return new_response
+    try:
+        # Format the conversation history
+        conversation = []
+        if system_message: conversation.append({"role": "system", "content": system_message})
+        for msg in history:  # https://www.gradio.app/docs/gradio/chatbot
+            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue
+            conversation.append({"role": msg["role"], "content": msg["content"]})
+
+        # Add the current user's message
+        conversation.append({"role": "user", "content": message})
+
+        # Apply the chat template
+        inputs = tokenizer.apply_chat_template(
+            conversation,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            return_dict=True,
+        ).to(model.device)
+
+        # Generate the response
+        gen_kwargs = dict(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            #eos_token_id=tokenizer.eos_token_id,
+            #num_beams=1,
+            output_scores=False,
+            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
+        )
+        outputs = model.generate(**gen_kwargs)
+
+        # Extract only the newly generated text
+        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
+        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+        return new_response
+    except Exception as e:
+        print(f"Error: {e}")
+        gr.Warning(f"Error: {e}")
+        return ""
 
 # ── UI ────────────────────────────────────────────────────────────────────────
 examples = [
-    "Translate From English to Devnagri Konkani: what is color?",
-    "ΰ€˜ΰ€°ΰ€Ύΰ€‚ΰ€€ ΰ€΅ΰ€Ώΰ€œΰ₯‡ΰ€šΰ₯‹ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€‰ΰ€£ΰ₯‹ ΰ€•ΰ€°ΰ€ͺΰ€Ύΰ€šΰ₯€ ΰ€―ΰ₯‡ΰ€΅ΰ€œΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€•ΰ€°ΰ€ͺ.",
+    ["Translate From English to Devnagri Konkani: what is color?"],
+    ["ΰ€˜ΰ€°ΰ€Ύΰ€‚ΰ€€ ΰ€΅ΰ€Ώΰ€œΰ₯‡ΰ€šΰ₯‹ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€‰ΰ€£ΰ₯‹ ΰ€•ΰ€°ΰ€ͺΰ€Ύΰ€šΰ₯€ ΰ€―ΰ₯‡ΰ€΅ΰ€œΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€•ΰ€°ΰ€ͺ."],
 ]
 
 demo = gr.ChatInterface(
     fn=generate_response,
+    type="messages",
     title=TITLE,
     description=DESCRIPTION,
     examples=examples,
+    cache_examples=True,
     theme="soft",
+    additional_inputs=[
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
+        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration"),
+    ],
 )
 
 # ── Launch ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     print("πŸš€ Starting Gradio app for ZeroGPU...")
-    demo.launch()
+    demo.queue().launch()
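The "1.5x faster" claim maps to the changes above: the lazily initialized `pipeline(...)` is replaced by a module-level `AutoModelForCausalLM`/`AutoTokenizer` load, generation runs under `torch.inference_mode()`, and the KV cache is pre-allocated via `cache_implementation="static"`. Below is a minimal local sketch of that new generation path outside Gradio/ZeroGPU; the `smoke_test` helper name and the hard-coded sampling values are illustrative, not part of the commit, and it assumes you can download the checkpoint (add `token=...` if it is gated).

# smoke_test.py -- hypothetical local check, not part of the Space
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Reubencf/gemma3-konkani"  # same checkpoint the Space loads

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

@torch.inference_mode()  # skip autograd bookkeeping while decoding
def smoke_test(prompt: str, max_new_tokens: int = 64) -> str:
    conversation = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        cache_implementation="static",  # pre-allocated KV cache, as in the updated app.py
    )
    new_ids = outputs[0][inputs["input_ids"].shape[-1]:]  # keep only the generated tokens
    return tokenizer.decode(new_ids, skip_special_tokens=True)

if __name__ == "__main__":
    print(smoke_test("Translate From English to Devnagri Konkani: what is color?"))

On a machine without CUDA this falls back to float32, mirroring the `IS_CUDA` check in app.py; the ZeroGPU-specific pieces (`spaces.GPU`, `get_duration`) only matter inside the Space.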
requirements.txt CHANGED
@@ -1,6 +1,7 @@
+torch>=2.2
 transformers>=4.41
 peft>=0.11.0
 accelerate>=0.31.0
-gradio>=4.0,<5.0
-torch>=2.2 ; sys_platform != "darwin"
-bitsandbytes>=0.43.1 ; platform_system == "Linux"
+bitsandbytes>=0.43.1
+gradio>=4.0
+pydantic==2.10.6