Upload 3 files

1.5x faster and fixing a small bug.
- README.md +0 -3
- app.py +93 -63
- requirements.txt +4 -3
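
The main structural change in app.py (full diff below) is that the ZeroGPU allocation window is now computed per request by a `duration` callable wired to a new "Duration" slider in the UI. A minimal sketch of that pattern, with names taken from the diff (whether this alone accounts for the "1.5x faster" in the commit message is not stated):

```python
import spaces

DEF_DURATION = 59  # default GPU window in seconds, as in the diff below

def get_duration(message, history=[], system_message="", duration=DEF_DURATION):
    # ZeroGPU calls this with the same arguments as the decorated function
    # and reserves the GPU for the returned number of seconds.
    return int(duration if duration is not None else DEF_DURATION)

@spaces.GPU(duration=get_duration)  # dynamic duration: a callable instead of a constant
def generate_response(message, history=[], system_message="", duration=DEF_DURATION):
    ...  # model.generate(...) runs inside this GPU allocation
```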
README.md CHANGED

```diff
@@ -7,9 +7,6 @@ sdk: gradio
 sdk_version: 5.42.0
 app_file: app.py
 pinned: false
-hf_oauth: true
-hf_oauth_scopes:
-- inference-api
 license: apache-2.0
 short_description: Konkani LLM with Gemma 3
 ---
```
app.py CHANGED

```diff
@@ -6,97 +6,127 @@
 import os
 import torch
 import gradio as gr
-from transformers import
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces  # 1. Import the spaces library
 
-
+IS_CUDA = torch.cuda.is_available()
+IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
+if IS_ZEROGPU: torch.set_float32_matmul_precision("high")
+
+# ── Configuration ─────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
-HF_TOKEN = os.getenv("HF_TOKEN")
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 
 TITLE = "Konkani LLM Fine Tuned on Gemma 3"
 DESCRIPTION = (
     "Version 1 of the Konkani LLM.\n"
     "This release may contain inconsistencies, but improvements will follow in future updates."
 )
-
-
+
+# ── Loading ───────────────────────────────────────────────────────────────────
+print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
+def load_model():
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                     device_map="auto", token=HF_TOKEN)
+        print("[Init] Model loaded successfully.")
+        return model, tokenizer
+    except Exception as e:
+        # If model loading fails, we can't proceed.
+        print(f"[Fatal] Could not load model: {e}")
+        raise Exception(f"❌ Model failed to load: {e}")
+
+model, tokenizer = load_model()
+
+DEF_TOKENS = 256
+DEF_TEMPERATURE = 0.7
+DEF_TOPK = 50
+DEF_TOPP = 0.95
+DEF_DURATION = 59
+
+def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
+    return int(duration if duration is not None else DEF_DURATION)
 
 # ── Generation Function ──────────────────────────────────────────────────────
-@spaces.GPU(duration=
-
+@spaces.GPU(duration=get_duration)  # 2. Decorate the function that needs the GPU
+@torch.inference_mode()
+def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
     """
     This function is called for each user message.
     The @spaces.GPU decorator ensures a GPU is allocated when this runs.
     """
-    …
-        top_p=0.95
-    )
-
-    # Extract only the newly generated text
-    response = outputs[0]["generated_text"]
-    new_response = response[len(prompt):].strip()
-
-    return new_response
+    try:
+        # Format the conversation history
+        conversation = []
+        if system_message: conversation.append({"role": "system", "content": system_message})
+        for msg in history:  # https://www.gradio.app/docs/gradio/chatbot
+            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue
+            conversation.append({"role": msg["role"], "content": msg["content"]})
+
+        # Add the current user's message
+        conversation.append({"role": "user", "content": message})
+
+        # Apply the chat template
+        inputs = tokenizer.apply_chat_template(
+            conversation,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            return_dict=True,
+        ).to(model.device)
+
+        # Generate the response
+        gen_kwargs = dict(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            #eos_token_id=tokenizer.eos_token_id,
+            #num_beams=1,
+            output_scores=False,
+            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
+        )
+        outputs = model.generate(**gen_kwargs)
+
+        # Extract only the newly generated text
+        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
+        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+        return new_response
+    except Exception as e:
+        print(f"Error: {e}")
+        gr.Warning(f"Error: {e}")
+        return ""
 
 # ── UI ────────────────────────────────────────────────────────────────────────
 examples = [
-    "Translate From English to Devnagri Konkani: what is color?",
-    "घरांत विजेचो वापर उणो करपाची येवजण तयार करप.",
+    ["Translate From English to Devnagri Konkani: what is color?"],
+    ["घरांत विजेचो वापर उणो करपाची येवजण तयार करप."],
 ]
 
 demo = gr.ChatInterface(
     fn=generate_response,
+    type="messages",
     title=TITLE,
     description=DESCRIPTION,
     examples=examples,
+    cache_examples=True,
     theme="soft",
+    additional_inputs=[
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
+        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration"),
+    ],
 )
 
 # ── Launch ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     print("🚀 Starting Gradio app for ZeroGPU...")
-    demo.launch()
+    demo.queue().launch()
```
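
For a quick local check of the new `generate_response` signature, it can be called directly with a messages-format history (matching `type="messages"` above). This is only a sketch: it assumes app.py is importable from the working directory, that the model fits on the local machine, and that the `@spaces.GPU` decorator degrades to a no-op outside a Space.

```python
# Hypothetical local smoke test; importing app also loads the model via load_model().
from app import generate_response, DEF_TOKENS, DEF_TEMPERATURE, DEF_TOPK, DEF_TOPP, DEF_DURATION

# History in Gradio's "messages" format: a list of {"role", "content"} dicts.
history = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hello! How can I help you today?"},
]

reply = generate_response(
    message="Translate From English to Devnagri Konkani: what is color?",
    history=history,
    system_message="",
    max_tokens=DEF_TOKENS,
    temperature=DEF_TEMPERATURE,
    top_k=DEF_TOPK,
    top_p=DEF_TOPP,
    duration=DEF_DURATION,
)
print(reply)
```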
requirements.txt CHANGED

```diff
@@ -1,6 +1,7 @@
+torch>=2.2
 transformers>=4.41
 peft>=0.11.0
 accelerate>=0.31.0
-
-
-
+bitsandbytes>=0.43.1
+gradio>=4.0
+pydantic==2.10.6
```
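
As a rough post-install sanity check (a sketch, not part of the repo), the pinned packages above can be listed to confirm the environment resolved as expected:

```python
# Print installed versions of the packages pinned in requirements.txt.
import importlib.metadata as md

for pkg in ["torch", "transformers", "peft", "accelerate", "bitsandbytes", "gradio", "pydantic"]:
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")
```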