use proper models
- __pycache__/app.cpython-312.pyc +0 -0
- app.py +100 -26
__pycache__/app.cpython-312.pyc
ADDED
Binary file (28.3 kB).
app.py
CHANGED
@@ -40,6 +40,12 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
+    # Accessible models (no gating required)
+    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct - accessible and reliable"},
+    "Qwen2.5-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "description": "Qwen2.5-7B-Instruct - accessible and reliable"},
+    "microsoft-DialoGPT-medium": {"repo_id": "microsoft/DialoGPT-medium", "description": "Microsoft DialoGPT Medium - accessible conversational model"},
+    "microsoft-DialoGPT-large": {"repo_id": "microsoft/DialoGPT-large", "description": "Microsoft DialoGPT Large - accessible conversational model"},
+
     # … your existing entries …
     "gpt-oss-20b": {"repo_id": "openai/gpt-oss-20b", "description": "openai/gpt-oss-20b"},
     "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
@@ -51,9 +57,9 @@ MODELS = {
         "repo_id":"lianghsun/Gemma-3-Taiwan-270M-it",
         "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset"
     },
-    "gemma-
-        "repo_id":"google/gemma-
-        "description":"Gemma
+    "gemma-2-2b-it":{
+        "repo_id":"google/gemma-2-2b-it",
+        "description":"Gemma 2 2B Instruction-Tuned model - accessible alternative to Gemma 3",
     },
     "SmolLM-135M-Taiwan-Instruct-v1.0": {
         "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
@@ -143,6 +149,33 @@ MODELS = {
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
 
+def check_model_accessibility(repo_id, token=None):
+    """
+    Check if a model is accessible without actually loading it.
+    Returns True if accessible, False if gated, raises exception for other errors.
+    """
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi(token=token)
+        model_info = api.model_info(repo_id)
+
+        # Check if model is gated
+        if hasattr(model_info, 'gated') and model_info.gated:
+            logger.warning(f"⚠️ Model {repo_id} is gated and requires special access")
+            return False
+
+        logger.info(f"✅ Model {repo_id} is accessible")
+        return True
+
+    except Exception as e:
+        error_msg = str(e)
+        if "gated" in error_msg.lower() or "401" in error_msg or "access" in error_msg.lower():
+            logger.warning(f"⚠️ Model {repo_id} appears to be gated or requires access")
+            return False
+        else:
+            logger.error(f"❌ Error checking model accessibility: {e}")
+            raise
+
 def load_pipeline(model_name):
     """
     Load and cache a transformers pipeline for text generation.
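The helper added here relies on `huggingface_hub.HfApi.model_info`, whose result carries a `gated` field (typically `False`/`None` for public repos and a string such as `"auto"` or `"manual"` for gated ones), which is why a plain truthiness test is enough. A minimal standalone sketch of the same probe; the repo ids and the `HF_TOKEN` variable are examples, not part of the commit:

```python
# Standalone sketch of the gated-model probe used by check_model_accessibility().
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ.get("HF_TOKEN"))  # token only needed for private/authorized access

for repo_id in ["Qwen/Qwen2.5-3B-Instruct", "google/gemma-3-270m-it"]:  # example ids
    info = api.model_info(repo_id)
    # `gated` is False/None for public models, "auto" or "manual" for gated ones,
    # so any truthy value means the user has to request access first.
    print(repo_id, "needs access request:", bool(getattr(info, "gated", False)))
```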
@@ -159,6 +192,16 @@ def load_pipeline(model_name):
     repo = MODELS[model_name]["repo_id"]
     logger.info(f"📦 Repository: {repo}")
 
+    # Check model accessibility first
+    try:
+        if not check_model_accessibility(repo, access_token):
+            raise Exception(f"Model {repo} is gated and requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+    except Exception as e:
+        if "gated" in str(e).lower() or "access" in str(e).lower():
+            raise
+        else:
+            logger.warning(f"⚠️ Could not check model accessibility, proceeding with load attempt: {e}")
+
     # Load tokenizer
     logger.info(f"π€ Loading tokenizer for {repo}...")
     try:
@@ -166,8 +209,15 @@ def load_pipeline(model_name):
                                           token=access_token if access_token else None)
         logger.info(f"✅ Tokenizer loaded successfully")
     except Exception as e:
-        logger.error(f"❌ Failed to load tokenizer: {e}")
-        raise
+        error_msg = str(e)
+        if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+            logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+            logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+            logger.error(f"💡 Or try a different model from the list")
+            raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+        else:
+            logger.error(f"❌ Failed to load tokenizer: {e}")
+            raise
 
     # Try different data types for optimal performance
     dtypes_to_try = [
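A side note on the substring checks ("gated repo", "401", "Access to model"): huggingface_hub also exposes a dedicated exception type for this case, which callers can catch instead of parsing messages. A hedged sketch of that alternative follows; it is not what the commit does, and the import location has moved between huggingface_hub releases:

```python
# Alternative sketch: detect gated repos via huggingface_hub's exception type
# rather than matching substrings of the error message.
try:
    from huggingface_hub.errors import GatedRepoError   # newer releases
except ImportError:
    from huggingface_hub.utils import GatedRepoError    # older releases

from huggingface_hub import hf_hub_download

def can_fetch_config(repo_id, token=None):
    """Return False when the repo is gated for this token, True when a download succeeds."""
    try:
        hf_hub_download(repo_id, "config.json", token=token)
        return True
    except GatedRepoError:
        return False
```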
@@ -195,8 +245,14 @@ def load_pipeline(model_name):
             return pipe
 
         except Exception as e:
-            logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
-            continue
+            error_msg = str(e)
+            if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+                logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+                logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+                raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+            else:
+                logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
+                continue
 
     # Final fallback without specific dtype
     logger.warning(f"π Attempting final fallback load without specific dtype...")
@@ -243,13 +299,21 @@ def retrieve_context(query, max_results=6, max_chars=600):
         return []
 
 def format_conversation(history, system_prompt, tokenizer):
+    # Convert Gradio tuple format to message format for tokenizer
+    messages = [{"role": "system", "content": system_prompt.strip()}]
+
+    for user_msg, bot_msg in history:
+        if user_msg:  # Add user message
+            messages.append({"role": "user", "content": user_msg})
+        if bot_msg:  # Add bot message
+            messages.append({"role": "assistant", "content": bot_msg})
+
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
-        messages = [{"role": "system", "content": system_prompt.strip()}] + history
         return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
-        for msg in history:
+        for msg in messages[1:]:  # Skip system message
             if msg['role'] == 'user':
                 prompt += "User: " + msg['content'].strip() + "\n"
             elif msg['role'] == 'assistant':
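The rewritten `format_conversation` converts Gradio's `(user, bot)` tuple history into the role/content list that `apply_chat_template` expects. A small illustration of the conversion; the tokenizer repo is only an example:

```python
# Illustration of the tuple -> messages conversion performed in format_conversation().
from transformers import AutoTokenizer

history = [("Hello!", "Hi, how can I help?"), ("What is Gradio?", None)]

messages = [{"role": "system", "content": "You are a helpful assistant."}]
for user_msg, bot_msg in history:
    if user_msg:
        messages.append({"role": "user", "content": user_msg})
    if bot_msg:
        messages.append({"role": "assistant", "content": bot_msg})

# A chat-template tokenizer then renders the messages into the model's prompt format.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")  # example repo
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```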
@@ -273,9 +337,18 @@ def chat_response(user_msg, chat_history, system_prompt,
     logger.info(f"π Web search enabled: {enable_search}")
     logger.info(f"⚙️ Generation params: max_tokens={max_tokens}, temp={temperature}, top_k={top_k}, top_p={top_p}")
 
+    # Validate inputs
+    if not user_msg or not user_msg.strip():
+        logger.error("❌ Empty user message received")
+        return [], "Error: Empty message received"
+
+    if model_name not in MODELS:
+        logger.error(f"❌ Invalid model name: {model_name}")
+        return [], f"Error: Invalid model '{model_name}'"
+
     cancel_event.clear()
     history = list(chat_history or [])
-    history.append(
+    history.append((user_msg, None))  # Add user message, bot response will be added later
     logger.info(f"π Chat history length: {len(history)} messages")
 
     # Launch web search if enabled
@@ -404,25 +477,21 @@ def chat_response(user_msg, chat_history, system_prompt,
                 logger.info("π Detected thinking block start")
                 in_thought = True
                 # Insert thought placeholder
-                history.append({
-                    'role': 'assistant',
-                    'content': '',
-                    'metadata': {'title': 'π Thought'}
-                })
+                history.append((None, "π Thinking..."))
                 # Capture after opening tag
                 after = text.split('<think>', 1)[1]
                 thought_buf += after
                 # If closing tag in same chunk
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]
+                    history[-1] = (None, f"π {before.strip()}")
                     in_thought = False
                     logger.info("π Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append(
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]
+                    history[-1] = (None, f"π {thought_buf}")
                 yield history, debug
                 continue
 
@@ -431,23 +500,23 @@ def chat_response(user_msg, chat_history, system_prompt,
                 thought_buf += text
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]
+                    history[-1] = (None, f"π {before.strip()}")
                     in_thought = False
                     logger.info("π Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append(
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]
+                    history[-1] = (None, f"π {thought_buf}")
                 yield history, debug
                 continue
 
             # Stream answer
             if not answer_buf:
                 logger.info("π Starting answer generation")
-                history.append(
+                history.append((None, ''))
             answer_buf += text
-            history[-1]
+            history[-1] = (None, answer_buf)
             yield history, debug
 
     gen_thread.join()
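The two hunks above implement a small state machine: text between `<think>` and `</think>` goes into a "thought" entry, and everything after the closing tag becomes the streamed answer. A condensed, framework-free sketch of the same splitting over streamed chunks (the function name is illustrative, not from app.py):

```python
# Condensed sketch of the <think>...</think> splitting done in the streaming loop above.
def split_stream(chunks):
    in_thought, thought, answer = False, "", ""
    for text in chunks:
        if not in_thought and "<think>" in text:
            in_thought = True
            text = text.split("<think>", 1)[1]   # keep only what follows the opening tag
        if in_thought:
            if "</think>" in text:
                before, after = text.split("</think>", 1)
                thought += before
                answer += after
                in_thought = False
            else:
                thought += text
        else:
            answer += text
    return thought.strip(), answer.strip()

# Example: chunks as a token streamer might yield them.
print(split_stream(["<think>plan the ", "reply</think>", "Hello there!"]))
# -> ('plan the reply', 'Hello there!')
```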
@@ -455,7 +524,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         yield history, debug + prompt_debug
     except Exception as e:
         logger.error(f"❌ Error during generation: {e}")
-        history.append(
+        history.append((None, f"Error: {e}"))
         yield history, debug
     finally:
         logger.info("🧹 Cleaning up memory...")
@@ -478,6 +547,7 @@ def update_default_prompt(enable_search):
 with gr.Blocks(title="LLM Inference") as demo:
     gr.Markdown("## 🧠 LLM Inference with Web Search")
     gr.Markdown("Interact with the model. Select parameters and chat below.")
+    gr.Markdown("💡 **Tip**: If you get access errors, try models like 'Qwen2.5-3B-Instruct' or 'microsoft-DialoGPT-medium' which are publicly accessible.")
     with gr.Row():
         with gr.Column(scale=3):
             model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value=list(MODELS.keys())[0])
@@ -496,7 +566,7 @@ with gr.Blocks(title="LLM Inference") as demo:
             clr = gr.Button("Clear Chat")
             cnl = gr.Button("Cancel Generation")
         with gr.Column(scale=7):
-            chat = gr.Chatbot(
+            chat = gr.Chatbot()
             txt = gr.Textbox(placeholder="Type your message and press Enter...")
             dbg = gr.Markdown()
 
@@ -508,4 +578,8 @@ with gr.Blocks(title="LLM Inference") as demo:
                           model_dd, max_tok, temp, k, p, rp, st],
                outputs=[chat, dbg])
 logger.info("π Starting Gradio application...")
-
+try:
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+except Exception as e:
+    logger.error(f"❌ Failed to launch Gradio app: {e}")
+    raise
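The UI tip about gated models suggests an obvious caller-side pattern: fall back to one of the public entries when `load_pipeline` raises the "requires special access" error. A sketch using names defined in app.py (`load_pipeline`, `logger`); the fallback choice itself is an assumption, not something this commit implements:

```python
# Caller-side fallback sketch: retry with a public model when a gated repo is selected.
# Relies on load_pipeline() and logger from app.py; the fallback model is an assumption.
FALLBACK_MODEL = "Qwen2.5-3B-Instruct"

def load_pipeline_with_fallback(model_name):
    try:
        return load_pipeline(model_name)
    except Exception as e:
        if "requires special access" in str(e) and model_name != FALLBACK_MODEL:
            logger.warning(f"Falling back from {model_name} to {FALLBACK_MODEL}: {e}")
            return load_pipeline(FALLBACK_MODEL)
        raise
```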