yarenty committed
Commit eb28058 · 1 Parent(s): ab99686

use proper models

Files changed (2)
  1. __pycache__/app.cpython-312.pyc +0 -0
  2. app.py +100 -26
__pycache__/app.cpython-312.pyc ADDED
Binary file (28.3 kB).
 
app.py CHANGED
@@ -40,6 +40,12 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
+    # Accessible models (no gating required)
+    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct - accessible and reliable"},
+    "Qwen2.5-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "description": "Qwen2.5-7B-Instruct - accessible and reliable"},
+    "microsoft-DialoGPT-medium": {"repo_id": "microsoft/DialoGPT-medium", "description": "Microsoft DialoGPT Medium - accessible conversational model"},
+    "microsoft-DialoGPT-large": {"repo_id": "microsoft/DialoGPT-large", "description": "Microsoft DialoGPT Large - accessible conversational model"},
+
     # … your existing entries …
     "gpt-oss-20b": {"repo_id": "openai/gpt-oss-20b", "description": "openai/gpt-oss-20b"},
     "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
@@ -51,9 +57,9 @@ MODELS = {
         "repo_id":"lianghsun/Gemma-3-Taiwan-270M-it",
         "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset"
     },
-    "gemma-3-270m-it":{
-        "repo_id":"google/gemma-3-270m-it",
-        "description":"Gemma‑3‑270M‑IT is a compact, 270‑million‑parameter language model fine‑tuned for Italian, offering fast and efficient on‑device text generation and comprehension in the Italian language.",
+    "gemma-2-2b-it":{
+        "repo_id":"google/gemma-2-2b-it",
+        "description":"Gemma 2 2B Instruction-Tuned model - accessible alternative to Gemma 3",
     },
     "SmolLM-135M-Taiwan-Instruct-v1.0": {
         "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
@@ -143,6 +149,33 @@ MODELS = {
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
 
+def check_model_accessibility(repo_id, token=None):
+    """
+    Check if a model is accessible without actually loading it.
+    Returns True if accessible, False if gated, raises exception for other errors.
+    """
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi(token=token)
+        model_info = api.model_info(repo_id)
+
+        # Check if model is gated
+        if hasattr(model_info, 'gated') and model_info.gated:
+            logger.warning(f"⚠️ Model {repo_id} is gated and requires special access")
+            return False
+
+        logger.info(f"✅ Model {repo_id} is accessible")
+        return True
+
+    except Exception as e:
+        error_msg = str(e)
+        if "gated" in error_msg.lower() or "401" in error_msg or "access" in error_msg.lower():
+            logger.warning(f"⚠️ Model {repo_id} appears to be gated or requires access")
+            return False
+        else:
+            logger.error(f"❌ Error checking model accessibility: {e}")
+            raise
+
 def load_pipeline(model_name):
     """
     Load and cache a transformers pipeline for text generation.
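
The new check_model_accessibility helper leans on the `gated` field that the Hub returns in ModelInfo. A rough standalone sketch of the same idea (editor's illustration, not part of this commit; the repo id is only an example):

from huggingface_hub import HfApi

def is_gated(repo_id, token=None):
    # ModelInfo.gated is False for public repos and "auto"/"manual" for gated ones
    info = HfApi(token=token).model_info(repo_id)
    return bool(getattr(info, "gated", False))

print(is_gated("Qwen/Qwen2.5-3B-Instruct"))  # expected: False (public repo)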
@@ -159,6 +192,16 @@ def load_pipeline(model_name):
     repo = MODELS[model_name]["repo_id"]
     logger.info(f"📦 Repository: {repo}")
 
+    # Check model accessibility first
+    try:
+        if not check_model_accessibility(repo, access_token):
+            raise Exception(f"Model {repo} is gated and requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+    except Exception as e:
+        if "gated" in str(e).lower() or "access" in str(e).lower():
+            raise
+        else:
+            logger.warning(f"⚠️ Could not check model accessibility, proceeding with load attempt: {e}")
+
     # Load tokenizer
     logger.info(f"🔀 Loading tokenizer for {repo}...")
     try:
@@ -166,8 +209,15 @@ def load_pipeline(model_name):
                                                   token=access_token if access_token else None)
         logger.info(f"✅ Tokenizer loaded successfully")
     except Exception as e:
-        logger.error(f"❌ Failed to load tokenizer: {e}")
-        raise
+        error_msg = str(e)
+        if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+            logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+            logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+            logger.error(f"💡 Or try a different model from the list")
+            raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+        else:
+            logger.error(f"❌ Failed to load tokenizer: {e}")
+            raise
 
     # Try different data types for optimal performance
     dtypes_to_try = [
@@ -195,8 +245,14 @@ def load_pipeline(model_name):
             return pipe
 
         except Exception as e:
-            logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
-            continue
+            error_msg = str(e)
+            if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+                logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+                logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+                raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+            else:
+                logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
+                continue
 
     # Final fallback without specific dtype
     logger.warning(f"🔄 Attempting final fallback load without specific dtype...")
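
Both gated-repo branches above detect gating by string-matching the exception message. An alternative sketch (editor's illustration, not part of this commit) catches huggingface_hub's dedicated exception types by probing the repo's config.json first; probe_repo is a hypothetical helper name:

from huggingface_hub import hf_hub_download
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

def probe_repo(repo_id, token=None):
    # GatedRepoError subclasses RepositoryNotFoundError, so catch it first
    try:
        hf_hub_download(repo_id, "config.json", token=token)
        return True
    except GatedRepoError:
        raise Exception(f"Model {repo_id} requires special access: https://huggingface.co/{repo_id}")
    except RepositoryNotFoundError:
        raise Exception(f"Repository {repo_id} was not found or is private.")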
@@ -243,13 +299,21 @@ def retrieve_context(query, max_results=6, max_chars=600):
     return []
 
 def format_conversation(history, system_prompt, tokenizer):
+    # Convert Gradio tuple format to message format for tokenizer
+    messages = [{"role": "system", "content": system_prompt.strip()}]
+
+    for user_msg, bot_msg in history:
+        if user_msg:  # Add user message
+            messages.append({"role": "user", "content": user_msg})
+        if bot_msg:  # Add bot message
+            messages.append({"role": "assistant", "content": bot_msg})
+
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
-        messages = [{"role": "system", "content": system_prompt.strip()}] + history
         return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
-        for msg in history:
+        for msg in messages[1:]:  # Skip system message
             if msg['role'] == 'user':
                 prompt += "User: " + msg['content'].strip() + "\n"
             elif msg['role'] == 'assistant':
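
The rewritten format_conversation assumes Gradio's tuple-style history, i.e. a list of (user, bot) pairs rather than role dicts. A small worked example of the conversion it performs (values are illustrative only):

history = [
    ("Hi there", "Hello! How can I help?"),
    ("Summarise this page", None),   # bot turn not generated yet
]
messages = [{"role": "system", "content": "You are a helpful assistant."}]
for user_msg, bot_msg in history:
    if user_msg:
        messages.append({"role": "user", "content": user_msg})
    if bot_msg:
        messages.append({"role": "assistant", "content": bot_msg})
# messages now alternates user/assistant turns and can be fed to
# tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)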
@@ -273,9 +337,18 @@ def chat_response(user_msg, chat_history, system_prompt,
     logger.info(f"🔍 Web search enabled: {enable_search}")
     logger.info(f"⚙️ Generation params: max_tokens={max_tokens}, temp={temperature}, top_k={top_k}, top_p={top_p}")
 
+    # Validate inputs
+    if not user_msg or not user_msg.strip():
+        logger.error("❌ Empty user message received")
+        return [], "Error: Empty message received"
+
+    if model_name not in MODELS:
+        logger.error(f"❌ Invalid model name: {model_name}")
+        return [], f"Error: Invalid model '{model_name}'"
+
     cancel_event.clear()
     history = list(chat_history or [])
-    history.append({'role': 'user', 'content': user_msg})
+    history.append((user_msg, None))  # Add user message, bot response will be added later
     logger.info(f"📝 Chat history length: {len(history)} messages")
 
     # Launch web search if enabled
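
chat_response now appends (user, bot) tuples instead of role dicts, which matches the plain gr.Chatbot() configured later in the UI (on Gradio versions where the default Chatbot type is the tuple format; the dict form would require gr.Chatbot(type="messages")). For comparison, illustrative values only:

# Tuple format used by this commit (default gr.Chatbot):
history_tuples = [("Hello", "Hi!"), ("Tell me a joke", None)]

# Messages format used by the previous revision (gr.Chatbot(type="messages")):
history_messages = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi!"},
    {"role": "user", "content": "Tell me a joke"},
]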
@@ -404,25 +477,21 @@ def chat_response(user_msg, chat_history, system_prompt,
                 logger.info("💭 Detected thinking block start")
                 in_thought = True
                 # Insert thought placeholder
-                history.append({
-                    'role': 'assistant',
-                    'content': '',
-                    'metadata': {'title': '💭 Thought'}
-                })
+                history.append((None, "💭 Thinking..."))
                 # Capture after opening tag
                 after = text.split('<think>', 1)[1]
                 thought_buf += after
                 # If closing tag in same chunk
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]['content'] = before.strip()
+                    history[-1] = (None, f"💭 {before.strip()}")
                     in_thought = False
                     logger.info("💭 Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append({'role': 'assistant', 'content': answer_buf})
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]['content'] = thought_buf
+                    history[-1] = (None, f"💭 {thought_buf}")
                 yield history, debug
                 continue
 
@@ -431,23 +500,23 @@ def chat_response(user_msg, chat_history, system_prompt,
                 thought_buf += text
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]['content'] = before.strip()
+                    history[-1] = (None, f"💭 {before.strip()}")
                     in_thought = False
                     logger.info("💭 Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append({'role': 'assistant', 'content': answer_buf})
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]['content'] = thought_buf
+                    history[-1] = (None, f"💭 {thought_buf}")
                 yield history, debug
                 continue
 
             # Stream answer
             if not answer_buf:
                 logger.info("📝 Starting answer generation")
-                history.append({'role': 'assistant', 'content': ''})
+                history.append((None, ''))
             answer_buf += text
-            history[-1]['content'] = answer_buf
+            history[-1] = (None, answer_buf)
             yield history, debug
 
         gen_thread.join()
@@ -455,7 +524,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         yield history, debug + prompt_debug
     except Exception as e:
         logger.error(f"❌ Error during generation: {e}")
-        history.append({'role': 'assistant', 'content': f"Error: {e}"})
+        history.append((None, f"Error: {e}"))
         yield history, debug
     finally:
         logger.info("🧹 Cleaning up memory...")
@@ -478,6 +547,7 @@ def update_default_prompt(enable_search):
 with gr.Blocks(title="LLM Inference") as demo:
     gr.Markdown("## 🧠 LLM Inference with Web Search")
     gr.Markdown("Interact with the model. Select parameters and chat below.")
+    gr.Markdown("💡 **Tip**: If you get access errors, try models like 'Qwen2.5-3B-Instruct' or 'microsoft-DialoGPT-medium' which are publicly accessible.")
     with gr.Row():
         with gr.Column(scale=3):
             model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value=list(MODELS.keys())[0])
@@ -496,7 +566,7 @@ with gr.Blocks(title="LLM Inference") as demo:
             clr = gr.Button("Clear Chat")
             cnl = gr.Button("Cancel Generation")
         with gr.Column(scale=7):
-            chat = gr.Chatbot(type="messages")
+            chat = gr.Chatbot()
             txt = gr.Textbox(placeholder="Type your message and press Enter...")
             dbg = gr.Markdown()
 
@@ -508,4 +578,8 @@ with gr.Blocks(title="LLM Inference") as demo:
                      model_dd, max_tok, temp, k, p, rp, st],
             outputs=[chat, dbg])
 logger.info("🚀 Starting Gradio application...")
-demo.launch()
+try:
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+except Exception as e:
+    logger.error(f"❌ Failed to launch Gradio app: {e}")
+    raise
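
The explicit demo.launch(server_name="0.0.0.0", server_port=7860) matches the usual Hugging Face Spaces convention. If the same script also needs to run outside Spaces, a variation (editor's sketch, not part of this commit) can defer to Gradio's standard environment variables:

import os

try:
    demo.launch(
        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
        share=False,
    )
except Exception as e:
    logger.error(f"❌ Failed to launch Gradio app: {e}")
    raise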