yarenty committed
Commit ab99686 · 1 Parent(s): 8b89134

added proper logging
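
For reference, the pattern this commit adds is plain stdlib `logging`, configured once at import time so that INFO-level records go both to the console and to `app.log`. A minimal standalone sketch of that setup (format string and handler targets taken from the diff below; the call at the end is an illustrative example only):

```python
import logging

# Configure logging once for the whole app: timestamped INFO records
# are written to stdout and appended to app.log.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),         # console output
        logging.FileHandler('app.log')   # persistent log file
    ]
)

# Modules then log key steps through a named logger.
logger = logging.getLogger(__name__)
logger.info("Loading model: %s", "some-model")  # hypothetical example call
```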

Files changed (1)
  1. app.py +117 -18
app.py CHANGED
@@ -2,6 +2,7 @@ import os
 import time
 import gc
 import threading
+import logging
 from itertools import islice
 from datetime import datetime
 import re # for parsing <think> blocks
@@ -12,8 +13,20 @@ from transformers import AutoTokenizer
 from ddgs import DDGS
 import spaces # Import spaces early to enable ZeroGPU support
 
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('app.log')
+    ]
+)
+logger = logging.getLogger(__name__)
+
 # Get Hugging Face token - works in both local and HF Spaces environments
 access_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGINGFACE_HUB_TOKEN') or None
+logger.info(f"🔑 Hugging Face token status: {'Available' if access_token else 'Not available (using public models only)'}")
 
 # Optional: Disable GPU visibility if you wish to force CPU usage
 # os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -136,12 +149,35 @@ def load_pipeline(model_name):
     Tries bfloat16, falls back to float16 or float32 if unsupported.
     """
     global PIPELINES
+
+    logger.info(f"🤖 Loading model: {model_name}")
+
     if model_name in PIPELINES:
+        logger.info(f"✅ Model {model_name} already cached, using existing pipeline")
         return PIPELINES[model_name]
+
     repo = MODELS[model_name]["repo_id"]
-    tokenizer = AutoTokenizer.from_pretrained(repo,
-                                              token=access_token if access_token else None)
-    for dtype in (torch.bfloat16, torch.float16, torch.float32):
+    logger.info(f"📦 Repository: {repo}")
+
+    # Load tokenizer
+    logger.info(f"🔤 Loading tokenizer for {repo}...")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(repo,
+                                                  token=access_token if access_token else None)
+        logger.info(f"✅ Tokenizer loaded successfully")
+    except Exception as e:
+        logger.error(f"❌ Failed to load tokenizer: {e}")
+        raise
+
+    # Try different data types for optimal performance
+    dtypes_to_try = [
+        (torch.bfloat16, "bfloat16 (recommended)"),
+        (torch.float16, "float16 (good performance)"),
+        (torch.float32, "float32 (fallback)")
+    ]
+
+    for dtype, dtype_desc in dtypes_to_try:
+        logger.info(f"🔄 Attempting to load model with {dtype_desc}...")
         try:
             pipe = pipeline(
                 task="text-generation",
@@ -152,20 +188,32 @@ def load_pipeline(model_name):
                 device_map="auto",
                 use_cache=False, # ← disable past-key-value caching
                 token=access_token if access_token else None)
+
             PIPELINES[model_name] = pipe
+            logger.info(f"✅ Model {model_name} loaded successfully with {dtype_desc}")
+            logger.info(f"💾 Model cached for future use")
             return pipe
-        except Exception:
+
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
             continue
-    # Final fallback
-    pipe = pipeline(
-        task="text-generation",
-        model=repo,
-        tokenizer=tokenizer,
-        trust_remote_code=True,
-        device_map="auto"
-    )
-    PIPELINES[model_name] = pipe
-    return pipe
+
+    # Final fallback without specific dtype
+    logger.warning(f"🔄 Attempting final fallback load without specific dtype...")
+    try:
+        pipe = pipeline(
+            task="text-generation",
+            model=repo,
+            tokenizer=tokenizer,
+            trust_remote_code=True,
+            device_map="auto"
+        )
+        PIPELINES[model_name] = pipe
+        logger.info(f"✅ Model {model_name} loaded with fallback configuration")
+        return pipe
+    except Exception as e:
+        logger.error(f"❌ Failed to load model {model_name}: {e}")
+        raise
 
 
 def retrieve_context(query, max_results=6, max_chars=600):
@@ -173,11 +221,25 @@ def retrieve_context(query, max_results=6, max_chars=600):
     Retrieve search snippets from DuckDuckGo (runs in background).
     Returns a list of result strings.
     """
+    logger.info(f"🔍 Starting web search for query: '{query[:100]}{'...' if len(query) > 100 else ''}'")
+    logger.info(f"📊 Search parameters: max_results={max_results}, max_chars={max_chars}")
+
     try:
         with DDGS() as ddgs:
-            return [f"{i+1}. {r.get('title','No Title')} - {r.get('body','')[:max_chars]}"
-                    for i, r in enumerate(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))]
-    except Exception:
+            logger.info("🌐 Connected to DuckDuckGo search API")
+            results = []
+            for i, r in enumerate(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results)):
+                title = r.get('title', 'No Title')
+                body = r.get('body', '')[:max_chars]
+                result = f"{i+1}. {title} - {body}"
+                results.append(result)
+                logger.info(f"📄 Found result {i+1}: {title[:50]}{'...' if len(title) > 50 else ''}")
+
+            logger.info(f"✅ Web search completed: {len(results)} results found")
+            return results
+
+    except Exception as e:
+        logger.error(f"❌ Web search failed: {e}")
         return []
 
 def format_conversation(history, system_prompt, tokenizer):
@@ -204,14 +266,23 @@ def chat_response(user_msg, chat_history, system_prompt,
     """
     Generates streaming chat responses, optionally with background web search.
    """
+    logger.info("=" * 60)
+    logger.info("🚀 Starting new chat response generation")
+    logger.info(f"👤 User message: '{user_msg[:100]}{'...' if len(user_msg) > 100 else ''}'")
+    logger.info(f"🤖 Selected model: {model_name}")
+    logger.info(f"🔍 Web search enabled: {enable_search}")
+    logger.info(f"⚙️ Generation params: max_tokens={max_tokens}, temp={temperature}, top_k={top_k}, top_p={top_p}")
+
     cancel_event.clear()
     history = list(chat_history or [])
     history.append({'role': 'user', 'content': user_msg})
+    logger.info(f"📝 Chat history length: {len(history)} messages")
 
     # Launch web search if enabled
     debug = ''
     search_results = []
     if enable_search:
+        logger.info("🔍 Initiating background web search...")
         debug = 'Search task started.'
         thread_search = threading.Thread(
             target=lambda: search_results.extend(
@@ -220,7 +291,9 @@ def chat_response(user_msg, chat_history, system_prompt,
             )
         thread_search.daemon = True
         thread_search.start()
+        logger.info("✅ Web search thread started in background")
     else:
+        logger.info("🚫 Web search disabled by user")
         debug = 'Web search disabled.'
 
     try:
@@ -247,14 +320,17 @@ def chat_response(user_msg, chat_history, system_prompt,
         else:
             enriched = system_prompt
 
-        # wait up to 1s for snippets, then replace debug with them
+        # wait up to search_timeout for snippets, then replace debug with them
         if enable_search:
+            logger.info(f"⏳ Waiting for search results (timeout: {search_timeout}s)...")
             thread_search.join(timeout=float(search_timeout))
             if search_results:
+                logger.info(f"✅ Search completed: {len(search_results)} results found")
                 debug = "### Search results merged into prompt\n\n" + "\n".join(
                     f"- {r}" for r in search_results
                 )
             else:
+                logger.warning("⚠️ No web search results found")
                 debug = "*No web search results found.*"
 
         # merge fetched snippets into the system prompt
@@ -278,12 +354,20 @@ def chat_response(user_msg, chat_history, system_prompt,
         else:
             enriched = system_prompt
 
+        logger.info("🤖 Loading model pipeline...")
         pipe = load_pipeline(model_name)
+
+        logger.info("📝 Formatting conversation prompt...")
         prompt = format_conversation(history, enriched, pipe.tokenizer)
         prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
+        logger.info(f"📊 Prompt length: {len(prompt)} characters")
+
+        logger.info("🎯 Setting up text streaming...")
         streamer = TextIteratorStreamer(pipe.tokenizer,
                                         skip_prompt=True,
                                         skip_special_tokens=True)
+
+        logger.info("🚀 Starting text generation...")
        gen_thread = threading.Thread(
             target=pipe,
             args=(prompt,),
@@ -298,20 +382,26 @@ def chat_response(user_msg, chat_history, system_prompt,
             }
         )
         gen_thread.start()
+        logger.info("✅ Generation thread started")
 
         # Buffers for thought vs answer
         thought_buf = ''
         answer_buf = ''
         in_thought = False
+        token_count = 0
 
+        logger.info("📑 Starting token streaming...")
         # Stream tokens
         for chunk in streamer:
             if cancel_event.is_set():
+                logger.info("🛑 Generation cancelled by user")
                 break
             text = chunk
+            token_count += 1
 
             # Detect start of thinking
             if not in_thought and '<think>' in text:
+                logger.info("💭 Detected thinking block start")
                 in_thought = True
                 # Insert thought placeholder
                 history.append({
@@ -327,6 +417,7 @@ def chat_response(user_msg, chat_history, system_prompt,
                 before, after2 = thought_buf.split('</think>', 1)
                 history[-1]['content'] = before.strip()
                 in_thought = False
+                logger.info("💭 Thinking block completed, starting answer")
                 # Start answer buffer
                 answer_buf = after2
                 history.append({'role': 'assistant', 'content': answer_buf})
@@ -342,6 +433,7 @@ def chat_response(user_msg, chat_history, system_prompt,
                 before, after2 = thought_buf.split('</think>', 1)
                 history[-1]['content'] = before.strip()
                 in_thought = False
+                logger.info("💭 Thinking block completed, starting answer")
                 # Start answer buffer
                 answer_buf = after2
                 history.append({'role': 'assistant', 'content': answer_buf})
@@ -352,21 +444,27 @@ def chat_response(user_msg, chat_history, system_prompt,
 
             # Stream answer
             if not answer_buf:
+                logger.info("📝 Starting answer generation")
                 history.append({'role': 'assistant', 'content': ''})
             answer_buf += text
             history[-1]['content'] = answer_buf
             yield history, debug
 
         gen_thread.join()
+        logger.info(f"✅ Generation completed: {token_count} tokens generated")
         yield history, debug + prompt_debug
     except Exception as e:
+        logger.error(f"❌ Error during generation: {e}")
         history.append({'role': 'assistant', 'content': f"Error: {e}"})
         yield history, debug
     finally:
+        logger.info("🧹 Cleaning up memory...")
         gc.collect()
+        logger.info("=" * 60)
 
 
 def cancel_generation():
+    logger.info("🛑 User requested generation cancellation")
     cancel_event.set()
     return 'Generation cancelled.'
 
@@ -409,4 +507,5 @@ with gr.Blocks(title="LLM Inference") as demo:
                   inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
                           model_dd, max_tok, temp, k, p, rp, st],
                   outputs=[chat, dbg])
+logger.info("🚀 Starting Gradio application...")
 demo.launch()