added proper logging
app.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import time
 import gc
 import threading
+import logging
 from itertools import islice
 from datetime import datetime
 import re  # for parsing <think> blocks
@@ -12,8 +13,20 @@ from transformers import AutoTokenizer
 from ddgs import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support
 
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('app.log')
+    ]
+)
+logger = logging.getLogger(__name__)
+
 # Get Hugging Face token - works in both local and HF Spaces environments
 access_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGINGFACE_HUB_TOKEN') or None
+logger.info(f"Hugging Face token status: {'Available' if access_token else 'Not available (using public models only)'}")
 
 # Optional: Disable GPU visibility if you wish to force CPU usage
 # os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -136,12 +149,35 @@ def load_pipeline(model_name):
     Tries bfloat16, falls back to float16 or float32 if unsupported.
     """
     global PIPELINES
+
+    logger.info(f"Loading model: {model_name}")
+
     if model_name in PIPELINES:
+        logger.info(f"Model {model_name} already cached, using existing pipeline")
         return PIPELINES[model_name]
+
     repo = MODELS[model_name]["repo_id"]
-
-
-
+    logger.info(f"Repository: {repo}")
+
+    # Load tokenizer
+    logger.info(f"Loading tokenizer for {repo}...")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(repo,
+                                                  token=access_token if access_token else None)
+        logger.info("Tokenizer loaded successfully")
+    except Exception as e:
+        logger.error(f"Failed to load tokenizer: {e}")
+        raise
+
+    # Try different data types for optimal performance
+    dtypes_to_try = [
+        (torch.bfloat16, "bfloat16 (recommended)"),
+        (torch.float16, "float16 (good performance)"),
+        (torch.float32, "float32 (fallback)")
+    ]
+
+    for dtype, dtype_desc in dtypes_to_try:
+        logger.info(f"Attempting to load model with {dtype_desc}...")
     try:
         pipe = pipeline(
             task="text-generation",
@@ -152,20 +188,32 @@ def load_pipeline(model_name):
             device_map="auto",
             use_cache=False,  # disable past-key-value caching
             token=access_token if access_token else None)
+
         PIPELINES[model_name] = pipe
+            logger.info(f"Model {model_name} loaded successfully with {dtype_desc}")
+            logger.info("Model cached for future use")
         return pipe
-
+
+        except Exception as e:
+            logger.warning(f"Failed to load with {dtype_desc}: {e}")
             continue
-
-
-
-
-
-
-
-
-
+
+    # Final fallback without specific dtype
+    logger.warning("Attempting final fallback load without specific dtype...")
+    try:
+        pipe = pipeline(
+            task="text-generation",
+            model=repo,
+            tokenizer=tokenizer,
+            trust_remote_code=True,
+            device_map="auto"
+        )
+        PIPELINES[model_name] = pipe
+        logger.info(f"Model {model_name} loaded with fallback configuration")
+        return pipe
+    except Exception as e:
+        logger.error(f"Failed to load model {model_name}: {e}")
+        raise
 
 
 def retrieve_context(query, max_results=6, max_chars=600):
@@ -173,11 +221,25 @@ def retrieve_context(query, max_results=6, max_chars=600):
     Retrieve search snippets from DuckDuckGo (runs in background).
     Returns a list of result strings.
     """
+    logger.info(f"Starting web search for query: '{query[:100]}{'...' if len(query) > 100 else ''}'")
+    logger.info(f"Search parameters: max_results={max_results}, max_chars={max_chars}")
+
     try:
         with DDGS() as ddgs:
-
-
-
+            logger.info("Connected to DuckDuckGo search API")
+            results = []
+            for i, r in enumerate(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results)):
+                title = r.get('title', 'No Title')
+                body = r.get('body', '')[:max_chars]
+                result = f"{i+1}. {title} - {body}"
+                results.append(result)
+                logger.info(f"Found result {i+1}: {title[:50]}{'...' if len(title) > 50 else ''}")
+
+            logger.info(f"Web search completed: {len(results)} results found")
+            return results
+
+    except Exception as e:
+        logger.error(f"Web search failed: {e}")
         return []
 
 def format_conversation(history, system_prompt, tokenizer):
@@ -204,14 +266,23 @@ def chat_response(user_msg, chat_history, system_prompt,
     """
    Generates streaming chat responses, optionally with background web search.
     """
+    logger.info("=" * 60)
+    logger.info("Starting new chat response generation")
+    logger.info(f"User message: '{user_msg[:100]}{'...' if len(user_msg) > 100 else ''}'")
+    logger.info(f"Selected model: {model_name}")
+    logger.info(f"Web search enabled: {enable_search}")
+    logger.info(f"Generation params: max_tokens={max_tokens}, temp={temperature}, top_k={top_k}, top_p={top_p}")
+
     cancel_event.clear()
     history = list(chat_history or [])
     history.append({'role': 'user', 'content': user_msg})
+    logger.info(f"Chat history length: {len(history)} messages")
 
     # Launch web search if enabled
     debug = ''
     search_results = []
     if enable_search:
+        logger.info("Initiating background web search...")
         debug = 'Search task started.'
         thread_search = threading.Thread(
             target=lambda: search_results.extend(
@@ -220,7 +291,9 @@ def chat_response(user_msg, chat_history, system_prompt,
         )
         thread_search.daemon = True
         thread_search.start()
+        logger.info("Web search thread started in background")
     else:
+        logger.info("Web search disabled by user")
         debug = 'Web search disabled.'
 
     try:
@@ -247,14 +320,17 @@ def chat_response(user_msg, chat_history, system_prompt,
         else:
             enriched = system_prompt
 
-        # wait up to
+        # wait up to search_timeout for snippets, then replace debug with them
        if enable_search:
+            logger.info(f"Waiting for search results (timeout: {search_timeout}s)...")
            thread_search.join(timeout=float(search_timeout))
            if search_results:
+                logger.info(f"Search completed: {len(search_results)} results found")
                debug = "### Search results merged into prompt\n\n" + "\n".join(
                    f"- {r}" for r in search_results
                )
            else:
+                logger.warning("No web search results found")
                debug = "*No web search results found.*"
 
        # merge fetched snippets into the system prompt
@@ -278,12 +354,20 @@ def chat_response(user_msg, chat_history, system_prompt,
         else:
             enriched = system_prompt
 
+        logger.info("Loading model pipeline...")
         pipe = load_pipeline(model_name)
+
+        logger.info("Formatting conversation prompt...")
         prompt = format_conversation(history, enriched, pipe.tokenizer)
         prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
+        logger.info(f"Prompt length: {len(prompt)} characters")
+
+        logger.info("Setting up text streaming...")
         streamer = TextIteratorStreamer(pipe.tokenizer,
                                         skip_prompt=True,
                                         skip_special_tokens=True)
+
+        logger.info("Starting text generation...")
         gen_thread = threading.Thread(
             target=pipe,
             args=(prompt,),
@@ -298,20 +382,26 @@ def chat_response(user_msg, chat_history, system_prompt,
             }
         )
         gen_thread.start()
+        logger.info("Generation thread started")
 
         # Buffers for thought vs answer
         thought_buf = ''
         answer_buf = ''
         in_thought = False
+        token_count = 0
 
+        logger.info("Starting token streaming...")
         # Stream tokens
         for chunk in streamer:
            if cancel_event.is_set():
+                logger.info("Generation cancelled by user")
                break
            text = chunk
+            token_count += 1
 
            # Detect start of thinking
            if not in_thought and '<think>' in text:
+                logger.info("Detected thinking block start")
                in_thought = True
                # Insert thought placeholder
                history.append({
@@ -327,6 +417,7 @@ def chat_response(user_msg, chat_history, system_prompt,
                    before, after2 = thought_buf.split('</think>', 1)
                    history[-1]['content'] = before.strip()
                    in_thought = False
+                    logger.info("Thinking block completed, starting answer")
                    # Start answer buffer
                    answer_buf = after2
                    history.append({'role': 'assistant', 'content': answer_buf})
@@ -342,6 +433,7 @@ def chat_response(user_msg, chat_history, system_prompt,
                    before, after2 = thought_buf.split('</think>', 1)
                    history[-1]['content'] = before.strip()
                    in_thought = False
+                    logger.info("Thinking block completed, starting answer")
                    # Start answer buffer
                    answer_buf = after2
                    history.append({'role': 'assistant', 'content': answer_buf})
@@ -352,21 +444,27 @@ def chat_response(user_msg, chat_history, system_prompt,
 
            # Stream answer
            if not answer_buf:
+                logger.info("Starting answer generation")
                history.append({'role': 'assistant', 'content': ''})
            answer_buf += text
            history[-1]['content'] = answer_buf
            yield history, debug
 
        gen_thread.join()
+        logger.info(f"Generation completed: {token_count} tokens generated")
        yield history, debug + prompt_debug
    except Exception as e:
+        logger.error(f"Error during generation: {e}")
        history.append({'role': 'assistant', 'content': f"Error: {e}"})
        yield history, debug
    finally:
+        logger.info("Cleaning up memory...")
        gc.collect()
+        logger.info("=" * 60)
 
 
 def cancel_generation():
+    logger.info("User requested generation cancellation")
    cancel_event.set()
    return 'Generation cancelled.'
 
@@ -409,4 +507,5 @@ with gr.Blocks(title="LLM Inference") as demo:
     inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
             model_dd, max_tok, temp, k, p, rp, st],
     outputs=[chat, dbg])
+logger.info("Starting Gradio application...")
demo.launch()
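
The logging configuration added at the top of app.py is self-contained and can be tried outside the Space. A minimal standalone sketch of the same pattern (the 'app.log' path matches the commit; whether that file persists across Space restarts is not guaranteed):

import logging

# Mirror every record to stdout (visible in the Space's container logs)
# and to a local app.log file, as in the commit.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('app.log')
    ]
)
logger = logging.getLogger(__name__)
logger.info("logging configured")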
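The new load_pipeline body follows a dtype-fallback pattern: bfloat16 first, then float16, then float32, then one last attempt with no explicit dtype. A minimal sketch of that pattern, assuming the pipeline(...) call in the unshown context passes the loop's dtype as torch_dtype (load_with_dtype_fallback and repo_id are illustrative names, not from the commit):

import torch
from transformers import pipeline

def load_with_dtype_fallback(repo_id):
    """Try progressively wider dtypes; re-raise the last error if all fail."""
    last_err = None
    for dtype in (torch.bfloat16, torch.float16, torch.float32, None):
        try:
            return pipeline(
                task="text-generation",
                model=repo_id,
                torch_dtype=dtype,  # None lets transformers choose a default
                device_map="auto",
            )
        except Exception as e:  # unsupported dtype, OOM, etc.: try the next
            last_err = e
    raise last_err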
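retrieve_context can likewise be exercised on its own. A trimmed sketch using the same DDGS.text() arguments as the commit (quick_search is an illustrative name):

from itertools import islice
from ddgs import DDGS

def quick_search(query, max_results=3, max_chars=200):
    # Same call shape as retrieve_context: region/safesearch/timelimit go
    # to DDGS.text() and islice caps the number of results consumed.
    with DDGS() as ddgs:
        hits = islice(ddgs.text(query, region="wt-wt",
                                safesearch="off", timelimit="y"),
                      max_results)
        return [f"{i+1}. {r.get('title', 'No Title')} - {r.get('body', '')[:max_chars]}"
                for i, r in enumerate(hits)]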
logger.info("π Detected thinking block start")
|
405 |
in_thought = True
|
406 |
# Insert thought placeholder
|
407 |
history.append({
|
|
|
417 |
before, after2 = thought_buf.split('</think>', 1)
|
418 |
history[-1]['content'] = before.strip()
|
419 |
in_thought = False
|
420 |
+
logger.info("π Thinking block completed, starting answer")
|
421 |
# Start answer buffer
|
422 |
answer_buf = after2
|
423 |
history.append({'role': 'assistant', 'content': answer_buf})
|
|
|
433 |
before, after2 = thought_buf.split('</think>', 1)
|
434 |
history[-1]['content'] = before.strip()
|
435 |
in_thought = False
|
436 |
+
logger.info("π Thinking block completed, starting answer")
|
437 |
# Start answer buffer
|
438 |
answer_buf = after2
|
439 |
history.append({'role': 'assistant', 'content': answer_buf})
|
|
|
444 |
|
445 |
# Stream answer
|
446 |
if not answer_buf:
|
447 |
+
logger.info("π Starting answer generation")
|
448 |
history.append({'role': 'assistant', 'content': ''})
|
449 |
answer_buf += text
|
450 |
history[-1]['content'] = answer_buf
|
451 |
yield history, debug
|
452 |
|
453 |
gen_thread.join()
|
454 |
+
logger.info(f"β
Generation completed: {token_count} tokens generated")
|
455 |
yield history, debug + prompt_debug
|
456 |
except Exception as e:
|
457 |
+
logger.error(f"β Error during generation: {e}")
|
458 |
history.append({'role': 'assistant', 'content': f"Error: {e}"})
|
459 |
yield history, debug
|
460 |
finally:
|
461 |
+
logger.info("π§Ή Cleaning up memory...")
|
462 |
gc.collect()
|
463 |
+
logger.info("=" * 60)
|
464 |
|
465 |
|
466 |
def cancel_generation():
|
467 |
+
logger.info("π User requested generation cancellation")
|
468 |
cancel_event.set()
|
469 |
return 'Generation cancelled.'
|
470 |
|
|
|
507 |
inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
|
508 |
model_dd, max_tok, temp, k, p, rp, st],
|
509 |
outputs=[chat, dbg])
|
510 |
+
logger.info("π Starting Gradio application...")
|
511 |
demo.launch()
|
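Finally, the <think> handling in the streaming loop is a small state machine: buffer until </think> arrives, publish the thought, then treat the remainder as the answer. A compact sketch of the same logic over any iterable of text chunks (split_thought is an illustrative name; the commit interleaves this logic with history updates and logging):

def split_thought(chunks):
    """Yield ('thought' | 'answer', text) pairs from a <think>...</think> stream."""
    in_thought = False
    buf = ''
    for text in chunks:
        if not in_thought and '<think>' in text:
            in_thought = True
            text = text.split('<think>', 1)[1]
        if in_thought:
            buf += text
            if '</think>' in buf:
                thought, rest = buf.split('</think>', 1)
                yield 'thought', thought.strip()
                in_thought = False
                buf = ''
                if rest:
                    yield 'answer', rest
        else:
            yield 'answer', text

# list(split_thought(['<think>plan', ' steps</think>Hi', ' there']))
# -> [('thought', 'plan steps'), ('answer', 'Hi'), ('answer', ' there')]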