record_id, timestamp & duration fix; user agent platform extract
- app.py +41 -17
- auditqa/utils.py +47 -6
- requirements.txt +5 -1
app.py
CHANGED
@@ -12,7 +12,7 @@ from auditqa.reports import files, report_list, new_files, new_report_list
 from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant, load_new_chunks
 from auditqa.retriever import get_context
 from auditqa.reader import nvidia_client, dedicated_endpoint
-from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template, get_client_location, get_client_ip
+from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template, get_client_location, get_client_ip, get_platform_info
 from dotenv import load_dotenv
 load_dotenv()
 from threading import Lock
@@ -22,7 +22,7 @@ import json
 
 # # fetch tokens and model config params
 SPACES_LOG = os.environ["SPACES_LOG"]
-SPACES_LOG = os.getenv('SPACES_LOG')
+# SPACES_LOG = os.getenv('SPACES_LOG')
 
 model_config = getconfig("model_params.cfg")
 
@@ -38,8 +38,7 @@ scheduler = CommitScheduler(
     repo_type="dataset",
     folder_path=JSON_DATASET_DIR,
     path_in_repo="audit_chatbot",
-    token=SPACES_LOG
-    every=2) # TESTING: every 2 seconds
+    token=SPACES_LOG)
 
 #####--------------- VECTOR STORE -------------------------------------------------
 # reports contain the already created chunks from Markdown version of pdf reports
@@ -105,24 +104,32 @@ def submit_feedback(feedback, logs_data):
         if logs_data is None:
             return gr.update(visible=False), gr.update(visible=True)
 
+        session_id = logs_data.get("session_id")
+        if session_id:
+            # Update session last_activity to now
+            session_manager.update_session(session_id)
+            # Compute duration from the session manager and update the log
+            logs_data["session_duration_seconds"] = session_manager.get_session_duration(session_id)
+
+        # Now save the (feedback) log record
         save_logs(scheduler, JSON_DATASET_PATH, logs_data, feedback)
         return gr.update(visible=False), gr.update(visible=True)
     except Exception as e:
-        # Still need to return the expected outputs even on error
         return gr.update(visible=False), gr.update(visible=True)
 
-# Session Manager added (track session duration
+# Session Manager added (track session duration, location, and platform)
 class SessionManager:
     def __init__(self):
         self.sessions = {}
 
-    def create_session(self, client_ip):
+    def create_session(self, client_ip, user_agent):
         session_id = str(uuid4())
         self.sessions[session_id] = {
             'start_time': datetime.now(),
             'last_activity': datetime.now(),
             'client_ip': client_ip,
-            'location_info': get_client_location(client_ip)
+            'location_info': get_client_location(client_ip),
+            'platform_info': get_platform_info(user_agent)
         }
         return session_id
 
@@ -143,11 +150,12 @@ class SessionManager:
 # Initialize session manager
 session_manager = SessionManager()
 
-async def chat(query, history, sources, reports, subtype, year, client_ip=None, session_id=None):
+async def chat(query, history, sources, reports, subtype, year, client_ip=None, session_id=None, request: gr.Request = None):
     """Update chat function to handle session data"""
 
-    if not session_id:
-        session_id = session_manager.create_session(client_ip)
+    if not session_id:
+        user_agent = request.headers.get('User-Agent', '') if request else ''
+        session_id = session_manager.create_session(client_ip, user_agent)
     else:
         session_manager.update_session(session_id)
 
@@ -209,12 +217,13 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
 
     ##-----------------------get answer from endpoints------------------------------
     answer_yet = ""
-    # Logs structure updated for
-    timestamp = str(datetime.now().timestamp())
+    # Logs structure updated for session data (feedback and timestamp added separately via save_logs)
     logs_data = {
+        "record_id": str(uuid4()),  # Add unique record ID
        "session_id": session_id,
        "session_duration_seconds": session_duration,
        "client_location": session_data['location_info'],
+        "platform": session_data['platform_info'],
        # "system_prompt": SYSTEM_PROMPT, #REMOVED FOR TESTING
        # "sources": sources, #REMOVED FOR TESTING
        # "reports": reports, #REMOVED FOR TESTING
@@ -225,8 +234,6 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
        "endpoint_type": model_config.get('reader','TYPE'),
        "reader": model_config.get('reader','NVIDIA_MODEL'),
        # "docs": [doc.page_content for doc in context_retrieved], #REMOVED FOR TESTING
-        "answer": "",
-        "time": timestamp,
     }
 
     if model_config.get('reader','TYPE') == 'NVIDIA':
@@ -280,7 +287,7 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
                    answer_yet += word + " "
                    parsed_answer = parse_output_llm_with_sources(answer_yet)
                    history[-1] = (query, parsed_answer)
-                    # Update logs_data with current answer
+                    # Update logs_data with current answer (a fresh timestamp is added in save_logs)
                    logs_data["answer"] = parsed_answer
                    yield [tuple(x) for x in history], docs_html, logs_data, session_id
                    await asyncio.sleep(0.05)
@@ -291,6 +298,23 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
        async for update in process_stream():
            yield update
 
+        # chat_model = dedicated_endpoint()
+        # async def process_stream():
+        #     # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
+        #     # instead of modifying the one from the outer scope.
+        #     nonlocal answer_yet  # Use the outer scope's answer_yet variable
+        #     # Iterate over the streaming response chunks
+        #     async for chunk in chat_model.astream(messages):
+        #         token = chunk.content
+        #         answer_yet += token
+        #         parsed_answer = parse_output_llm_with_sources(answer_yet)
+        #         history[-1] = (query, parsed_answer)
+        #         yield [tuple(x) for x in history], docs_html
+
+        # # Stream the response updates
+        # async for update in process_stream():
+        #     yield update
+
    try:
        # Save log after streaming is complete
        save_logs(scheduler, JSON_DATASET_PATH, logs_data)
@@ -337,7 +361,7 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
        avatar_images = (None,"data-collection.png"),
    )
 
-    #
+    # feedback UI
    with gr.Column(elem_id="feedback-container"):
        with gr.Row(visible=False) as feedback_row:
            gr.Markdown("Was this response helpful?")
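Note: submit_feedback now recomputes the duration with session_manager.get_session_duration(session_id), a SessionManager method that sits outside the changed hunks. A minimal sketch of what such a method could look like, assuming the duration is simply the span from start_time to last_activity (both set in create_session above); this is an illustration, not the file's actual code:

def get_session_duration(self, session_id):
    """Seconds between session start and the most recent activity (illustrative sketch)."""
    session = self.sessions.get(session_id)
    if session is None:
        return 0  # unknown session: report zero rather than raising
    return round((session['last_activity'] - session['start_time']).total_seconds(), 2)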
auditqa/utils.py
CHANGED
@@ -14,17 +14,47 @@ import random
 
 def save_logs(scheduler, JSON_DATASET_PATH, logs, feedback=None) -> None:
     """ Every interaction with app saves the log of question and answer,
-        this is to get the usage statistics of app and evaluate model performances
-        Also
+        this is to get the usage statistics of the app and evaluate model performance.
+        Also adds feedback and a timestamp.
     """
     try:
-
-
+        # We get the timestamp here because we are simply recording the time of logging
+        current_time = datetime.now().timestamp()
+        logs["time"] = str(current_time)
 
+        # Save feedback (if any)
+        if feedback:
+            logs["feedback"] = feedback
+            logs["record_id"] = str(uuid4())
+
+        # Do some reordering to keep the records consistent (time up front)
+        field_order = [
+            "record_id",
+            "session_id",
+            "time",  # current log time
+            "session_duration_seconds",
+            "client_location",
+            "platform",
+            "system_prompt",
+            "sources",
+            "reports",
+            "subtype",
+            "year",
+            "question",
+            "retriever",
+            "endpoint_type",
+            "reader",
+            "docs",
+            "answer",
+            "feedback"
+        ]
+        ordered_logs = {k: logs.get(k) for k in field_order if k in logs}
+
         with scheduler.lock:
             with open(JSON_DATASET_PATH, 'a') as f:
-                json.dump(logs, f)
+                json.dump(ordered_logs, f)
                 f.write("\n")
+                logging.info("logging done")
     except Exception as e:
         raise
 
@@ -140,4 +170,15 @@ def get_client_location(ip_address) -> dict | None:
 
     except requests.exceptions.RequestException as e:
         logging.error(f"Request failed: {str(e)}")
-        return None
+        return None
+
+
+def get_platform_info(user_agent: str) -> str:
+    """Get platform info"""
+    # Make a best guess at the device type
+    if any(mobile_keyword in user_agent.lower() for mobile_keyword in ['mobile', 'android', 'iphone', 'ipad', 'ipod']):
+        platform_info = 'mobile'
+    else:
+        platform_info = 'desktop'
+
+    return platform_info
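The new get_platform_info helper is a plain keyword check, so it is easy to exercise directly. The User-Agent strings below are abbreviated illustrations, not captured traffic:

get_platform_info("Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X)")  # -> 'mobile'
get_platform_info("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")               # -> 'desktop'
get_platform_info("")  # -> 'desktop' (empty or unknown agents fall through to the default)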
requirements.txt
CHANGED
@@ -158,4 +158,8 @@ wcwidth==0.2.13
 websockets==11.0.3
 wheel==0.44.0
 xxhash==3.4.1
-yarl==1.9.4
+yarl==1.9.4
+# Platform info
+user-agents==2.2.0
+ua-parser==1.0.1
+ua-parser-builtins==0.18.0.post1