mtyrrell commited on
Commit
ff62217
·
1 Parent(s): 17923a5

record_id, timestamp & duration fixes; extract platform from user agent

Browse files
Files changed (3) hide show
  1. app.py +41 -17
  2. auditqa/utils.py +47 -6
  3. requirements.txt +5 -1
app.py CHANGED
@@ -12,7 +12,7 @@ from auditqa.reports import files, report_list, new_files, new_report_list
12
  from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant, load_new_chunks
13
  from auditqa.retriever import get_context
14
  from auditqa.reader import nvidia_client, dedicated_endpoint
15
- from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template, get_client_location, get_client_ip
16
  from dotenv import load_dotenv
17
  load_dotenv()
18
  from threading import Lock
@@ -22,7 +22,7 @@ import json
22
 
23
  # # fetch tokens and model config params
24
  SPACES_LOG = os.environ["SPACES_LOG"]
25
- SPACES_LOG = os.getenv('SPACES_LOG')
26
 
27
  model_config = getconfig("model_params.cfg")
28
 
@@ -38,8 +38,7 @@ scheduler = CommitScheduler(
38
  repo_type="dataset",
39
  folder_path=JSON_DATASET_DIR,
40
  path_in_repo="audit_chatbot",
41
- token=SPACES_LOG,
42
- every=2) # TESTING: every 2 seconds
43
 
44
  #####--------------- VECTOR STORE -------------------------------------------------
45
  # reports contain the already created chunks from Markdown version of pdf reports
@@ -105,24 +104,32 @@ def submit_feedback(feedback, logs_data):
105
  if logs_data is None:
106
  return gr.update(visible=False), gr.update(visible=True)
107
 
 
 
 
 
 
 
 
 
108
  save_logs(scheduler, JSON_DATASET_PATH, logs_data, feedback)
109
  return gr.update(visible=False), gr.update(visible=True)
110
  except Exception as e:
111
- # Still need to return the expected outputs even on error
112
  return gr.update(visible=False), gr.update(visible=True)
113
 
114
- # Session Manager added (track session duration & location)
115
  class SessionManager:
116
  def __init__(self):
117
  self.sessions = {}
118
 
119
- def create_session(self, client_ip):
120
  session_id = str(uuid4())
121
  self.sessions[session_id] = {
122
  'start_time': datetime.now(),
123
  'last_activity': datetime.now(),
124
  'client_ip': client_ip,
125
- 'location_info': get_client_location(client_ip)
 
126
  }
127
  return session_id
128
 
@@ -143,11 +150,12 @@ class SessionManager:
143
  # Initialize session manager
144
  session_manager = SessionManager()
145
 
146
- async def chat(query, history, sources, reports, subtype, year, client_ip=None, session_id=None):
147
  """Update chat function to handle session data"""
148
 
149
- if not session_id: # Session managment
150
- session_id = session_manager.create_session(client_ip)
 
151
  else:
152
  session_manager.update_session(session_id)
153
 
@@ -209,12 +217,13 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
209
 
210
  ##-----------------------get answer from endpoints------------------------------
211
  answer_yet = ""
212
- # Logs strcuture updated for feedback + session data (moved up here because: feedback)
213
- timestamp = str(datetime.now().timestamp())
214
  logs_data = {
 
215
  "session_id": session_id,
216
  "session_duration_seconds": session_duration,
217
  "client_location": session_data['location_info'],
 
218
  # "system_prompt": SYSTEM_PROMPT, #REMOVED FOR TESTING
219
  # "sources": sources, #REMOVED FOR TESTING
220
  # "reports": reports, #REMOVED FOR TESTING
@@ -225,8 +234,6 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
225
  "endpoint_type": model_config.get('reader','TYPE'),
226
  "reader": model_config.get('reader','NVIDIA_MODEL'),
227
  # "docs": [doc.page_content for doc in context_retrieved], #REMOVED FOR TESTING
228
- "answer": "",
229
- "time": timestamp,
230
  }
231
 
232
  if model_config.get('reader','TYPE') == 'NVIDIA':
@@ -280,7 +287,7 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
280
  answer_yet += word + " "
281
  parsed_answer = parse_output_llm_with_sources(answer_yet)
282
  history[-1] = (query, parsed_answer)
283
- # Update logs_data with current answer
284
  logs_data["answer"] = parsed_answer
285
  yield [tuple(x) for x in history], docs_html, logs_data, session_id
286
  await asyncio.sleep(0.05)
@@ -291,6 +298,23 @@ async def chat(query, history, sources, reports, subtype, year, client_ip=None,
291
  async for update in process_stream():
292
  yield update
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  try:
295
  # Save log after streaming is complete
296
  save_logs(scheduler, JSON_DATASET_PATH, logs_data)
@@ -337,7 +361,7 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
337
  avatar_images = (None,"data-collection.png"),
338
  )
339
 
340
- # Add feedback elements directly under the chatbot
341
  with gr.Column(elem_id="feedback-container"):
342
  with gr.Row(visible=False) as feedback_row:
343
  gr.Markdown("Was this response helpful?")
 
12
  from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant, load_new_chunks
13
  from auditqa.retriever import get_context
14
  from auditqa.reader import nvidia_client, dedicated_endpoint
15
+ from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template, get_client_location, get_client_ip, get_platform_info
16
  from dotenv import load_dotenv
17
  load_dotenv()
18
  from threading import Lock
 
22
 
23
  # # fetch tokens and model config params
24
  SPACES_LOG = os.environ["SPACES_LOG"]
25
+ # SPACES_LOG = os.getenv('SPACES_LOG')
26
 
27
  model_config = getconfig("model_params.cfg")
28
 
 
38
  repo_type="dataset",
39
  folder_path=JSON_DATASET_DIR,
40
  path_in_repo="audit_chatbot",
41
+ token=SPACES_LOG)
 
42
 
43
  #####--------------- VECTOR STORE -------------------------------------------------
44
  # reports contain the already created chunks from Markdown version of pdf reports
 
104
  if logs_data is None:
105
  return gr.update(visible=False), gr.update(visible=True)
106
 
107
+ session_id = logs_data.get("session_id")
108
+ if session_id:
109
+ # Update session last_activity to now
110
+ session_manager.update_session(session_id)
111
+ # Compute duration from the session manager and update the log.
112
+ logs_data["session_duration_seconds"] = session_manager.get_session_duration(session_id)
113
+
114
+ # Now save the (feedback) log record
115
  save_logs(scheduler, JSON_DATASET_PATH, logs_data, feedback)
116
  return gr.update(visible=False), gr.update(visible=True)
117
  except Exception as e:
 
118
  return gr.update(visible=False), gr.update(visible=True)
119
 
120
+ # Session Manager added (track session duration, location, and platform)
121
  class SessionManager:
122
  def __init__(self):
123
  self.sessions = {}
124
 
125
+ def create_session(self, client_ip, user_agent):
126
  session_id = str(uuid4())
127
  self.sessions[session_id] = {
128
  'start_time': datetime.now(),
129
  'last_activity': datetime.now(),
130
  'client_ip': client_ip,
131
+ 'location_info': get_client_location(client_ip),
132
+ 'platform_info': get_platform_info(user_agent)
133
  }
134
  return session_id
135
 
 
150
  # Initialize session manager
151
  session_manager = SessionManager()
152
 
153
+ async def chat(query, history, sources, reports, subtype, year, client_ip=None, session_id=None, request: gr.Request = None):
154
  """Update chat function to handle session data"""
155
 
156
+ if not session_id:
157
+ user_agent = request.headers.get('User-Agent', '') if request else ''
158
+ session_id = session_manager.create_session(client_ip, user_agent)
159
  else:
160
  session_manager.update_session(session_id)
161
 
 
217
 
218
  ##-----------------------get answer from endpoints------------------------------
219
  answer_yet = ""
220
+ # Logs structure updated for session data (feedback and timestamp added separately via save_logs)
 
221
  logs_data = {
222
+ "record_id": str(uuid4()), # Add unique record ID
223
  "session_id": session_id,
224
  "session_duration_seconds": session_duration,
225
  "client_location": session_data['location_info'],
226
+ "platform": session_data['platform_info'],
227
  # "system_prompt": SYSTEM_PROMPT, #REMOVED FOR TESTING
228
  # "sources": sources, #REMOVED FOR TESTING
229
  # "reports": reports, #REMOVED FOR TESTING
 
234
  "endpoint_type": model_config.get('reader','TYPE'),
235
  "reader": model_config.get('reader','NVIDIA_MODEL'),
236
  # "docs": [doc.page_content for doc in context_retrieved], #REMOVED FOR TESTING
 
 
237
  }
238
 
239
  if model_config.get('reader','TYPE') == 'NVIDIA':
 
287
  answer_yet += word + " "
288
  parsed_answer = parse_output_llm_with_sources(answer_yet)
289
  history[-1] = (query, parsed_answer)
290
+ # Update logs_data with the current partial answer (timestamp is added later in save_logs)
291
  logs_data["answer"] = parsed_answer
292
  yield [tuple(x) for x in history], docs_html, logs_data, session_id
293
  await asyncio.sleep(0.05)
 
298
  async for update in process_stream():
299
  yield update
300
 
301
+ # chat_model = dedicated_endpoint()
302
+ # async def process_stream():
303
+ # # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
304
+ # # instead of modifying the one from the outer scope.
305
+ # nonlocal answer_yet # Use the outer scope's answer_yet variable
306
+ # # Iterate over the streaming response chunks
307
+ # async for chunk in chat_model.astream(messages):
308
+ # token = chunk.content
309
+ # answer_yet += token
310
+ # parsed_answer = parse_output_llm_with_sources(answer_yet)
311
+ # history[-1] = (query, parsed_answer)
312
+ # yield [tuple(x) for x in history], docs_html
313
+
314
+ # # Stream the response updates
315
+ # async for update in process_stream():
316
+ # yield update
317
+
318
  try:
319
  # Save log after streaming is complete
320
  save_logs(scheduler, JSON_DATASET_PATH, logs_data)
 
361
  avatar_images = (None,"data-collection.png"),
362
  )
363
 
364
+ # feedback UI
365
  with gr.Column(elem_id="feedback-container"):
366
  with gr.Row(visible=False) as feedback_row:
367
  gr.Markdown("Was this response helpful?")
auditqa/utils.py CHANGED
@@ -14,17 +14,47 @@ import random
14
 
15
  def save_logs(scheduler, JSON_DATASET_PATH, logs, feedback=None) -> None:
16
  """ Every interaction with app saves the log of question and answer,
17
- this is to get the usage statistics of app and evaluate model performances.
18
- Also saves user feedback (when provided).
19
  """
20
  try:
21
- if feedback:
22
- logs["feedback"] = feedback #optional
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  with scheduler.lock:
25
  with open(JSON_DATASET_PATH, 'a') as f:
26
- json.dump(logs, f)
27
  f.write("\n")
 
28
  except Exception as e:
29
  raise
30
 
@@ -140,4 +170,15 @@ def get_client_location(ip_address) -> dict | None:
140
 
141
  except requests.exceptions.RequestException as e:
142
  logging.error(f"Request failed: {str(e)}")
143
- return None
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def save_logs(scheduler, JSON_DATASET_PATH, logs, feedback=None) -> None:
16
  """ Every interaction with app saves the log of question and answer,
17
+ this is to get the usage statistics of app and evaluate model performances
18
+ Also adding feedback and timestamp
19
  """
20
  try:
21
+ # We get the timestamp here now because we are simply recording time of logging
22
+ current_time = datetime.now().timestamp()
23
+ logs["time"] = str(current_time)
24
 
25
+ # Save feedback (if any)
26
+ if feedback:
27
+ logs["feedback"] = feedback
28
+ logs["record_id"] = str(uuid4())
29
+
30
+ # Do some reordering to keep things clean (time up front)
31
+ field_order = [
32
+ "record_id",
33
+ "session_id",
34
+ "time", # current log time
35
+ "session_duration_seconds",
36
+ "client_location",
37
+ "platform",
38
+ "system_prompt",
39
+ "sources",
40
+ "reports",
41
+ "subtype",
42
+ "year",
43
+ "question",
44
+ "retriever",
45
+ "endpoint_type",
46
+ "reader",
47
+ "docs",
48
+ "answer",
49
+ "feedback"
50
+ ]
51
+ ordered_logs = {k: logs.get(k) for k in field_order if k in logs}
52
+
53
  with scheduler.lock:
54
  with open(JSON_DATASET_PATH, 'a') as f:
55
+ json.dump(ordered_logs, f)
56
  f.write("\n")
57
+ logging.info("logging done")
58
  except Exception as e:
59
  raise
60
 
 
170
 
171
  except requests.exceptions.RequestException as e:
172
  logging.error(f"Request failed: {str(e)}")
173
+ return None
174
+
175
+
176
+ def get_platform_info(user_agent: str) -> str:
177
+ """Get platform info"""
178
+ # Make a best guess at the device type
179
+ if any(mobile_keyword in user_agent.lower() for mobile_keyword in ['mobile', 'android', 'iphone', 'ipad', 'ipod']):
180
+ platform_info = 'mobile'
181
+ else:
182
+ platform_info = 'desktop'
183
+
184
+ return platform_info
requirements.txt CHANGED
@@ -158,4 +158,8 @@ wcwidth==0.2.13
158
  websockets==11.0.3
159
  wheel==0.44.0
160
  xxhash==3.4.1
161
- yarl==1.9.4
 
 
 
 
 
158
  websockets==11.0.3
159
  wheel==0.44.0
160
  xxhash==3.4.1
161
+ yarl==1.9.4
162
+ # Platform info
163
+ user-agents==2.2.0
164
+ ua-parser==1.0.1
165
+ ua-parser-builtins==0.18.0.post1