drewgenai committed
Commit 2ba4984 · 1 Parent(s): 4756a36

remove duplicate code and clean up; fix agentic app

Files changed (3)
  1. 01-cleanragcsv.ipynb +18 -51
  2. app.py +18 -12
  3. app_working_on_agentic.py +26 -24
01-cleanragcsv.ipynb CHANGED
@@ -268,13 +268,6 @@
     "Example: Groups all related pain-assessment questions into one chunk."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###testingbelow\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 11,
@@ -286,24 +279,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 13,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_361497/1110142159.py:7: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
-      " embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from langchain_experimental.text_splitter import SemanticChunker\n",
     "\n",
     "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
     "\n",
     "from langchain_huggingface import HuggingFaceEmbeddings\n",
+    "\n",
+    "\n",
     "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
     "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
     "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
@@ -319,22 +305,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#verify working\n",
-    "# test_doc = all_documents[0].page_content if all_documents else \"\"\n",
-    "# test_chunks = semantic_splitter.split_text(test_doc)\n",
-    "\n",
-    "# print(f\"\\n✅ Total Chunks for First Document: {len(test_chunks)}\")\n",
-    "# for i, chunk in enumerate(test_chunks[:3]): # Show first 3 chunks\n",
-    "#     print(f\"\\n🔹 Chunk {i+1}: {chunk[:300]}\") # Print first 300 characters\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -352,13 +323,6 @@
     "    documents_with_metadata.append(doc_chunk)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###testingabove"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 15,
@@ -372,7 +336,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -381,10 +345,6 @@
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
     "\n",
     "\n",
-    "# Load the SentenceTransformer model\n",
-    "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
-    "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
-    "\n",
     "# Load documents into Qdrant\n",
     "qdrant_vectorstore = Qdrant.from_documents(\n",
     "    documents_with_metadata,\n",
@@ -399,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -438,7 +398,6 @@
     "\n",
     "from langchain_openai import ChatOpenAI\n",
     "\n",
-    "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
     "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
     "\n",
     "from operator import itemgetter\n",
@@ -452,7 +411,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -516,9 +475,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ CSV file saved: matching_data_elements.csv\n"
+     ]
+    }
+   ],
    "source": [
     "import json\n",
     "import pandas as pd\n",
app.py CHANGED
@@ -28,19 +28,20 @@ os.makedirs(OUTPUT_PATH, exist_ok=True)
 
 model_id = "Snowflake/snowflake-arctic-embed-m"
 embedding_model = HuggingFaceEmbeddings(model_name=model_id)
-semantic_splitter = SemanticChunker(embedding_model)
+semantic_splitter = SemanticChunker(embedding_model, add_start_index=True, buffer_size=30)
 llm = ChatOpenAI(model="gpt-4o-mini")
 
 
-# comparison prompt
-export_prompt = """
-CONTEXT:
+# Export comparison prompt
+export_prompt = export_prompt = """
 CONTEXT:
 {context}
 
 QUERY:
 {question}
 
+You are a helpful assistant. Use the available context to answer the question.
+
 Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.
 
 ### **Output Format:**
@@ -103,18 +104,21 @@ def document_query_tool(question: str) -> str:
 
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
-        return "Error: No documents available for retrieval. Please upload documents first."
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
-    # Retrieve context from the vector database
-    retrieved_docs = retriever.invoke(question)
-    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
+    # Use a RAG chain similar to the comparison tool
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | query_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
 
-    # Generate response using the natural query prompt
-    messages = query_prompt.format_messages(question=question, context=docs_content)
-    response = llm.invoke(messages)
+    # Get the retrieved docs for context
+    retrieved_docs = retriever.invoke(question)
 
     return {
-        "messages": [HumanMessage(content=response.content)],
+        "messages": [HumanMessage(content=response_text)],
         "context": retrieved_docs
     }
 
@@ -126,6 +130,7 @@ def document_comparison_tool(question: str) -> str:
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
         return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
     # Process query using RAG
     rag_chain = (
@@ -178,6 +183,7 @@ async def process_files(files: list[cl.File]):
         return qdrant_vectorstore.as_retriever()
     return None
 
+
 @cl.on_chat_start
 async def start():
     cl.user_session.set("qdrant_retriever", None)
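Note: the document_query_tool rewrite above swaps manual retrieve/format/invoke steps for the same LCEL chain shape that document_comparison_tool already uses. Below is a minimal sketch of that pattern, reusing the retriever from the notebook sketch and a hypothetical query_prompt (the app's real prompt isn't shown in this diff). One caveat: with_config({"k": 10}) sets runnable config rather than search parameters; retrieval depth is usually set with as_retriever(search_kwargs={"k": 10}).

    from operator import itemgetter

    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model="gpt-4o-mini")

    # Hypothetical stand-in for the app's query_prompt
    query_prompt = ChatPromptTemplate.from_template(
        "CONTEXT:\n{context}\n\nQUERY:\n{question}\n\nAnswer using only the context above."
    )

    # The dict step fans the input out: itemgetter("question") | retriever fetches
    # the context documents, while the second entry forwards the question unchanged.
    rag_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | query_prompt
        | llm
        | StrOutputParser()
    )

    response_text = rag_chain.invoke({"question": "Which sections cover pain assessment?"})

The tool then calls retriever.invoke(question) once more to return the documents alongside the answer, so retrieval runs twice per query.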
app_working_on_agentic.py CHANGED
@@ -31,14 +31,10 @@ os.makedirs(OUTPUT_PATH, exist_ok=True)
 # Initialize embeddings model
 model_id = "Snowflake/snowflake-arctic-embed-m"
 embedding_model = HuggingFaceEmbeddings(model_name=model_id)
-
-# Define semantic chunker
-semantic_splitter = SemanticChunker(embedding_model)
-
-# Initialize LLM
+semantic_splitter = SemanticChunker(embedding_model, add_start_index=True, buffer_size=30)
 llm = ChatOpenAI(model="gpt-4o-mini")
 
-# Define RAG prompt
+# Export comparison prompt
 export_prompt = """
 CONTEXT:
 {context}
@@ -108,18 +104,21 @@ def document_query_tool(question: str) -> str:
 
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
-        return "Error: No documents available for retrieval. Please upload documents first."
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
-    # Retrieve context from the vector database
-    retrieved_docs = retriever.invoke(question)
-    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
+    # Use a RAG chain similar to the comparison tool
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | query_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
 
-    # Generate response using the natural query prompt
-    messages = query_prompt.format_messages(question=question, context=docs_content)
-    response = llm.invoke(messages)
+    # Get the retrieved docs for context
+    retrieved_docs = retriever.invoke(question)
 
     return {
-        "answer": response.content,
+        "messages": [HumanMessage(content=response_text)],
         "context": retrieved_docs
     }
 
@@ -153,7 +152,16 @@ def document_comparison_tool(question: str) -> str:
         df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
         df.to_csv(file_path, index=False)
 
-        return file_path  # Return path to the CSV file
+        # Send the message with the file directly from the tool
+        cl.run_sync(
+            cl.Message(
+                content="Comparison complete! Download the CSV below:",
+                elements=[cl.File(name="comparison_results.csv", path=file_path, display="inline")],
+            ).send()
+        )
+
+        # Return a simple confirmation message
+        return "Comparison results have been generated and displayed."
 
     except json.JSONDecodeError:
         return "Error: Response is not valid JSON."
@@ -258,17 +266,11 @@ async def handle_message(message: cl.Message):
     )
 
     # Handle the response based on the tool that was called
-    if isinstance(response["output"], dict) and "answer" in response["output"]:
+    if isinstance(response["output"], dict) and "messages" in response["output"]:
         # This is from document_query_tool
-        await cl.Message(response["output"]["answer"]).send()
-    elif isinstance(response["output"], str) and response["output"].endswith(".csv"):
-        # This is from document_comparison_tool with a CSV file
-        await cl.Message(
-            content="Comparison complete! Download the CSV below:",
-            elements=[cl.File(name="comparison_results.csv", path=response["output"], display="inline")],
-        ).send()
+        await cl.Message(response["output"]["messages"][0].content).send()
     else:
-        # Generic response
+        # Generic response (including the confirmation from document_comparison_tool)
         await cl.Message(content=str(response["output"])).send()
 
     # Update chat history with the new exchange
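Note: the document_comparison_tool change is the core of the agentic fix — the synchronous tool now pushes the CSV to the Chainlit UI itself and returns a plain string, so handle_message can drop the .endswith(".csv") branch removed in the last hunk. Below is a minimal sketch of the pattern (the file path is a placeholder):

    import chainlit as cl

    def send_csv_from_tool(file_path: str) -> str:
        # cl.run_sync drives the async send() to completion from synchronous
        # tool code running inside Chainlit's event loop
        cl.run_sync(
            cl.Message(
                content="Comparison complete! Download the CSV below:",
                elements=[cl.File(name="comparison_results.csv", path=file_path, display="inline")],
            ).send()
        )
        # A plain-string return falls through to the agent's generic response branch
        return "Comparison results have been generated and displayed."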