remove duplicate code and clean up. fix agentic app
Files changed:
- 01-cleanragcsv.ipynb (+18 -51)
- app.py (+18 -12)
- app_working_on_agentic.py (+26 -24)
01-cleanragcsv.ipynb
CHANGED
@@ -268,13 +268,6 @@
     "Example: Groups all related pain-assessment questions into one chunk."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###testingbelow\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 11,
@@ -286,24 +279,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 13,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_361497/1110142159.py:7: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
-      "  embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from langchain_experimental.text_splitter import SemanticChunker\n",
     "\n",
     "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
     "\n",
     "from langchain_huggingface import HuggingFaceEmbeddings\n",
+    "\n",
+    "\n",
     "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
     "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
     "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
@@ -319,22 +305,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#verify working\n",
-    "# test_doc = all_documents[0].page_content if all_documents else \"\"\n",
-    "# test_chunks = semantic_splitter.split_text(test_doc)\n",
-    "\n",
-    "# print(f\"\\n✅ Total Chunks for First Document: {len(test_chunks)}\")\n",
-    "# for i, chunk in enumerate(test_chunks[:3]): # Show first 3 chunks\n",
-    "#     print(f\"\\n🔹 Chunk {i+1}: {chunk[:300]}\") # Print first 300 characters\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -352,13 +323,6 @@
     " documents_with_metadata.append(doc_chunk)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###testingabove"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 15,
@@ -372,7 +336,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -381,10 +345,6 @@
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
     "\n",
     "\n",
-    "# Load the SentenceTransformer model\n",
-    "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
-    "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
-    "\n",
     "# Load documents into Qdrant\n",
     "qdrant_vectorstore = Qdrant.from_documents(\n",
     "    documents_with_metadata,\n",
@@ -399,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -438,7 +398,6 @@
    "\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
-   "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
    "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
    "\n",
    "from operator import itemgetter\n",
@@ -452,7 +411,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -516,9 +475,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ CSV file saved: matching_data_elements.csv\n"
+     ]
+    }
+   ],
    "source": [
     "import json\n",
     "import pandas as pd\n",
app.py
CHANGED
@@ -28,19 +28,20 @@ os.makedirs(OUTPUT_PATH, exist_ok=True)
 
 model_id = "Snowflake/snowflake-arctic-embed-m"
 embedding_model = HuggingFaceEmbeddings(model_name=model_id)
-semantic_splitter = SemanticChunker(embedding_model)
+semantic_splitter = SemanticChunker(embedding_model, add_start_index=True, buffer_size=30)
 llm = ChatOpenAI(model="gpt-4o-mini")
 
 
-# comparison prompt
-export_prompt = """
-CONTEXT:
+# Export comparison prompt
+export_prompt = export_prompt = """
 CONTEXT:
 {context}
 
 QUERY:
 {question}
 
+You are a helpful assistant. Use the available context to answer the question.
+
 Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.
 
 ### **Output Format:**
@@ -103,18 +104,21 @@ def document_query_tool(question: str) -> str:
 
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
-        return "Error: No documents available for retrieval. Please upload
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
-    #
-
-
+    # Use a RAG chain similar to the comparison tool
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | query_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
 
-    #
-
-    response = llm.invoke(messages)
+    # Get the retrieved docs for context
+    retrieved_docs = retriever.invoke(question)
 
     return {
-        "messages": [HumanMessage(content=
+        "messages": [HumanMessage(content=response_text)],
         "context": retrieved_docs
     }
 
@@ -126,6 +130,7 @@ def document_comparison_tool(question: str) -> str:
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
         return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
     # Process query using RAG
     rag_chain = (
@@ -178,6 +183,7 @@ async def process_files(files: list[cl.File]):
         return qdrant_vectorstore.as_retriever()
     return None
 
+
 @cl.on_chat_start
 async def start():
     cl.user_session.set("qdrant_retriever", None)
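
The one-line chunker change is the substantive setup edit here: in langchain_experimental's SemanticChunker, buffer_size widens the window of neighboring sentences grouped together when computing embedding-distance breakpoints, and add_start_index records each chunk's character offset in its metadata. A small sketch of what the new settings do (the sample text is illustrative):

# Sketch: behavior of the new SemanticChunker parameters.
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-m")
semantic_splitter = SemanticChunker(
    embedding_model,
    buffer_size=30,        # group 30 neighboring sentences around each sentence before embedding
    add_start_index=True,  # each chunk gets metadata["start_index"], its offset in the source text
)

chunks = semantic_splitter.create_documents(
    ["Assess pain location. Assess pain severity. Record vital signs."]
)
for chunk in chunks:
    print(chunk.metadata.get("start_index"), repr(chunk.page_content[:50]))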
app_working_on_agentic.py
CHANGED
@@ -31,14 +31,10 @@ os.makedirs(OUTPUT_PATH, exist_ok=True)
 # Initialize embeddings model
 model_id = "Snowflake/snowflake-arctic-embed-m"
 embedding_model = HuggingFaceEmbeddings(model_name=model_id)
-
-# Define semantic chunker
-semantic_splitter = SemanticChunker(embedding_model)
-
-# Initialize LLM
+semantic_splitter = SemanticChunker(embedding_model, add_start_index=True, buffer_size=30)
 llm = ChatOpenAI(model="gpt-4o-mini")
 
-#
+# Export comparison prompt
 export_prompt = """
 CONTEXT:
 {context}
@@ -108,18 +104,21 @@ def document_query_tool(question: str) -> str:
 
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
-        return "Error: No documents available for retrieval. Please upload
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
-    #
-
-
+    # Use a RAG chain similar to the comparison tool
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | query_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
 
-    #
-
-    response = llm.invoke(messages)
+    # Get the retrieved docs for context
+    retrieved_docs = retriever.invoke(question)
 
     return {
-        "
+        "messages": [HumanMessage(content=response_text)],
         "context": retrieved_docs
     }
 
@@ -153,7 +152,16 @@ def document_comparison_tool(question: str) -> str:
         df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
         df.to_csv(file_path, index=False)
 
-
+        # Send the message with the file directly from the tool
+        cl.run_sync(
+            cl.Message(
+                content="Comparison complete! Download the CSV below:",
+                elements=[cl.File(name="comparison_results.csv", path=file_path, display="inline")],
+            ).send()
+        )
+
+        # Return a simple confirmation message
+        return "Comparison results have been generated and displayed."
 
     except json.JSONDecodeError:
         return "Error: Response is not valid JSON."
@@ -258,17 +266,11 @@ async def handle_message(message: cl.Message):
         )
 
         # Handle the response based on the tool that was called
-        if isinstance(response["output"], dict) and "
+        if isinstance(response["output"], dict) and "messages" in response["output"]:
            # This is from document_query_tool
-            await cl.Message(response["output"]["
-        elif isinstance(response["output"], str) and response["output"].endswith(".csv"):
-            # This is from document_comparison_tool with a CSV file
-            await cl.Message(
-                content="Comparison complete! Download the CSV below:",
-                elements=[cl.File(name="comparison_results.csv", path=response["output"], display="inline")],
-            ).send()
+            await cl.Message(response["output"]["messages"][0].content).send()
         else:
-            # Generic response
+            # Generic response (including the confirmation from document_comparison_tool)
            await cl.Message(content=str(response["output"])).send()
 
         # Update chat history with the new exchange