remove duplicate code and clean up. fix agentic app
Files changed:
- 01-cleanragcsv.ipynb (+18 -51)
- app.py (+18 -12)
- app_working_on_agentic.py (+26 -24)
01-cleanragcsv.ipynb
CHANGED
@@ -268,13 +268,6 @@
     "Example: Groups all related pain-assessment questions into one chunk."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###testingbelow\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 11,
@@ -286,24 +279,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 13,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_361497/1110142159.py:7: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
-      "  embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from langchain_experimental.text_splitter import SemanticChunker\n",
     "\n",
     "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
     "\n",
     "from langchain_huggingface import HuggingFaceEmbeddings\n",
+    "\n",
+    "\n",
     "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
     "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
     "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
@@ -319,22 +305,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#verify working\n",
-    "# test_doc = all_documents[0].page_content if all_documents else \"\"\n",
-    "# test_chunks = semantic_splitter.split_text(test_doc)\n",
-    "\n",
-    "# print(f\"\\n✅ Total Chunks for First Document: {len(test_chunks)}\")\n",
-    "# for i, chunk in enumerate(test_chunks[:3]): # Show first 3 chunks\n",
-    "#     print(f\"\\n🔹 Chunk {i+1}: {chunk[:300]}\") # Print first 300 characters\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -352,13 +323,6 @@
     " documents_with_metadata.append(doc_chunk)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###testingabove"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 15,
@@ -372,7 +336,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -381,10 +345,6 @@
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
     "\n",
     "\n",
-    "# Load the SentenceTransformer model\n",
-    "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
-    "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
-    "\n",
     "# Load documents into Qdrant\n",
     "qdrant_vectorstore = Qdrant.from_documents(\n",
     "    documents_with_metadata,\n",
@@ -399,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -438,7 +398,6 @@
    "\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
-   "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
    "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
    "\n",
    "from operator import itemgetter\n",
@@ -452,7 +411,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -516,9 +475,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ CSV file saved: matching_data_elements.csv\n"
+     ]
+    }
+   ],
    "source": [
     "import json\n",
     "import pandas as pd\n",
app.py
CHANGED
@@ -28,19 +28,20 @@ os.makedirs(OUTPUT_PATH, exist_ok=True)
 
 model_id = "Snowflake/snowflake-arctic-embed-m"
 embedding_model = HuggingFaceEmbeddings(model_name=model_id)
-semantic_splitter = SemanticChunker(embedding_model)
+semantic_splitter = SemanticChunker(embedding_model, add_start_index=True, buffer_size=30)
 llm = ChatOpenAI(model="gpt-4o-mini")
 
 
-# comparison prompt
-export_prompt = """
-CONTEXT:
+# Export comparison prompt
+export_prompt = export_prompt = """
 CONTEXT:
 {context}
 
 QUERY:
 {question}
 
+You are a helpful assistant. Use the available context to answer the question.
+
 Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.
 
 ### **Output Format:**
@@ -103,18 +104,21 @@ def document_query_tool(question: str) -> str:
 
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
-        return "Error: No documents available for retrieval. Please upload
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
-    #
-
-
+    # Use a RAG chain similar to the comparison tool
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | query_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
 
-    #
-
-    response = llm.invoke(messages)
+    # Get the retrieved docs for context
+    retrieved_docs = retriever.invoke(question)
 
     return {
-        "messages": [HumanMessage(content=
+        "messages": [HumanMessage(content=response_text)],
         "context": retrieved_docs
     }
 
@@ -126,6 +130,7 @@ def document_comparison_tool(question: str) -> str:
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
         return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
     # Process query using RAG
     rag_chain = (
@@ -178,6 +183,7 @@ async def process_files(files: list[cl.File]):
         return qdrant_vectorstore.as_retriever()
     return None
 
+
 @cl.on_chat_start
 async def start():
     cl.user_session.set("qdrant_retriever", None)
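
The one-line chunker change is the substantive setup edit here: in langchain_experimental's SemanticChunker, buffer_size widens the window of neighboring sentences grouped together when computing embedding-distance breakpoints, and add_start_index records each chunk's character offset in its metadata. A small sketch of what the new settings do (the sample text is illustrative):

# Sketch: behavior of the new SemanticChunker parameters.
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-m")
semantic_splitter = SemanticChunker(
    embedding_model,
    buffer_size=30,        # group 30 neighboring sentences around each sentence before embedding
    add_start_index=True,  # each chunk gets metadata["start_index"], its offset in the source text
)

chunks = semantic_splitter.create_documents(
    ["Assess pain location. Assess pain severity. Record vital signs."]
)
for chunk in chunks:
    print(chunk.metadata.get("start_index"), repr(chunk.page_content[:50]))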
app_working_on_agentic.py
CHANGED
@@ -31,14 +31,10 @@ os.makedirs(OUTPUT_PATH, exist_ok=True)
 # Initialize embeddings model
 model_id = "Snowflake/snowflake-arctic-embed-m"
 embedding_model = HuggingFaceEmbeddings(model_name=model_id)
-
-# Define semantic chunker
-semantic_splitter = SemanticChunker(embedding_model)
-
-# Initialize LLM
+semantic_splitter = SemanticChunker(embedding_model, add_start_index=True, buffer_size=30)
 llm = ChatOpenAI(model="gpt-4o-mini")
 
-#
+# Export comparison prompt
 export_prompt = """
 CONTEXT:
 {context}
@@ -108,18 +104,21 @@ def document_query_tool(question: str) -> str:
 
     retriever = cl.user_session.get("qdrant_retriever")
     if not retriever:
-        return "Error: No documents available for retrieval. Please upload
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
 
-    #
-
-
+    # Use a RAG chain similar to the comparison tool
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | query_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
 
-    #
-
-    response = llm.invoke(messages)
+    # Get the retrieved docs for context
+    retrieved_docs = retriever.invoke(question)
 
     return {
-        "
+        "messages": [HumanMessage(content=response_text)],
         "context": retrieved_docs
     }
 
@@ -153,7 +152,16 @@ def document_comparison_tool(question: str) -> str:
         df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
         df.to_csv(file_path, index=False)
 
-
+        # Send the message with the file directly from the tool
+        cl.run_sync(
+            cl.Message(
+                content="Comparison complete! Download the CSV below:",
+                elements=[cl.File(name="comparison_results.csv", path=file_path, display="inline")],
+            ).send()
+        )
+
+        # Return a simple confirmation message
+        return "Comparison results have been generated and displayed."
 
     except json.JSONDecodeError:
         return "Error: Response is not valid JSON."
@@ -258,17 +266,11 @@ async def handle_message(message: cl.Message):
         )
 
         # Handle the response based on the tool that was called
-        if isinstance(response["output"], dict) and "
+        if isinstance(response["output"], dict) and "messages" in response["output"]:
            # This is from document_query_tool
-            await cl.Message(response["output"]["
-        elif isinstance(response["output"], str) and response["output"].endswith(".csv"):
-            # This is from document_comparison_tool with a CSV file
-            await cl.Message(
-                content="Comparison complete! Download the CSV below:",
-                elements=[cl.File(name="comparison_results.csv", path=response["output"], display="inline")],
-            ).send()
+            await cl.Message(response["output"]["messages"][0].content).send()
         else:
-            # Generic response
+            # Generic response (including the confirmation from document_comparison_tool)
            await cl.Message(content=str(response["output"])).send()
 
         # Update chat history with the new exchange