Support txt and pdf
Files changed:
- aimakerspace/openai_utils/embedding.py +12 -0
- aimakerspace/pdfloader.py +60 -0
- aimakerspace/semantic_chunking.py +60 -0
- app.py +41 -13
- requirements.txt +6 -1
aimakerspace/openai_utils/embedding.py
CHANGED
@@ -48,6 +48,18 @@ class EmbeddingModel:
 
         return embedding.data[0].embedding
 
+    def embed_documents(self, list_of_text: List[str]) -> List[List[float]]:
+        """
+        Embed a list of documents (text strings) using the OpenAI embeddings model.
+
+        Args:
+            list_of_text (List[str]): A list of text strings to be embedded.
+
+        Returns:
+            List[List[float]]: A list of embedding vectors.
+        """
+        return self.get_embeddings(list_of_text)
+
 
 if __name__ == "__main__":
     embedding_model = EmbeddingModel()
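The new `embed_documents` method gives `EmbeddingModel` the method name that LangChain's embeddings interface expects, which is what lets an instance be handed to `SemanticChunker` below. A minimal usage sketch (assumes `OPENAI_API_KEY` is set in the environment; the sample strings are illustrative):

    from aimakerspace.openai_utils.embedding import EmbeddingModel

    embedding_model = EmbeddingModel()
    # One embedding vector comes back per input string.
    vectors = embedding_model.embed_documents(["hello world", "goodbye moon"])
    print(len(vectors))     # 2
    print(len(vectors[0]))  # embedding dimension, e.g. 1536 for text-embedding-ada-002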
aimakerspace/pdfloader.py
ADDED
@@ -0,0 +1,60 @@
+from langchain_community.document_loaders import PyPDFLoader
+
+
+class PDFLoader:
+    def __init__(self, file_path: str):
+        """
+        Initialize the PDFLoader class with the path to the PDF file.
+
+        Args:
+            file_path (str): The path to the PDF file to be loaded.
+        """
+        self.file_path = file_path
+        self.loader = PyPDFLoader(self.file_path)
+        self.pages = None
+
+    def load_and_split(self):
+        """
+        Load and split the PDF file into pages.
+
+        Returns:
+            list: A list of pages after loading and splitting the PDF file.
+        """
+        self.pages = self.loader.load_and_split()
+        return self.pages
+
+    def get_page(self, page_number: int):
+        """
+        Get a specific page from the loaded PDF.
+
+        Args:
+            page_number (int): The page number to retrieve.
+
+        Returns:
+            dict: The content of the specified page.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        if page_number < 1 or page_number > len(self.pages):
+            raise ValueError(
+                f"Page number out of range. Please choose a value between 1 and {len(self.pages)}."
+            )
+
+        return self.pages[page_number - 1]
+
+    def get_total_pages(self):
+        """
+        Get the total number of pages in the PDF.
+
+        Returns:
+            int: The total number of pages.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        return len(self.pages)
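A quick sketch of how `PDFLoader` is meant to be driven (the file path is a placeholder; `load_and_split` must run before the page accessors, and each page comes back as a LangChain `Document` rather than the dict the docstring mentions):

    from aimakerspace.pdfloader import PDFLoader

    loader = PDFLoader("example.pdf")   # placeholder path
    pages = loader.load_and_split()     # populates loader.pages
    print(loader.get_total_pages())     # total page count
    first = loader.get_page(1)          # pages are 1-indexed here
    print(first.page_content[:200])     # Document objects expose .page_content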
aimakerspace/semantic_chunking.py
ADDED
@@ -0,0 +1,60 @@
+from langchain_experimental.text_splitter import SemanticChunker
+from typing import List
+
+
+class SemanticChunking:
+    def __init__(self, embedding_model, breakpoint_threshold_type="percentile"):
+        """
+        Initialize the SemanticChunking class with a specified breakpoint threshold type and an embedding model.
+
+        Args:
+            embedding_model: An instance of the EmbeddingModel class to generate embeddings.
+            breakpoint_threshold_type (str): The type of breakpoint threshold to use for chunking.
+                Options include 'percentile', 'standard_deviation', 'interquartile'.
+        """
+        self.text_splitter = SemanticChunker(
+            embedding_model, breakpoint_threshold_type=breakpoint_threshold_type
+        )
+
+    def split_text(self, text: str) -> List:
+        """
+        Split the provided text into semantic chunks.
+
+        Args:
+            text (str): The text to be split into chunks.
+
+        Returns:
+            list: A list of documents (chunks) obtained from the text.
+        """
+        docs = self.text_splitter.create_documents([text])
+        return docs
+
+    def get_chunk(self, docs: List, chunk_index: int) -> str:
+        """
+        Get a specific chunk from the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+            chunk_index (int): The index of the chunk to retrieve.
+
+        Returns:
+            str: The content of the specified chunk.
+        """
+        if chunk_index < 0 or chunk_index >= len(docs):
+            raise ValueError(
+                f"Chunk index out of range. Please choose a value between 0 and {len(docs) - 1}."
+            )
+
+        return docs[chunk_index].page_content
+
+    def get_total_chunks(self, docs: List) -> int:
+        """
+        Get the total number of chunks in the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+
+        Returns:
+            int: The total number of chunks.
+        """
+        return len(docs)
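Because `SemanticChunker` calls `embed_documents` on whatever embedding object it is given, the patched `EmbeddingModel` from the first file can be passed in directly. A minimal sketch (sample text is illustrative; chunk boundaries depend on the embeddings):

    from aimakerspace.openai_utils.embedding import EmbeddingModel
    from aimakerspace.semantic_chunking import SemanticChunking

    chunker = SemanticChunking(EmbeddingModel(), breakpoint_threshold_type="percentile")
    docs = chunker.split_text("Cats are small felines. They purr. GDP grew 3% last year.")
    for i in range(chunker.get_total_chunks(docs)):
        print(i, chunker.get_chunk(docs, i))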
app.py
CHANGED
@@ -7,9 +7,11 @@ from aimakerspace.openai_utils.prompts import (
     SystemRolePrompt,
     AssistantRolePrompt,
 )
+from aimakerspace.pdfloader import PDFLoader
 from aimakerspace.openai_utils.embedding import EmbeddingModel
 from aimakerspace.vectordatabase import VectorDatabase
 from aimakerspace.openai_utils.chatmodel import ChatOpenAI
+from aimakerspace.semantic_chunking import SemanticChunking
 import chainlit as cl
 
 system_template = """\
@@ -25,6 +27,7 @@ Question:
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
 
+
 class RetrievalAugmentedQAPipeline:
     def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase) -> None:
         self.llm = llm
@@ -39,29 +42,55 @@ class RetrievalAugmentedQAPipeline:
 
         formatted_system_prompt = system_role_prompt.create_message()
 
-        formatted_user_prompt = user_role_prompt.create_message(question=user_query, context=context_prompt)
+        formatted_user_prompt = user_role_prompt.create_message(
+            question=user_query, context=context_prompt
+        )
 
         async def generate_response():
-            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
+            async for chunk in self.llm.astream(
+                [formatted_system_prompt, formatted_user_prompt]
+            ):
                 yield chunk
 
         return {"response": generate_response(), "context": context_list}
 
+
 text_splitter = CharacterTextSplitter()
+embedding_model = EmbeddingModel()
+chunker = SemanticChunking(embedding_model, breakpoint_threshold_type="percentile")
 
 
 def process_text_file(file: AskFileResponse):
     import tempfile
 
-    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
+    file_extension = os.path.splitext(file.name)[1].lower()
+    if file_extension == ".txt":
+        suffix = ".txt"
+    elif file_extension == ".pdf":
+        suffix = ".pdf"
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", delete=False, suffix=suffix
+    ) as temp_file:
         temp_file_path = temp_file.name
 
     with open(temp_file_path, "wb") as f:
         f.write(file.content)
 
-    text_loader = TextFileLoader(temp_file_path)
-    documents = text_loader.load_documents()
-    texts = text_splitter.split_texts(documents)
+    if suffix == ".txt":
+        file_loader = TextFileLoader(temp_file_path)
+    elif suffix == ".pdf":
+        file_loader = PDFLoader(temp_file_path)
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+    documents = file_loader.load_documents()
+    split_pages = []
+    for doc in documents:
+        split_pages += chunker.split_text(doc.page_content)
+    texts = [i.page_content for i in split_pages]
+    # texts = text_splitter.split_texts(documents)
     return texts
 
 
@@ -72,8 +101,8 @@ async def on_chat_start():
     # Wait for the user to upload a file
     while files == None:
         files = await cl.AskFileMessage(
-            content="Please upload a Text File file to begin!",
-            accept=["text/plain"],
+            content="Please upload a Text or PDF File file to begin!",
+            accept=["text/plain", "application/pdf"],
             max_size_mb=2,
             timeout=180,
         ).send()
@@ -93,15 +122,14 @@ async def on_chat_start():
     # Create a dict vector store
     vector_db = VectorDatabase()
     vector_db = await vector_db.abuild_from_list(texts)
-
+
     chat_openai = ChatOpenAI()
 
     # Create a chain
     retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
-        vector_db_retriever=vector_db,
-        llm=chat_openai
+        vector_db_retriever=vector_db, llm=chat_openai
    )
-
+
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
@@ -119,4 +147,4 @@ async def main(message):
     async for stream_resp in result["response"]:
         await msg.stream_token(stream_resp)
 
-    await msg.send()
+    await msg.send()
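Net effect: ingestion moves from fixed-size `CharacterTextSplitter` chunks (now left commented out) to semantic chunks, and uploads are routed by extension to `TextFileLoader` or `PDFLoader`. One caveat worth flagging: `PDFLoader` as written exposes `load_and_split()`, not the `load_documents()` that the new PDF branch calls, so one of the two names would need aligning. A rough sketch of the intended PDF flow outside the Chainlit handlers (placeholder path; uses `load_and_split()` per the class definition):

    from aimakerspace.openai_utils.embedding import EmbeddingModel
    from aimakerspace.pdfloader import PDFLoader
    from aimakerspace.semantic_chunking import SemanticChunking

    chunker = SemanticChunking(EmbeddingModel())
    pages = PDFLoader("example.pdf").load_and_split()  # placeholder path
    split_pages = []
    for page in pages:
        split_pages += chunker.split_text(page.page_content)
    texts = [doc.page_content for doc in split_pages]  # what feeds VectorDatabase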
requirements.txt
CHANGED
@@ -1,3 +1,8 @@
 numpy
 chainlit==0.7.700
-openai
+openai
+langchain
+langchain-community
+langchain-experimental
+langchain-openai
+langchain-core