Spaces:

dl4ds
/

dl4ds_tutor

Build error

App Files Files Community

XThomasBU commited on Jun 16, 2024

Commit

f2daaee

1 Parent(s): ea7b686

added raptor and literalai

Browse files

Files changed (22) hide show

Dockerfile.dev +6 -2
code/main.py +27 -5
code/modules/chat/helpers.py +2 -2
code/modules/chat/llm_tutor.py +1 -1
code/modules/chat_processor/__init__.py +0 -0
code/modules/chat_processor/base.py +6 -0
code/modules/chat_processor/chat_processor.py +25 -0
code/modules/chat_processor/literal_ai.py +37 -0
code/modules/config/config.yml +9 -2
code/modules/config/constants.py +1 -1
code/modules/retriever/__init__.py +1 -0
code/modules/retriever/colbert_retriever.py +1 -1
code/modules/retriever/raptor_retriever.py +16 -0
code/modules/retriever/retriever.py +2 -0
code/modules/vectorstore/__init__.py +0 -2
code/modules/vectorstore/raptor.py +438 -0
code/modules/vectorstore/store_manager.py +2 -2
code/modules/vectorstore/vectorstore.py +2 -0
{public → code/public}/logo_dark.png +0 -0
{public → code/public}/logo_light.png +0 -0
{public → code/public}/test.css +0 -0
requirements.txt +2 -0

Dockerfile.dev CHANGED Viewed

@@ -10,7 +10,8 @@ RUN pip install --no-cache-dir -r /code/requirements.txt
 COPY . /code
-RUN ls -R
 # Change permissions to allow writing to the directory
 RUN chmod -R 777 /code
@@ -21,7 +22,10 @@ RUN mkdir /code/logs && chmod 777 /code/logs
 # Create a cache directory within the application's working directory
 RUN mkdir /.cache && chmod -R 777 /.cache
 # Expose the port the app runs on
 EXPOSE 8051
-CMD python code/modules/vector_db.py && chainlit run code/main.py --port 8051

 COPY . /code
+# List the contents of the /code directory to verify files are copied correctly
+RUN ls -R /code
 # Change permissions to allow writing to the directory
 RUN chmod -R 777 /code
 # Create a cache directory within the application's working directory
 RUN mkdir /.cache && chmod -R 777 /.cache
+WORKDIR /code/code
 # Expose the port the app runs on
 EXPOSE 8051
+# Default command to run the application
+CMD ["sh", "-c", "python -m modules.vectorstore.store_manager && chainlit run main.py --host 0.0.0.0 --port 8051"]

code/main.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
-from langchain import PromptTemplate
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.chains import RetrievalQA
@@ -20,7 +20,7 @@ sys.path.append(current_dir)
 from modules.chat.llm_tutor import LLMTutor
 from modules.config.constants import *
 from modules.chat.helpers import get_sources
 global logger
 logger = logging.getLogger(__name__)
@@ -113,7 +113,16 @@ async def start():
     msg.content = opening_message
     await msg.update()
     cl.user_session.set("chain", chain)
 @cl.on_message
@@ -121,15 +130,28 @@ async def main(message):
     global logger
     user = cl.user_session.get("user")
     chain = cl.user_session.get("chain")
     cb = cl.AsyncLangchainCallbackHandler()  # TODO: fix streaming here
     cb.answer_reached = True
-    res = await chain.acall(message.content, callbacks=[cb])
-    # res = await chain.acall(message.content)
     try:
         answer = res["answer"]
     except:
         answer = res["result"]
-    answer_with_sources, source_elements = get_sources(res, answer)
     await cl.Message(content=answer_with_sources, elements=source_elements).send()

 from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
+from langchain_core.prompts import PromptTemplate
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from modules.chat.llm_tutor import LLMTutor
 from modules.config.constants import *
 from modules.chat.helpers import get_sources
+from modules.chat_processor.chat_processor import ChatProcessor
 global logger
 logger = logging.getLogger(__name__)
     msg.content = opening_message
     await msg.update()
+    tags = [chat_profile, config["vectorstore"]["db_option"]]
+    chat_processor = ChatProcessor(config["chat_logging"]["platform"], tags=tags)
     cl.user_session.set("chain", chain)
+    cl.user_session.set("counter", 0)
+    cl.user_session.set("chat_processor", chat_processor)
+@cl.on_chat_end
+async def on_chat_end():
+    await cl.Message(content="Sorry, I have to go now. Goodbye!").send()
 @cl.on_message
     global logger
     user = cl.user_session.get("user")
     chain = cl.user_session.get("chain")
+    counter = cl.user_session.get("counter")
+    counter += 1
+    cl.user_session.set("counter", counter)
+    # if counter >= 3:  # Ensure the counter condition is checked
+    #     await cl.Message(content="Your credits are up!").send()
+    #     await on_chat_end()  # Call the on_chat_end function to handle the end of the chat
+    #     return  # Exit the function to stop further processing
+    # else:
     cb = cl.AsyncLangchainCallbackHandler()  # TODO: fix streaming here
     cb.answer_reached = True
+    processor = cl.user_session.get("chat_processor")
+    res = await processor.rag(message.content, chain, cb)
     try:
         answer = res["answer"]
     except:
         answer = res["result"]
+    answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
+    processor._process(message.content, answer, sources_dict)
     await cl.Message(content=answer_with_sources, elements=source_elements).send()

code/modules/chat/helpers.py CHANGED Viewed

@@ -9,7 +9,7 @@ def get_sources(res, answer):
     for idx, source in enumerate(res["source_documents"]):
         source_metadata = source.metadata
-        url = source_metadata["source"]
         score = source_metadata.get("score", "N/A")
         page = source_metadata.get("page", 1)
@@ -75,7 +75,7 @@ def get_sources(res, answer):
             )
         )
-    return full_answer, source_elements
 def get_prompt(config):

     for idx, source in enumerate(res["source_documents"]):
         source_metadata = source.metadata
+        url = source_metadata.get("source", "N/A")
         score = source_metadata.get("score", "N/A")
         page = source_metadata.get("page", 1)
             )
         )
+    return full_answer, source_elements, source_dict
 def get_prompt(config):

code/modules/chat/llm_tutor.py CHANGED Viewed

@@ -102,7 +102,7 @@ class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
             # Prepare the final prompt with metadata
             context = "\n\n".join(
                 [
-                    f"Context {idx+1}: \n(Document content: {doc.page_content}\nMetadata: (source_file: {doc.metadata['source']}))"
                     for idx, doc in enumerate(docs)
                 ]
             )

             # Prepare the final prompt with metadata
             context = "\n\n".join(
                 [
+                    f"Context {idx+1}: \n(Document content: {doc.page_content}\nMetadata: (source_file: {doc.metadata['source'] if 'source' in doc.metadata else 'unknown'}))"
                     for idx, doc in enumerate(docs)
                 ]
             )

code/modules/chat_processor/__init__.py ADDED Viewed

File without changes

code/modules/chat_processor/base.py ADDED Viewed

	@@ -0,0 +1,6 @@

+class ChatProcessorBase:
+    def __init__(self, config):
+        self.config = config
+    def process(self, message):
+        raise NotImplementedError("process method not implemented")

code/modules/chat_processor/chat_processor.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from modules.chat_processor.literal_ai import LiteralaiChatProcessor
+class ChatProcessor:
+    def __init__(self, chat_processor_type, tags=None):
+        self.chat_processor_type = chat_processor_type
+        self.tags = tags
+        self._init_processor()
+    def _init_processor(self):
+        if self.chat_processor_type == "literalai":
+            self.processor = LiteralaiChatProcessor(self.tags)
+        else:
+            raise ValueError(
+                f"Chat processor type {self.chat_processor_type} not supported"
+            )
+    def _process(self, user_message, assistant_message, source_dict):
+        self.processor.process(user_message, assistant_message, source_dict)
+    async def rag(self, user_query: str, chain, cb):
+        try:
+            return await self.processor.rag(user_query, chain, cb)
+        except:
+            return await chain.acall(user_query, callbacks=[cb])

code/modules/chat_processor/literal_ai.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from literalai import LiteralClient
+import os
+from .base import ChatProcessorBase
+class LiteralaiChatProcessor(ChatProcessorBase):
+    def __init__(self, tags=None):
+        self.literal_client = LiteralClient(api_key=os.getenv("LITERAL_API_KEY"))
+        self.literal_client.reset_context()
+        with self.literal_client.thread(name="TEST") as thread:
+            self.thread_id = thread.id
+            self.thread = thread
+            if tags is not None and type(tags) == list:
+                self.thread.tags = tags
+        print(f"Thread ID: {self.thread}")
+    def process(self, user_message, assistant_message, source_dict):
+        with self.literal_client.thread(thread_id=self.thread_id) as thread:
+            self.literal_client.message(
+                content=user_message,
+                type="user_message",
+                name="User",
+            )
+            self.literal_client.message(
+                content=assistant_message,
+                type="assistant_message",
+                name="AI_Tutor",
+            )
+    async def rag(self, user_query: str, chain, cb):
+        with self.literal_client.step(
+            type="retrieval", name="RAG", thread_id=self.thread_id
+        ) as step:
+            step.input = {"question": user_query}
+            res = await chain.acall(user_query, callbacks=[cb])
+            step.output = res
+        return res

code/modules/config/config.yml CHANGED Viewed

@@ -6,8 +6,8 @@ vectorstore:
   embedd_files: False # bool
   data_path: '../storage/data' # str
   url_file_path: '../storage/data/urls.txt' # str
-  expand_urls: False # bool
-  db_option : 'Chroma' # str [FAISS, Chroma, RAGatouille]
   db_path : '../vectorstores' # str
   model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
   search_top_k : 3 # int
@@ -29,6 +29,13 @@ llm_params:
   llm_loader: 'openai' # str [local_llm, openai]
   openai_params:
     model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4]
 splitter_options:
   use_splitter: True # bool

   embedd_files: False # bool
   data_path: '../storage/data' # str
   url_file_path: '../storage/data/urls.txt' # str
+  expand_urls: True # bool
+  db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR]
   db_path : '../vectorstores' # str
   model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
   search_top_k : 3 # int
   llm_loader: 'openai' # str [local_llm, openai]
   openai_params:
     model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4]
+  local_llm_params:
+    model: 'tiny-llama'
+    temperature: 0.7
+chat_logging:
+  log_chat: True # bool
+  platform: 'literalai'
 splitter_options:
   use_splitter: True # bool

code/modules/config/constants.py CHANGED Viewed

@@ -77,5 +77,5 @@ Question: {question}
 # Model Paths
-LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
 MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"

 # Model Paths
+LLAMA_PATH = "../storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
 MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"

code/modules/retriever/__init__.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from .faiss_retriever import FaissRetriever
 from .chroma_retriever import ChromaRetriever
 from .colbert_retriever import ColbertRetriever
 from .retriever import Retriever

 from .faiss_retriever import FaissRetriever
 from .chroma_retriever import ChromaRetriever
 from .colbert_retriever import ColbertRetriever
+from .raptor_retriever import RaptorRetriever
 from .retriever import Retriever

code/modules/retriever/colbert_retriever.py CHANGED Viewed

@@ -6,5 +6,5 @@ class ColbertRetriever(BaseRetriever):
         pass
     def return_retriever(self, db, config):
-        retriever = db.as_retriever()
         return retriever

         pass
     def return_retriever(self, db, config):
+        retriever = db.as_langchain_retriever(k=config["vectorstore"]["search_top_k"])
         return retriever

code/modules/retriever/raptor_retriever.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from .helpers import VectorStoreRetrieverScore
+from .base import BaseRetriever
+class RaptorRetriever(BaseRetriever):
+    def __init__(self):
+        pass
+    def return_retriever(self, db, config):
+        retriever = VectorStoreRetrieverScore(
+            vectorstore=db,
+            search_kwargs={
+                "k": config["vectorstore"]["search_top_k"],
+            },
+        )
+        return retriever

code/modules/retriever/retriever.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from modules.retriever.faiss_retriever import FaissRetriever
 from modules.retriever.chroma_retriever import ChromaRetriever
 from modules.retriever.colbert_retriever import ColbertRetriever
 class Retriever:
@@ -10,6 +11,7 @@ class Retriever:
             "FAISS": FaissRetriever,
             "Chroma": ChromaRetriever,
             "RAGatouille": ColbertRetriever,
         }
         self._create_retriever()

 from modules.retriever.faiss_retriever import FaissRetriever
 from modules.retriever.chroma_retriever import ChromaRetriever
 from modules.retriever.colbert_retriever import ColbertRetriever
+from modules.retriever.raptor_retriever import RaptorRetriever
 class Retriever:
             "FAISS": FaissRetriever,
             "Chroma": ChromaRetriever,
             "RAGatouille": ColbertRetriever,
+            "RAPTOR": RaptorRetriever,
         }
         self._create_retriever()

code/modules/vectorstore/__init__.py CHANGED Viewed

	@@ -1,2 +0,0 @@
1	- from .base import VectorStoreBase
2	- from .faiss import FAISS

code/modules/vectorstore/raptor.py ADDED Viewed

	@@ -0,0 +1,438 @@

+# code modified from https://github.com/langchain-ai/langchain/blob/master/cookbook/RAPTOR.ipynb
+from typing import Dict, List, Optional, Tuple
+import os
+import numpy as np
+import pandas as pd
+import umap
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from sklearn.mixture import GaussianMixture
+from langchain_community.chat_models import ChatOpenAI
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from modules.vectorstore.base import VectorStoreBase
+RANDOM_SEED = 42
+class RAPTORVectoreStore(VectorStoreBase):
+    def __init__(self, config, documents=[], text_splitter=None, embedding_model=None):
+        self.documents = documents
+        self.config = config
+        self.text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            chunk_size=self.config["splitter_options"]["chunk_size"],
+            chunk_overlap=self.config["splitter_options"]["chunk_overlap"],
+            separators=self.config["splitter_options"]["chunk_separators"],
+            disallowed_special=(),
+        )
+        self.embd = embedding_model
+        self.model = ChatOpenAI(
+            model="gpt-3.5-turbo",
+        )
+    def concat_documents(self, documents):
+        d_sorted = sorted(documents, key=lambda x: x.metadata["source"])
+        d_reversed = list(reversed(d_sorted))
+        concatenated_content = "\n\n\n --- \n\n\n".join(
+            [doc.page_content for doc in d_reversed]
+        )
+        return concatenated_content
+    def split_documents(self, documents):
+        concatenated_content = self.concat_documents(documents)
+        texts_split = self.text_splitter.split_text(concatenated_content)
+        return texts_split
+    def add_documents(self, documents):
+        self.documents.extend(documents)
+    def global_cluster_embeddings(
+        self,
+        embeddings: np.ndarray,
+        dim: int,
+        n_neighbors: Optional[int] = None,
+        metric: str = "cosine",
+    ) -> np.ndarray:
+        """
+        Perform global dimensionality reduction on the embeddings using UMAP.
+        Parameters:
+        - embeddings: The input embeddings as a numpy array.
+        - dim: The target dimensionality for the reduced space.
+        - n_neighbors: Optional; the number of neighbors to consider for each point.
+                    If not provided, it defaults to the square root of the number of embeddings.
+        - metric: The distance metric to use for UMAP.
+        Returns:
+        - A numpy array of the embeddings reduced to the specified dimensionality.
+        """
+        if n_neighbors is None:
+            n_neighbors = int((len(embeddings) - 1) ** 0.5)
+        return umap.UMAP(
+            n_neighbors=n_neighbors, n_components=dim, metric=metric
+        ).fit_transform(embeddings)
+    def local_cluster_embeddings(
+        self,
+        embeddings: np.ndarray,
+        dim: int,
+        num_neighbors: int = 10,
+        metric: str = "cosine",
+    ) -> np.ndarray:
+        """
+        Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.
+        Parameters:
+        - embeddings: The input embeddings as a numpy array.
+        - dim: The target dimensionality for the reduced space.
+        - num_neighbors: The number of neighbors to consider for each point.
+        - metric: The distance metric to use for UMAP.
+        Returns:
+        - A numpy array of the embeddings reduced to the specified dimensionality.
+        """
+        return umap.UMAP(
+            n_neighbors=num_neighbors, n_components=dim, metric=metric
+        ).fit_transform(embeddings)
+    def get_optimal_clusters(
+        self,
+        embeddings: np.ndarray,
+        max_clusters: int = 50,
+        random_state: int = RANDOM_SEED,
+    ) -> int:
+        """
+        Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model.
+        Parameters:
+        - embeddings: The input embeddings as a numpy array.
+        - max_clusters: The maximum number of clusters to consider.
+        - random_state: Seed for reproducibility.
+        Returns:
+        - An integer representing the optimal number of clusters found.
+        """
+        max_clusters = min(max_clusters, len(embeddings))
+        n_clusters = np.arange(1, max_clusters)
+        bics = []
+        for n in n_clusters:
+            gm = GaussianMixture(n_components=n, random_state=random_state)
+            gm.fit(embeddings)
+            bics.append(gm.bic(embeddings))
+        return n_clusters[np.argmin(bics)]
+    def GMM_cluster(
+        self, embeddings: np.ndarray, threshold: float, random_state: int = 0
+    ):
+        """
+        Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.
+        Parameters:
+        - embeddings: The input embeddings as a numpy array.
+        - threshold: The probability threshold for assigning an embedding to a cluster.
+        - random_state: Seed for reproducibility.
+        Returns:
+        - A tuple containing the cluster labels and the number of clusters determined.
+        """
+        n_clusters = self.get_optimal_clusters(embeddings)
+        gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
+        gm.fit(embeddings)
+        probs = gm.predict_proba(embeddings)
+        labels = [np.where(prob > threshold)[0] for prob in probs]
+        return labels, n_clusters
+    def perform_clustering(
+        self,
+        embeddings: np.ndarray,
+        dim: int,
+        threshold: float,
+    ) -> List[np.ndarray]:
+        """
+        Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering
+        using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.
+        Parameters:
+        - embeddings: The input embeddings as a numpy array.
+        - dim: The target dimensionality for UMAP reduction.
+        - threshold: The probability threshold for assigning an embedding to a cluster in GMM.
+        Returns:
+        - A list of numpy arrays, where each array contains the cluster IDs for each embedding.
+        """
+        if len(embeddings) <= dim + 1:
+            # Avoid clustering when there's insufficient data
+            return [np.array([0]) for _ in range(len(embeddings))]
+        # Global dimensionality reduction
+        reduced_embeddings_global = self.global_cluster_embeddings(embeddings, dim)
+        # Global clustering
+        global_clusters, n_global_clusters = self.GMM_cluster(
+            reduced_embeddings_global, threshold
+        )
+        all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
+        total_clusters = 0
+        # Iterate through each global cluster to perform local clustering
+        for i in range(n_global_clusters):
+            # Extract embeddings belonging to the current global cluster
+            global_cluster_embeddings_ = embeddings[
+                np.array([i in gc for gc in global_clusters])
+            ]
+            if len(global_cluster_embeddings_) == 0:
+                continue
+            if len(global_cluster_embeddings_) <= dim + 1:
+                # Handle small clusters with direct assignment
+                local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
+                n_local_clusters = 1
+            else:
+                # Local dimensionality reduction and clustering
+                reduced_embeddings_local = self.local_cluster_embeddings(
+                    global_cluster_embeddings_, dim
+                )
+                local_clusters, n_local_clusters = self.GMM_cluster(
+                    reduced_embeddings_local, threshold
+                )
+            # Assign local cluster IDs, adjusting for total clusters already processed
+            for j in range(n_local_clusters):
+                local_cluster_embeddings_ = global_cluster_embeddings_[
+                    np.array([j in lc for lc in local_clusters])
+                ]
+                indices = np.where(
+                    (embeddings == local_cluster_embeddings_[:, None]).all(-1)
+                )[1]
+                for idx in indices:
+                    all_local_clusters[idx] = np.append(
+                        all_local_clusters[idx], j + total_clusters
+                    )
+            total_clusters += n_local_clusters
+        return all_local_clusters
+    def embed(self, texts):
+        """
+        Generate embeddings for a list of text documents.
+        This function assumes the existence of an `embd` object with a method `embed_documents`
+        that takes a list of texts and returns their embeddings.
+        Parameters:
+        - texts: List[str], a list of text documents to be embedded.
+        Returns:
+        - numpy.ndarray: An array of embeddings for the given text documents.
+        """
+        text_embeddings = self.embd.embed_documents(texts)
+        text_embeddings_np = np.array(text_embeddings)
+        return text_embeddings_np
+    def embed_cluster_texts(self, texts):
+        """
+        Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels.
+        This function combines embedding generation and clustering into a single step. It assumes the existence
+        of a previously defined `perform_clustering` function that performs clustering on the embeddings.
+        Parameters:
+        - texts: List[str], a list of text documents to be processed.
+        Returns:
+        - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.
+        """
+        text_embeddings_np = self.embed(texts)  # Generate embeddings
+        cluster_labels = self.perform_clustering(
+            text_embeddings_np, 10, 0.1
+        )  # Perform clustering on the embeddings
+        df = pd.DataFrame()  # Initialize a DataFrame to store the results
+        df["text"] = texts  # Store original texts
+        df["embd"] = list(
+            text_embeddings_np
+        )  # Store embeddings as a list in the DataFrame
+        df["cluster"] = cluster_labels  # Store cluster labels
+        return df
+    def fmt_txt(self, df: pd.DataFrame) -> str:
+        """
+        Formats the text documents in a DataFrame into a single string.
+        Parameters:
+        - df: DataFrame containing the 'text' column with text documents to format.
+        Returns:
+        - A single string where all text documents are joined by a specific delimiter.
+        """
+        unique_txt = df["text"].tolist()
+        return "--- --- \n --- --- ".join(unique_txt)
+    def embed_cluster_summarize_texts(
+        self, texts: List[str], level: int
+    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """
+        Embeds, clusters, and summarizes a list of texts. This function first generates embeddings for the texts,
+        clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes
+        the content within each cluster.
+        Parameters:
+        - texts: A list of text documents to be processed.
+        - level: An integer parameter that could define the depth or detail of processing.
+        Returns:
+        - Tuple containing two DataFrames:
+        1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.
+        2. The second DataFrame (`df_summary`) contains summaries for each cluster, the specified level of detail,
+            and the cluster identifiers.
+        """
+        # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns
+        df_clusters = self.embed_cluster_texts(texts)
+        # Prepare to expand the DataFrame for easier manipulation of clusters
+        expanded_list = []
+        # Expand DataFrame entries to document-cluster pairings for straightforward processing
+        for index, row in df_clusters.iterrows():
+            for cluster in row["cluster"]:
+                expanded_list.append(
+                    {"text": row["text"], "embd": row["embd"], "cluster": cluster}
+                )
+        # Create a new DataFrame from the expanded list
+        expanded_df = pd.DataFrame(expanded_list)
+        # Retrieve unique cluster identifiers for processing
+        all_clusters = expanded_df["cluster"].unique()
+        print(f"--Generated {len(all_clusters)} clusters--")
+        # Summarization
+        template = """Here is content from the course DS598: Deep Learning for Data Science.
+        The content may be form webapge about the course, or lecture content, or any other relevant information.
+        If the content is in bullet points (from  pdf lectre slides), you can summarize the bullet points.
+        Give a detailed summary of the content below.
+        Documentation:
+        {context}
+        """
+        prompt = ChatPromptTemplate.from_template(template)
+        chain = prompt | self.model | StrOutputParser()
+        # Format text within each cluster for summarization
+        summaries = []
+        for i in all_clusters:
+            df_cluster = expanded_df[expanded_df["cluster"] == i]
+            formatted_txt = self.fmt_txt(df_cluster)
+            summaries.append(chain.invoke({"context": formatted_txt}))
+        # Create a DataFrame to store summaries with their corresponding cluster and level
+        df_summary = pd.DataFrame(
+            {
+                "summaries": summaries,
+                "level": [level] * len(summaries),
+                "cluster": list(all_clusters),
+            }
+        )
+        return df_clusters, df_summary
+    def recursive_embed_cluster_summarize(
+        self, texts: List[str], level: int = 1, n_levels: int = 3
+    ) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
+        """
+        Recursively embeds, clusters, and summarizes texts up to a specified level or until
+        the number of unique clusters becomes 1, storing the results at each level.
+        Parameters:
+        - texts: List[str], texts to be processed.
+        - level: int, current recursion level (starts at 1).
+        - n_levels: int, maximum depth of recursion.
+        Returns:
+        - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion
+        levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.
+        """
+        results = {}  # Dictionary to store results at each level
+        # Perform embedding, clustering, and summarization for the current level
+        df_clusters, df_summary = self.embed_cluster_summarize_texts(texts, level)
+        # Store the results of the current level
+        results[level] = (df_clusters, df_summary)
+        # Determine if further recursion is possible and meaningful
+        unique_clusters = df_summary["cluster"].nunique()
+        if level < n_levels and unique_clusters > 1:
+            # Use summaries as the input texts for the next level of recursion
+            new_texts = df_summary["summaries"].tolist()
+            next_level_results = self.recursive_embed_cluster_summarize(
+                new_texts, level + 1, n_levels
+            )
+            # Merge the results from the next level into the current results dictionary
+            results.update(next_level_results)
+        return results
+    def get_vector_db(self):
+        """
+        Generate a retriever object from a list of documents.
+        Parameters:
+        - documents: List of document objects.
+        Returns:
+        - A retriever object.
+        """
+        leaf_texts = self.split_documents(self.documents)
+        results = self.recursive_embed_cluster_summarize(
+            leaf_texts, level=1, n_levels=10
+        )
+        all_texts = leaf_texts.copy()
+        # Iterate through the results to extract summaries from each level and add them to all_texts
+        for level in sorted(results.keys()):
+            # Extract summaries from the current level's DataFrame
+            summaries = results[level][1]["summaries"].tolist()
+            # Extend all_texts with the summaries from the current level
+            all_texts.extend(summaries)
+        # Now, use all_texts to build the vectorstore
+        vectorstore = FAISS.from_texts(texts=all_texts, embedding=self.embd)
+        return vectorstore
+    def create_database(self, documents, embedding_model):
+        self.documents = documents
+        self.embd = embedding_model
+        self.vectorstore = self.get_vector_db()
+        self.vectorstore.save_local(
+            os.path.join(
+                self.config["vectorstore"]["db_path"],
+                "db_"
+                + self.config["vectorstore"]["db_option"]
+                + "_"
+                + self.config["vectorstore"]["model"],
+            )
+        )
+    def load_database(self, embedding_model):
+        self.vectorstore = FAISS.load_local(
+            os.path.join(
+                self.config["vectorstore"]["db_path"],
+                "db_"
+                + self.config["vectorstore"]["db_option"]
+                + "_"
+                + self.config["vectorstore"]["model"],
+            ),
+            embedding_model,
+            allow_dangerous_deserialization=True,
+        )
+        return self.vectorstore
+    def as_retriever(self):
+        return self.vectorstore.as_retriever()

code/modules/vectorstore/store_manager.py CHANGED Viewed

@@ -84,7 +84,7 @@ class VectorStoreManager:
         documents: list,
         document_metadata: list,
     ):
-        if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
             self.embedding_model = self.create_embedding_model()
         else:
             self.embedding_model = None
@@ -132,7 +132,7 @@ class VectorStoreManager:
     def load_database(self):
         start_time = time.time()  # Start time for loading database
-        if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
             self.embedding_model = self.create_embedding_model()
         else:
             self.embedding_model = None

         documents: list,
         document_metadata: list,
     ):
+        if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]:
             self.embedding_model = self.create_embedding_model()
         else:
             self.embedding_model = None
     def load_database(self):
         start_time = time.time()  # Start time for loading database
+        if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]:
             self.embedding_model = self.create_embedding_model()
         else:
             self.embedding_model = None

code/modules/vectorstore/vectorstore.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from modules.vectorstore.faiss import FaissVectorStore
 from modules.vectorstore.chroma import ChromaVectorStore
 from modules.vectorstore.colbert import ColbertVectorStore
 class VectorStore:
@@ -11,6 +12,7 @@ class VectorStore:
             "FAISS": FaissVectorStore,
             "Chroma": ChromaVectorStore,
             "RAGatouille": ColbertVectorStore,
         }
     def _create_database(

 from modules.vectorstore.faiss import FaissVectorStore
 from modules.vectorstore.chroma import ChromaVectorStore
 from modules.vectorstore.colbert import ColbertVectorStore
+from modules.vectorstore.raptor import RAPTORVectoreStore
 class VectorStore:
             "FAISS": FaissVectorStore,
             "Chroma": ChromaVectorStore,
             "RAGatouille": ColbertVectorStore,
+            "RAPTOR": RAPTORVectoreStore,
         }
     def _create_database(

{public → code/public}/logo_dark.png RENAMED Viewed

File without changes

{public → code/public}/logo_light.png RENAMED Viewed

File without changes

{public → code/public}/test.css RENAMED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -18,3 +18,5 @@ llama-cpp-python==0.2.77
 fake_useragent==1.5.1
 chromadb==0.5.0
 pymupdf==1.24.5

 fake_useragent==1.5.1
 chromadb==0.5.0
 pymupdf==1.24.5
+literalai==0.0.601
+umap-learn==0.5.6