arpita-23 committed
Commit 6df9459 · verified · 1 Parent(s): 95a9774

Upload 4 files

Files changed (4)
  1. app.py +66 -0
  2. config.json +1 -0
  3. requirements.txt +13 -0
  4. vectorize_documents.py +56 -0
app.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import json
+ import streamlit as st
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from vectorize_documents import embeddings  # Import embeddings from the vectorization script
+
+ # Set up working directory and API configuration
+ working_dir = os.path.dirname(os.path.abspath(__file__))
+ config_data = json.load(open(f"{working_dir}/config.json"))
+ os.environ["GROQ_API_KEY"] = config_data["GROQ_API_KEY"]
+
+ def setup_vectorstore():
+     # Open the Chroma collection persisted by vectorize_documents.py
+     persist_directory = f"{working_dir}/vector_db_dir"
+     vectorstore = Chroma(
+         persist_directory=persist_directory,
+         embedding_function=embeddings
+     )
+     return vectorstore
+
+ def chat_chain(vectorstore):
+     from langchain_groq import ChatGroq  # Import the LLM class
+
+     llm = ChatGroq(
+         model="llama-3.1-70b-versatile",  # Replace with your LLM of choice
+         temperature=0  # Low temperature to reduce hallucinations
+     )
+     retriever = vectorstore.as_retriever()  # Retrieves relevant chunks from the vector store
+     memory = ConversationBufferMemory(
+         llm=llm,
+         output_key="answer",
+         memory_key="chat_history",
+         return_messages=True
+     )
+
+     # Build the conversational retrieval chain
+     chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         chain_type="stuff",  # "stuff" combines retrieved documents into a single prompt
+         memory=memory,
+         verbose=True,
+         return_source_documents=True
+     )
+     return chain
+
+ # Streamlit UI
+ st.title("Bhagavad Gita & Yoga Sutras Query Assistant")
+
+ vectorstore = setup_vectorstore()
+ chain = chat_chain(vectorstore)
+
+ # User input
+ user_query = st.text_input("Ask a question about the Bhagavad Gita or Yoga Sutras:")
+ if user_query:
+     # Calling the chain with a dict returns all outputs (answer and source documents)
+     response = chain({"question": user_query})
+     answer = response.get("answer", "No answer found.")
+     source_documents = response.get("source_documents", [])
+
+     st.write(f"**Answer:** {answer}")
+     st.write("**Source Documents:**")
+     for doc in source_documents:
+         st.write(doc)
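
For a quick sanity check outside the Streamlit UI, the same chain can be driven from a plain Python script. The sketch below simply mirrors the construction in app.py; it assumes config.json and a populated vector_db_dir sit next to it, and the script name and sample question are illustrative, not part of this commit.

# smoke_test.py - hypothetical helper, not included in this commit
import os
import json

working_dir = os.path.dirname(os.path.abspath(__file__))
with open(f"{working_dir}/config.json") as f:
    os.environ["GROQ_API_KEY"] = json.load(f)["GROQ_API_KEY"]

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Reopen the persisted vector store and rebuild the same chain as app.py
vectorstore = Chroma(
    persist_directory=f"{working_dir}/vector_db_dir",
    embedding_function=HuggingFaceEmbeddings(),
)
llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0)
memory = ConversationBufferMemory(
    llm=llm, output_key="answer", memory_key="chat_history", return_messages=True
)
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    memory=memory,
    return_source_documents=True,
)

# Ask a single example question and print the answer
response = chain({"question": "What does the Bhagavad Gita say about duty?"})
print(response["answer"])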
config.json ADDED
@@ -0,0 +1 @@
+ {"GROQ_API_KEY": "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"}
requirements.txt ADDED
@@ -0,0 +1,13 @@
+
+ streamlit==1.38.0
+ langchain-community==0.2.16
+ langchain-text-splitters==0.2.4
+ langchain-chroma==0.1.3
+ langchain-huggingface==0.0.3
+ langchain-groq==0.1.9
+ unstructured==0.15.0
+ nltk==3.8.1
+ psycopg2-binary
+ pgvector
+ langchain_postgres
+ docx2txt
vectorize_documents.py ADDED
@@ -0,0 +1,56 @@
+ from langchain_text_splitters import CharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from langchain.docstore.document import Document
+ import pandas as pd
+ import os
+ import glob
+
+ # Define a function to perform vectorization for multiple CSV files
+ def vectorize_documents():
+     embeddings = HuggingFaceEmbeddings()
+
+     # Directory containing multiple CSV files
+     csv_directory = "Data"  # Replace with your folder name
+     csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))  # Find all CSV files in the folder
+
+     documents = []
+
+     # Load each CSV file and turn every row into a Document
+     for file_path in csv_files:
+         df = pd.read_csv(file_path)
+         for _, row in df.iterrows():
+             # Combine all columns in the row into a single string
+             row_content = " ".join(row.astype(str))
+             documents.append(Document(page_content=row_content))
+
+     # Split the documents into overlapping chunks
+     text_splitter = CharacterTextSplitter(
+         chunk_size=2000,
+         chunk_overlap=500
+     )
+
+     text_chunks = text_splitter.split_documents(documents)
+
+     # Process text chunks in batches
+     batch_size = 5000  # Chroma's batch limit is 5461; use a slightly smaller size for safety
+     for i in range(0, len(text_chunks), batch_size):
+         batch = text_chunks[i:i + batch_size]
+
+         # Store the batch in the Chroma vector DB (each call appends to the same persisted directory)
+         vectordb = Chroma.from_documents(
+             documents=batch,
+             embedding=embeddings,
+             persist_directory="vector_db_dir"
+         )
+
+     print("Documents vectorized and saved in the vector DB")
+
+ # Expose the embedding model at module level so app.py can import it
+ embeddings = HuggingFaceEmbeddings()
+
+
+
+ # Main guard to prevent execution on import
+ if __name__ == "__main__":
+     vectorize_documents()
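
Note that this script also imports pandas, which is not pinned in requirements.txt, so it must be available in the environment (installed separately or pulled in transitively). After the script has been run, a small, hypothetical check like the one below (not part of this commit) can confirm that the persisted collection in vector_db_dir is queryable; the file name and sample query are illustrative only.

# verify_vectorstore.py - hypothetical sanity check, not included in this commit
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Reopen the persisted collection with the same embedding model used to build it
vectordb = Chroma(
    persist_directory="vector_db_dir",
    embedding_function=HuggingFaceEmbeddings(),
)

# Retrieve the closest chunks for an example query and print a short preview of each
for doc in vectordb.similarity_search("What is karma yoga?", k=3):
    print(doc.page_content[:200])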