arpita-23 committed
Commit 6df9459 · verified · 1 Parent(s): 95a9774

Upload 4 files

Files changed (4)
  1. app.py +66 -0
  2. config.json +1 -0
  3. requirements.txt +13 -0
  4. vectorize_documents.py +56 -0
app.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import json
+ import streamlit as st
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from vectorize_documents import embeddings  # Import embeddings from the vectorization script
+
+ # Set up working directory and API configuration
+ working_dir = os.path.dirname(os.path.abspath(__file__))
+ config_data = json.load(open(f"{working_dir}/config.json"))
+ os.environ["GROQ_API_KEY"] = config_data["GROQ_API_KEY"]
+
+ def setup_vectorstore():
+     # Open the Chroma collection persisted by vectorize_documents.py
+     persist_directory = f"{working_dir}/vector_db_dir"
+     vectorstore = Chroma(
+         persist_directory=persist_directory,
+         embedding_function=embeddings
+     )
+     return vectorstore
+
+ def chat_chain(vectorstore):
+     from langchain_groq import ChatGroq  # Import the LLM class
+
+     llm = ChatGroq(
+         model="llama-3.1-70b-versatile",  # Replace with your LLM of choice
+         temperature=0  # Low temperature to reduce hallucinations
+     )
+     retriever = vectorstore.as_retriever()  # Retrieves relevant chunks from the vector store
+     memory = ConversationBufferMemory(
+         llm=llm,
+         output_key="answer",
+         memory_key="chat_history",
+         return_messages=True
+     )
+
+     # Build the conversational retrieval chain
+     chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         chain_type="stuff",  # "stuff" combines retrieved documents into a single prompt
+         memory=memory,
+         verbose=True,
+         return_source_documents=True
+     )
+     return chain
+
+ # Streamlit UI
+ st.title("Bhagavad Gita & Yoga Sutras Query Assistant")
+
+ vectorstore = setup_vectorstore()
+ chain = chat_chain(vectorstore)
+
+ # User input
+ user_query = st.text_input("Ask a question about the Bhagavad Gita or Yoga Sutras:")
+ if user_query:
+     # Calling the chain with a dict returns all outputs (answer and source documents)
+     response = chain({"question": user_query})
+     answer = response.get("answer", "No answer found.")
+     source_documents = response.get("source_documents", [])
+
+     st.write(f"**Answer:** {answer}")
+     st.write("**Source Documents:**")
+     for doc in source_documents:
+         st.write(doc)
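
For a quick sanity check outside the Streamlit UI, the same chain can be driven from a plain Python script. The sketch below simply mirrors the construction in app.py; it assumes config.json and a populated vector_db_dir sit next to it, and the script name and sample question are illustrative, not part of this commit.

# smoke_test.py - hypothetical helper, not included in this commit
import os
import json

working_dir = os.path.dirname(os.path.abspath(__file__))
with open(f"{working_dir}/config.json") as f:
    os.environ["GROQ_API_KEY"] = json.load(f)["GROQ_API_KEY"]

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Reopen the persisted vector store and rebuild the same chain as app.py
vectorstore = Chroma(
    persist_directory=f"{working_dir}/vector_db_dir",
    embedding_function=HuggingFaceEmbeddings(),
)
llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0)
memory = ConversationBufferMemory(
    llm=llm, output_key="answer", memory_key="chat_history", return_messages=True
)
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    memory=memory,
    return_source_documents=True,
)

# Ask a single example question and print the answer
response = chain({"question": "What does the Bhagavad Gita say about duty?"})
print(response["answer"])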
config.json ADDED
@@ -0,0 +1 @@
+ {"GROQ_API_KEY": "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"}
requirements.txt ADDED
@@ -0,0 +1,13 @@
+
+ streamlit==1.38.0
+ langchain-community==0.2.16
+ langchain-text-splitters==0.2.4
+ langchain-chroma==0.1.3
+ langchain-huggingface==0.0.3
+ langchain-groq==0.1.9
+ unstructured==0.15.0
+ nltk==3.8.1
+ psycopg2-binary
+ pgvector
+ langchain_postgres
+ docx2txt
vectorize_documents.py ADDED
@@ -0,0 +1,56 @@
+ from langchain_text_splitters import CharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from langchain.docstore.document import Document
+ import pandas as pd
+ import os
+ import glob
+
+ # Define a function to perform vectorization for multiple CSV files
+ def vectorize_documents():
+     embeddings = HuggingFaceEmbeddings()
+
+     # Directory containing multiple CSV files
+     csv_directory = "Data"  # Replace with your folder name
+     csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))  # Find all CSV files in the folder
+
+     documents = []
+
+     # Load each CSV file and turn every row into a Document
+     for file_path in csv_files:
+         df = pd.read_csv(file_path)
+         for _, row in df.iterrows():
+             # Combine all columns in the row into a single string
+             row_content = " ".join(row.astype(str))
+             documents.append(Document(page_content=row_content))
+
+     # Split the documents into overlapping chunks
+     text_splitter = CharacterTextSplitter(
+         chunk_size=2000,
+         chunk_overlap=500
+     )
+
+     text_chunks = text_splitter.split_documents(documents)
+
+     # Process text chunks in batches
+     batch_size = 5000  # Chroma's batch limit is 5461; use a slightly smaller size for safety
+     for i in range(0, len(text_chunks), batch_size):
+         batch = text_chunks[i:i + batch_size]
+
+         # Store the batch in the Chroma vector DB (each call appends to the same persisted directory)
+         vectordb = Chroma.from_documents(
+             documents=batch,
+             embedding=embeddings,
+             persist_directory="vector_db_dir"
+         )
+
+     print("Documents vectorized and saved in the vector DB")
+
+ # Expose the embedding model at module level so app.py can import it
+ embeddings = HuggingFaceEmbeddings()
+
+
+
+ # Main guard to prevent execution on import
+ if __name__ == "__main__":
+     vectorize_documents()
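
Note that this script also imports pandas, which is not pinned in requirements.txt, so it must be available in the environment (installed separately or pulled in transitively). After the script has been run, a small, hypothetical check like the one below (not part of this commit) can confirm that the persisted collection in vector_db_dir is queryable; the file name and sample query are illustrative only.

# verify_vectorstore.py - hypothetical sanity check, not included in this commit
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Reopen the persisted collection with the same embedding model used to build it
vectordb = Chroma(
    persist_directory="vector_db_dir",
    embedding_function=HuggingFaceEmbeddings(),
)

# Retrieve the closest chunks for an example query and print a short preview of each
for doc in vectordb.similarity_search("What is karma yoga?", k=3):
    print(doc.page_content[:200])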