Bimal Bhattarai commited on
Commit
96cd987
·
0 Parent(s):
.DS_Store ADDED
Binary file (8.2 kB). View file
 
.env ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ PERSIST_DIRECTORY=db
2
+ MODEL_TYPE=GPT4All
3
+ MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
4
+ EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
5
+ MODEL_N_CTX=1000
6
+ MODEL_N_BATCH=8
7
+ TARGET_SOURCE_CHUNKS=5
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.bin filter=lfs diff=lfs merge=lfs -text
2
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__
2
+ source_documents
3
+ models
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Tsetlin-Chat
3
+ emoji: ⚕
4
+ colorFrom: gray
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.18.1
8
+ app_file: app.py
9
+ models: models/gpt4all
10
+ pinned: true
11
+ ---
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from dotenv import load_dotenv
3
+ from langchain.chains import RetrievalQA
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.llms import GPT4All, LlamaCpp
8
+ import chromadb
9
+ import os
10
+ import argparse
11
+ import time
12
+ import streamlit as st
13
+ from htmlTemplates import css, bot_template, user_template
14
+ from langchain.memory import ConversationBufferMemory
15
+ from langchain.chains import ConversationalRetrievalChain
16
+ import langchain
17
langchain.verbose = False

# Fail fast if the shared .env configuration is missing or unreadable —
# every setting below comes from it.
if not load_dotenv():
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)

# Embedding model and vector-store location.
embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
persist_directory = os.environ.get('PERSIST_DIRECTORY')

# LLM configuration.
model_type = os.environ.get('MODEL_TYPE')
model_path = os.environ.get('MODEL_PATH')
# Context window must be an int: GPT4All's max_tokens parameter expects a
# number, but os.environ.get returns a string (the original passed it raw).
model_n_ctx = int(os.environ.get('MODEL_N_CTX', 1000))
model_n_batch = int(os.environ.get('MODEL_N_BATCH', 8))
target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS', 4))

# Imported after load_dotenv(): constants.py raises if PERSIST_DIRECTORY
# is not set in the environment.
from constants import CHROMA_SETTINGS
32
+
33
def handle_userinput(user_question):
    """Send the question through the conversation chain and render the chat log."""
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    # History alternates strictly: even positions are the user's turns,
    # odd positions are the bot's replies.
    for idx, msg in enumerate(st.session_state.chat_history):
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
44
+
45
+
46
def get_conversation_chain(llm, retriever):
    """Build a ConversationalRetrievalChain backed by buffered chat memory.

    The memory stores full message objects under the 'chat_history' key so
    the UI can replay them after each turn.
    """
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=chat_memory,
    )
57
+
58
def main():
    """Streamlit entry point: wire up the vector store, the LLM and the chat UI.

    Side effects: configures the page, mutates st.session_state
    ('conversation', 'chat_history') and loads the GPT4All model.
    """
    # Parse the command line arguments
    args = parse_arguments()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Tsetlin LLM Powered Chatbot")
    user_question = st.text_input("Ask a question about Tsetlin Machine:")

    # Build the retriever and LLM *before* answering: the original called
    # handle_userinput() first, so on a fresh session
    # st.session_state.conversation was still None and the call crashed.
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings,
                client_settings=CHROMA_SETTINGS, client=chroma_client)
    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})

    # Activate/deactivate the streaming StdOut callback for the LLM.
    callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]

    llm = GPT4All(model=model_path, max_tokens=model_n_ctx, backend='gptj',
                  n_batch=model_n_batch, callbacks=callbacks, verbose=False)

    # (The original also built an unused RetrievalQA chain here; removed.)
    # NOTE(review): the chain — and its memory — is rebuilt on every
    # Streamlit rerun, as in the original; persisting it across reruns
    # would preserve conversational memory. Left unchanged deliberately.
    st.session_state.conversation = get_conversation_chain(llm, retriever)

    if user_question:
        handle_userinput(user_question)
91
+
92
+
93
def parse_arguments():
    """Define and evaluate the command-line flags for the chat app."""
    parser = argparse.ArgumentParser(description='privateGPT: Ask questions to your documents without an internet connection, '
                                                 'using the power of LLMs.')
    flag_specs = [
        ("--hide-source", "-S",
         'Use this flag to disable printing of source documents used for answers.'),
        ("--mute-stream", "-M",
         'Use this flag to disable the streaming StdOut callback for LLMs.'),
    ]
    # Both options are plain on/off switches (default False).
    for long_flag, short_flag, help_text in flag_specs:
        parser.add_argument(long_flag, short_flag, action='store_true', help=help_text)

    return parser.parse_args()
104
+
105
+
106
+ if __name__ == "__main__":
107
+ main()
constants.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Shared Chroma configuration, read from the .env file at import time.
# Importing this module raises if PERSIST_DIRECTORY is not configured,
# which is why app.py/ingest.py import it only after their own env checks.
import os
from dotenv import load_dotenv
from chromadb.config import Settings

load_dotenv()

# Define the folder for storing database
PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
if PERSIST_DIRECTORY is None:
    raise Exception("Please set the PERSIST_DIRECTORY environment variable")

# Define the Chroma settings
CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIRECTORY,
    # Opt out of Chroma's anonymous usage telemetry.
    anonymized_telemetry=False
)
db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faed94a1be48de21f54651ece93f3e27e36b8d70ece9812f105713ef8104fb35
3
+ size 34115584
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a817b4dada67e3f64ef3ce4102e176fca480e057c2dac39212158444698da2d2
3
+ size 8380000
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f9ef91d2871d89044cd5874a1689a09b8e45127f88b6f1e757908874e684a6c
3
+ size 100
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/index_metadata.pickle ADDED
Binary file (288 kB). View file
 
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa6a78fccf41054a748238c8168ecf7e2ddab452feeacadb00c10d6b093c404a
3
+ size 20000
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff4af9781581a47abe381aa548704f949a76960b1bca7a99253c81e43fb21fdc
3
+ size 44752
htmlTemplates.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Static CSS and HTML fragments used by app.py to render chat bubbles.
# The literal token {{MSG}} in the templates is substituted with the
# message text via str.replace() before being passed to st.write().
# NOTE: these strings are runtime data consumed by the browser — keep
# them byte-for-byte; whitespace inside them is not significant to CSS/HTML.
css = '''
<style>
.chat-message {
padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex; box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
.chat-message:hover {
box-shadow: 0 0 20px rgba(0,0,0,0.2);
}
.chat-message.user {
background-color: #2b313e
}
.chat-message.bot {
background-color: #475063
}
.chat-message .avatar {
width: 20%;
margin-right: 1rem;
}
.chat-message .avatar img {
max-width: 78px;
max-height: 78px;
border-radius: 50%;
object-fit: cover;
}
.chat-message .message {
width: 80%;
padding: 1rem;
border-radius: 0.5rem;
color: #fff;
font-family: Arial, sans-serif;
}
</style>
'''

# Bubble for the assistant's replies (avatar hosted externally).
bot_template = '''
<div class="chat-message bot">
<div class="avatar">
<img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
</div>
<div class="message">{{MSG}}</div>
</div>
'''

# Bubble for the user's messages.
user_template = '''
<div class="chat-message user">
<div class="avatar">
<img src="https://freeimage.host/i/JIFVVIf">
</div>
<div class="message">{{MSG}}</div>
</div>
'''
ingest.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import glob
4
+ from typing import List
5
+ from dotenv import load_dotenv
6
+ from multiprocessing import Pool
7
+ from tqdm import tqdm
8
+
9
+ from langchain.document_loaders import (
10
+ CSVLoader,
11
+ EverNoteLoader,
12
+ PyMuPDFLoader,
13
+ TextLoader,
14
+ UnstructuredEmailLoader,
15
+ UnstructuredEPubLoader,
16
+ UnstructuredHTMLLoader,
17
+ UnstructuredMarkdownLoader,
18
+ UnstructuredODTLoader,
19
+ UnstructuredPowerPointLoader,
20
+ UnstructuredWordDocumentLoader,
21
+ PyPDFLoader
22
+ )
23
+
24
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
25
+ from langchain.vectorstores import Chroma
26
+ from langchain.embeddings import HuggingFaceEmbeddings
27
+ from langchain.docstore.document import Document
28
+
29
# Fail fast if the shared .env configuration is missing or unreadable.
if not load_dotenv():
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)

# Imported after load_dotenv(): constants.py raises if PERSIST_DIRECTORY
# is absent from the environment.
from constants import CHROMA_SETTINGS
import chromadb

# Load environment variables
persist_directory = os.environ.get('PERSIST_DIRECTORY')
source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
# Splitter parameters: chunk length in characters, with a small overlap so
# context is not lost at chunk boundaries.
chunk_size = 500
chunk_overlap = 50
42
+
43
+
44
+ # Custom document loaders
45
+ # class MyElmLoader(UnstructuredEmailLoader):
46
+ # """Wrapper to fallback to text/plain when default does not work"""
47
+
48
+ # def load(self) -> List[Document]:
49
+ # """Wrapper adding fallback for elm without html"""
50
+ # try:
51
+ # try:
52
+ # doc = UnstructuredEmailLoader.load(self)
53
+ # except ValueError as e:
54
+ # if 'text/html content not found in email' in str(e):
55
+ # # Try plain text
56
+ # self.unstructured_kwargs["content_source"]="text/plain"
57
+ # doc = UnstructuredEmailLoader.load(self)
58
+ # else:
59
+ # raise
60
+ # except Exception as e:
61
+ # # Add file_path to exception message
62
+ # raise type(e)(f"{self.file_path}: {e}") from e
63
+
64
+ # return doc
65
+
66
+
67
# Map file extensions to document loaders and their arguments.
# Keys are lowercase dotted extensions; values are (loader class, kwargs)
# pairs consumed by load_single_document().
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    # ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    # ".pdf": (PyMuPDFLoader, {}),
    ".pdf": (PyPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    # Plain text is assumed to be UTF-8 encoded.
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}
86
+
87
+
88
def load_single_document(file_path: str) -> List[Document]:
    """Load one file via the loader registered for its extension.

    The extension match is case-insensitive; unknown extensions raise
    ValueError.
    """
    ext = "." + file_path.rsplit(".", 1)[-1].lower()
    if ext not in LOADER_MAPPING:
        raise ValueError(f"Unsupported file extension '{ext}'")

    loader_class, loader_args = LOADER_MAPPING[ext]
    return loader_class(file_path, **loader_args).load()
96
+
97
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files.

    Files are matched recursively by the extensions in LOADER_MAPPING and
    loaded in parallel across CPU cores. `ignored_files` contains exact
    paths to skip (as produced by earlier glob runs). Note: the default
    list is never mutated, so the mutable-default is harmless here.
    """
    all_files = []
    for ext in LOADER_MAPPING:
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
        )
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
        )
    # De-duplicate while preserving order: on case-insensitive filesystems
    # (macOS, Windows) the lower- and upper-case patterns both match the
    # same file, which previously caused documents to be ingested twice.
    all_files = list(dict.fromkeys(all_files))
    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]

    with Pool(processes=os.cpu_count()) as pool:
        results = []
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            # imap_unordered: results arrive in completion order, which is
            # fine because chunks are independent.
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results
119
+
120
def process_documents(ignored_files: List[str] = []) -> List[Document]:
    """
    Load documents and split in chunks.

    Exits the process with status 0 when nothing new is found (matches the
    original CLI behaviour).
    """
    print(f"Loading documents from {source_directory}")
    docs = load_documents(source_directory, ignored_files)
    if not docs:
        print("No new documents to load")
        exit(0)
    print(f"Loaded {len(docs)} new documents from {source_directory}")

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)
    print(f"Split into {len(chunks)} chunks of text (max. {chunk_size} tokens each)")
    return chunks
134
+
135
def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
    """
    Checks if vectorstore exists — i.e. the Chroma collection at
    `persist_directory` already holds at least one document.
    """
    store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    return bool(store.get()['documents'])
143
+
144
def main():
    """Ingest source documents into the Chroma vector store.

    Appends to an existing store (skipping documents whose 'source'
    metadata path is already indexed) or creates a fresh one, then
    persists it to disk.
    """
    # Create embeddings
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    # Chroma client
    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)

    if does_vectorstore_exist(persist_directory, embeddings):
        # Update and store locally vectorstore
        print(f"Appending to existing vectorstore at {persist_directory}")
        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
        collection = db.get()
        # Pass the already-indexed source paths so they are not re-ingested.
        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
        print(f"Creating embeddings. May take some minutes...")
        db.add_documents(texts)
    else:
        # Create and store locally vectorstore
        print("Creating new vectorstore")
        texts = process_documents()
        print(f"Creating embeddings. May take some minutes...")
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
    # Flush to disk and drop the reference so the client releases the store.
    # NOTE(review): indentation was lost in the scraped diff; persist() is
    # placed at function level (both branches) as in upstream privateGPT —
    # confirm against the original file.
    db.persist()
    db = None

    print(f"Ingestion complete! You can now run app.py to query your documents")


# Run ingestion when executed as a script.
if __name__ == "__main__":
    main()
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "privategpt"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Ivan Martinez <[email protected]>"]
6
+ license = "Apache Version 2.0"
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.10"
11
+ langchain = "0.0.274"
12
+ gpt4all = "1.0.8"
13
+ chromadb = "0.4.7"
14
+ llama-cpp-python = "0.1.81"
15
+ urllib3 = "2.0.4"
16
+ PyMuPDF = "1.23.1"
17
+ python-dotenv = "^1.0.0"
18
+ unstructured = "0.10.8"
19
+ extract-msg = "0.45.0"
20
+ tabulate = "^0.9.0"
21
+ pandoc = "^2.3"
22
+ pypandoc = "^1.11"
23
+ tqdm = "4.66.1"
24
+ sentence-transformers = "2.2.2"
25
+
26
+
27
+ [build-system]
28
+ requires = ["poetry-core"]
29
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.274
2
+ streamlit==1.18.1
3
+ altair==4
4
+ gpt4all==1.0.8
5
+ chromadb==0.4.7
6
+ urllib3==2.0.4
7
+ PyMuPDF==1.23.1
8
+ python-dotenv==1.0.0
9
+ unstructured==0.10.8
10
+ extract-msg==0.45.0
11
+ tabulate==0.9.0
12
+ pandoc==2.3
13
+ pypandoc==1.11
14
+ tqdm==4.66.1
15
+ sentence_transformers==2.2.2
16
+ pypdf