Bimal Bhattarai committed
Commit · 96cd987
Parent(s): second
Files changed:
- .DS_Store +0 -0
- .env +7 -0
- .gitattributes +2 -0
- .gitignore +3 -0
- README.md +11 -0
- app.py +107 -0
- constants.py +16 -0
- db/chroma.sqlite3 +3 -0
- db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/data_level0.bin +3 -0
- db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/header.bin +3 -0
- db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/index_metadata.pickle +0 -0
- db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/length.bin +3 -0
- db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/link_lists.bin +3 -0
- htmlTemplates.py +51 -0
- ingest.py +171 -0
- poetry.lock +0 -0
- pyproject.toml +29 -0
- requirements.txt +16 -0
.DS_Store
ADDED
Binary file (8.2 kB)
.env
ADDED
@@ -0,0 +1,7 @@
+PERSIST_DIRECTORY=db
+MODEL_TYPE=GPT4All
+MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
+EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
+MODEL_N_CTX=1000
+MODEL_N_BATCH=8
+TARGET_SOURCE_CHUNKS=5
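
Both app.py and ingest.py read these settings through python-dotenv. A minimal sketch of how the values above are consumed (the variable names and fallbacks mirror app.py):

import os
from dotenv import load_dotenv  # python-dotenv, pinned in requirements.txt

load_dotenv()  # pulls the .env entries above into the process environment

persist_directory = os.environ.get("PERSIST_DIRECTORY")          # "db"
model_path = os.environ.get("MODEL_PATH")                        # GPT4All weights file
model_n_batch = int(os.environ.get("MODEL_N_BATCH", 8))          # numeric values need a cast
target_source_chunks = int(os.environ.get("TARGET_SOURCE_CHUNKS", 4))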
.gitattributes
ADDED
@@ -0,0 +1,2 @@
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__
+source_documents
+models
README.md
ADDED
@@ -0,0 +1,11 @@
+---
+title: Tsetlin-Chat
+emoji: ⚕
+colorFrom: gray
+colorTo: purple
+sdk: streamlit
+sdk_version: 1.2.0
+app_file: app.py
+models: models/gpt4all
+pinned: true
+---
app.py
ADDED
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+from dotenv import load_dotenv
+from langchain.chains import RetrievalQA
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain.vectorstores import Chroma
+from langchain.llms import GPT4All, LlamaCpp
+import chromadb
+import os
+import argparse
+import time
+import streamlit as st
+from htmlTemplates import css, bot_template, user_template
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+import langchain
+
+langchain.verbose = False
+
+if not load_dotenv():
+    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
+    exit(1)
+
+embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
+persist_directory = os.environ.get('PERSIST_DIRECTORY')
+
+model_type = os.environ.get('MODEL_TYPE')
+model_path = os.environ.get('MODEL_PATH')
+model_n_ctx = int(os.environ.get('MODEL_N_CTX', 1000))  # cast: GPT4All's max_tokens expects an int
+model_n_batch = int(os.environ.get('MODEL_N_BATCH', 8))
+target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS', 4))
+
+from constants import CHROMA_SETTINGS
+
+
+def handle_userinput(user_question):
+    response = st.session_state.conversation({'question': user_question})
+    st.session_state.chat_history = response['chat_history']
+
+    # Even indices are user turns, odd indices are bot turns.
+    for i, message in enumerate(st.session_state.chat_history):
+        if i % 2 == 0:
+            st.write(user_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+        else:
+            st.write(bot_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+
+
+def get_conversation_chain(llm, retriever):
+    # llm = ChatOpenAI()
+    # llm = GPT4All(model=model_path, max_tokens=model_n_ctx, backend='gptj', n_batch=model_n_batch, callbacks=callbacks, verbose=False)
+    memory = ConversationBufferMemory(
+        memory_key='chat_history', return_messages=True)
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=retriever,
+        memory=memory
+    )
+    return conversation_chain
+
+
+def main():
+    # Parse the command line arguments
+    args = parse_arguments()
+    st.set_page_config(page_title="Chat with multiple PDFs",
+                       page_icon=":books:")
+    st.write(css, unsafe_allow_html=True)
+
+    if "conversation" not in st.session_state:
+        st.session_state.conversation = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = None
+
+    st.header("Tsetlin LLM Powered Chatbot")
+    user_question = st.text_input("Ask a question about Tsetlin Machine:")
+
+    if user_question:
+        handle_userinput(user_question)
+
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
+    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
+    # Activate/deactivate the streaming StdOut callback for LLMs
+    callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
+    # Prepare the LLM
+    llm = GPT4All(model=model_path, max_tokens=model_n_ctx, backend='gptj', n_batch=model_n_batch, callbacks=callbacks, verbose=False)
+
+    # One-shot QA chain; the Streamlit flow below uses the conversational chain instead.
+    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=not args.hide_source)
+
+    st.session_state.conversation = get_conversation_chain(llm, retriever)
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='privateGPT: Ask questions to your documents without an internet connection, '
+                                                 'using the power of LLMs.')
+    parser.add_argument("--hide-source", "-S", action='store_true',
+                        help='Use this flag to disable printing of source documents used for answers.')
+    parser.add_argument("--mute-stream", "-M",
+                        action='store_true',
+                        help='Use this flag to disable the streaming StdOut callback for LLMs.')
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
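
main() wires the persisted Chroma index to a local GPT4All model. For a sanity check outside Streamlit, here is a minimal sketch of the same retrieval path; it assumes ingest.py has already built db/ and that MODEL_PATH points at a downloaded model file:

import os
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.vectorstores import Chroma
from constants import CHROMA_SETTINGS

load_dotenv()
embeddings = HuggingFaceEmbeddings(model_name=os.environ["EMBEDDINGS_MODEL_NAME"])
db = Chroma(persist_directory=os.environ["PERSIST_DIRECTORY"],
            embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
llm = GPT4All(model=os.environ["MODEL_PATH"], backend='gptj', verbose=False)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",
                                 retriever=db.as_retriever(search_kwargs={"k": 5}))
print(qa("What is a Tsetlin Machine?")["result"])  # sample question; any query works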
constants.py
ADDED
@@ -0,0 +1,16 @@
+import os
+from dotenv import load_dotenv
+from chromadb.config import Settings
+
+load_dotenv()
+
+# Define the folder for storing database
+PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
+if PERSIST_DIRECTORY is None:
+    raise Exception("Please set the PERSIST_DIRECTORY environment variable")
+
+# Define the Chroma settings
+CHROMA_SETTINGS = Settings(
+    persist_directory=PERSIST_DIRECTORY,
+    anonymized_telemetry=False
+)
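
Both scripts import this one Settings object and pass it together with a chromadb.PersistentClient, so ingest-time writes and query-time reads hit the same on-disk collection. A sketch of that shared pattern (the embedding model name comes from .env):

import chromadb
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=PERSIST_DIRECTORY)
db = Chroma(persist_directory=PERSIST_DIRECTORY,
            embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
            client_settings=CHROMA_SETTINGS, client=client)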
db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:faed94a1be48de21f54651ece93f3e27e36b8d70ece9812f105713ef8104fb35
+size 34115584
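
This entry and the db/*.bin entries below are Git LFS pointer files, matched by the *.sqlite3 and *.bin rules in .gitattributes: the repository stores only the version/oid/size stanza, and the real binary is fetched from LFS storage at checkout. A small sketch (hypothetical helper, not part of the commit) of reading such a stanza:

def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value"; yields e.g. {'version': ..., 'oid': ..., 'size': ...}
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields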
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a817b4dada67e3f64ef3ce4102e176fca480e057c2dac39212158444698da2d2
+size 8380000
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/header.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f9ef91d2871d89044cd5874a1689a09b8e45127f88b6f1e757908874e684a6c
+size 100
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/index_metadata.pickle
ADDED
Binary file (288 kB)
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/length.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa6a78fccf41054a748238c8168ecf7e2ddab452feeacadb00c10d6b093c404a
+size 20000
db/f3bf2f98-9353-4d29-83ca-7b0e011daddf/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff4af9781581a47abe381aa548704f949a76960b1bca7a99253c81e43fb21fdc
+size 44752
htmlTemplates.py
ADDED
@@ -0,0 +1,51 @@
+css = '''
+<style>
+.chat-message {
+    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex; box-shadow: 0 0 10px rgba(0,0,0,0.1);
+}
+.chat-message:hover {
+    box-shadow: 0 0 20px rgba(0,0,0,0.2);
+}
+.chat-message.user {
+    background-color: #2b313e
+}
+.chat-message.bot {
+    background-color: #475063
+}
+.chat-message .avatar {
+    width: 20%;
+    margin-right: 1rem;
+}
+.chat-message .avatar img {
+    max-width: 78px;
+    max-height: 78px;
+    border-radius: 50%;
+    object-fit: cover;
+}
+.chat-message .message {
+    width: 80%;
+    padding: 1rem;
+    border-radius: 0.5rem;
+    color: #fff;
+    font-family: Arial, sans-serif;
+}
+</style>
+'''
+
+bot_template = '''
+<div class="chat-message bot">
+    <div class="avatar">
+        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+    </div>
+    <div class="message">{{MSG}}</div>
+</div>
+'''
+
+user_template = '''
+<div class="chat-message user">
+    <div class="avatar">
+        <img src="https://freeimage.host/i/JIFVVIf">
+    </div>
+    <div class="message">{{MSG}}</div>
+</div>
+'''
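
The templates are plain strings with a {{MSG}} placeholder; handle_userinput in app.py fills them via str.replace and renders the HTML with st.write(..., unsafe_allow_html=True). A minimal sketch of one rendered exchange (the message strings are placeholders):

import streamlit as st
from htmlTemplates import css, bot_template, user_template

st.write(css, unsafe_allow_html=True)  # inject the stylesheet once per page
st.write(user_template.replace("{{MSG}}", "What is a Tsetlin Machine?"),
         unsafe_allow_html=True)
st.write(bot_template.replace("{{MSG}}", "An automaton-based learner built on propositional logic."),
         unsafe_allow_html=True)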
ingest.py
ADDED
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+import os
+import glob
+from typing import List
+from dotenv import load_dotenv
+from multiprocessing import Pool
+from tqdm import tqdm
+
+from langchain.document_loaders import (
+    CSVLoader,
+    EverNoteLoader,
+    PyMuPDFLoader,
+    TextLoader,
+    UnstructuredEmailLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+    PyPDFLoader
+)
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.docstore.document import Document
+
+if not load_dotenv():
+    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
+    exit(1)
+
+from constants import CHROMA_SETTINGS
+import chromadb
+
+# Load environment variables
+persist_directory = os.environ.get('PERSIST_DIRECTORY')
+source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
+embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
+chunk_size = 500
+chunk_overlap = 50
+
+
+# Custom document loaders
+# class MyElmLoader(UnstructuredEmailLoader):
+#     """Wrapper to fallback to text/plain when default does not work"""
+#
+#     def load(self) -> List[Document]:
+#         """Wrapper adding fallback for elm without html"""
+#         try:
+#             try:
+#                 doc = UnstructuredEmailLoader.load(self)
+#             except ValueError as e:
+#                 if 'text/html content not found in email' in str(e):
+#                     # Try plain text
+#                     self.unstructured_kwargs["content_source"] = "text/plain"
+#                     doc = UnstructuredEmailLoader.load(self)
+#                 else:
+#                     raise
+#         except Exception as e:
+#             # Add file_path to exception message
+#             raise type(e)(f"{self.file_path}: {e}") from e
+#
+#         return doc
+
+
+# Map file extensions to document loaders and their arguments
+LOADER_MAPPING = {
+    ".csv": (CSVLoader, {}),
+    # ".docx": (Docx2txtLoader, {}),
+    ".doc": (UnstructuredWordDocumentLoader, {}),
+    ".docx": (UnstructuredWordDocumentLoader, {}),
+    ".enex": (EverNoteLoader, {}),
+    # ".eml": (MyElmLoader, {}),
+    ".epub": (UnstructuredEPubLoader, {}),
+    ".html": (UnstructuredHTMLLoader, {}),
+    ".md": (UnstructuredMarkdownLoader, {}),
+    ".odt": (UnstructuredODTLoader, {}),
+    # ".pdf": (PyMuPDFLoader, {}),
+    ".pdf": (PyPDFLoader, {}),
+    ".ppt": (UnstructuredPowerPointLoader, {}),
+    ".pptx": (UnstructuredPowerPointLoader, {}),
+    ".txt": (TextLoader, {"encoding": "utf8"}),
+    # Add more mappings for other file extensions and loaders as needed
+}
+
+
+def load_single_document(file_path: str) -> List[Document]:
+    ext = "." + file_path.rsplit(".", 1)[-1].lower()
+    if ext in LOADER_MAPPING:
+        loader_class, loader_args = LOADER_MAPPING[ext]
+        loader = loader_class(file_path, **loader_args)
+        return loader.load()
+
+    raise ValueError(f"Unsupported file extension '{ext}'")
+
+
+def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
+    """
+    Loads all documents from the source documents directory, ignoring specified files
+    """
+    all_files = []
+    for ext in LOADER_MAPPING:
+        all_files.extend(
+            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
+        )
+        all_files.extend(
+            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
+        )
+    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
+
+    with Pool(processes=os.cpu_count()) as pool:
+        results = []
+        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
+            for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
+                results.extend(docs)
+                pbar.update()
+
+    return results
+
+
+def process_documents(ignored_files: List[str] = []) -> List[Document]:
+    """
+    Load documents and split them into chunks
+    """
+    print(f"Loading documents from {source_directory}")
+    documents = load_documents(source_directory, ignored_files)
+    if not documents:
+        print("No new documents to load")
+        exit(0)
+    print(f"Loaded {len(documents)} new documents from {source_directory}")
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    texts = text_splitter.split_documents(documents)
+    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
+    return texts
+
+
+def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
+    """
+    Checks if vectorstore exists
+    """
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    if not db.get()['documents']:
+        return False
+    return True
+
+
+def main():
+    # Create embeddings
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    # Chroma client
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
+
+    if does_vectorstore_exist(persist_directory, embeddings):
+        # Update and store the local vectorstore
+        print(f"Appending to existing vectorstore at {persist_directory}")
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
+        collection = db.get()
+        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
+        print("Creating embeddings. This may take a few minutes...")
+        db.add_documents(texts)
+    else:
+        # Create and store the local vectorstore
+        print("Creating new vectorstore")
+        texts = process_documents()
+        print("Creating embeddings. This may take a few minutes...")
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
+    db.persist()
+    db = None
+
+    print("Ingestion complete! You can now run app.py to query your documents")
+
+
+if __name__ == "__main__":
+    main()
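
LOADER_MAPPING is the dispatch table at the core of ingest.py: load_single_document picks a loader class by file extension and returns one or more Documents. A small usage sketch (the PDF path is hypothetical; importing ingest also runs its load_dotenv check, so the .env above must be present):

from ingest import LOADER_MAPPING, load_single_document

print(sorted(LOADER_MAPPING))  # ['.csv', '.doc', '.docx', '.enex', ...]
docs = load_single_document("source_documents/tsetlin_paper.pdf")  # dispatches to PyPDFLoader
print(len(docs), docs[0].metadata.get("source"))  # one Document per PDF page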
poetry.lock
ADDED
The diff for this file is too large to render.
pyproject.toml
ADDED
@@ -0,0 +1,29 @@
+[tool.poetry]
+name = "privategpt"
+version = "0.1.0"
+description = ""
+authors = ["Ivan Martinez <[email protected]>"]
+license = "Apache Version 2.0"
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.10"
+langchain = "0.0.274"
+gpt4all = "1.0.8"
+chromadb = "0.4.7"
+llama-cpp-python = "0.1.81"
+urllib3 = "2.0.4"
+PyMuPDF = "1.23.1"
+python-dotenv = "^1.0.0"
+unstructured = "0.10.8"
+extract-msg = "0.45.0"
+tabulate = "^0.9.0"
+pandoc = "^2.3"
+pypandoc = "^1.11"
+tqdm = "4.66.1"
+sentence-transformers = "2.2.2"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
requirements.txt
ADDED
@@ -0,0 +1,16 @@
+langchain==0.0.274
+streamlit==1.18.1
+altair==4
+gpt4all==1.0.8
+chromadb==0.4.7
+urllib3==2.0.4
+PyMuPDF==1.23.1
+python-dotenv==1.0.0
+unstructured==0.10.8
+extract-msg==0.45.0
+tabulate==0.9.0
+pandoc==2.3
+pypandoc==1.11
+tqdm==4.66.1
+sentence_transformers==2.2.2
+pypdf