Initial Draft
app.py
CHANGED
@@ -1,147 +1,249 @@
-import tempfile
-from langchain.chains import ConversationalRetrievalChain
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
-    mv_processing_message.text("Step2: PDF content extracted")
-        model_name="sentence-transformers/
-        model_kwargs={'device': 'cpu'}
-        device_map="cpu", max_new_tokens=2048, return_full_text=True
-    )
-    lv_hf_phi2_pipeline = HuggingFacePipeline(pipeline=lv_ms_phi2_pipeline)
-    lv_chain = ConversationalRetrievalChain.from_llm(lv_hf_phi2_pipeline, lv_vector_store.as_retriever(), return_source_documents=True)
-    lv_response = lv_chain({"question": mv_user_question, 'chat_history': lv_chat_history})
-    lv_chat_history += [(mv_user_question, lv_response["answer"])]
-    st.session_state.chat_history = lv_chat_history
-    mv_processing_message.text("Step5: LLM response generated")
-    st.text("")
-    st.text("")
-    col1, col2, col3 = st.columns(3)
-    if 'mv_temp_file_storage_dir' not in st.session_state:
-        mv_temp_file_storage_dir = tempfile.mkdtemp()
-        st.session_state.mv_temp_file_storage_dir = mv_temp_file_storage_dir
-    else:
-        mv_temp_file_storage_dir = st.session_state.mv_temp_file_storage_dir
-    st.text("")
-        mv_file_name = mv_pdf_input_file.name
-        # mv_vectorstore_file_name = os.path.join(mv_vector_storage_dir, mv_file_name[:-4] + ".vectorstore")
-        # mv_metadata_file_name = os.path.join(mv_vector_storage_dir, mv_file_name[:-4] + ".metadata")
-        if 'lv_vector_store' not in st.session_state:
-            # -- Storing Uploaded PDF locally
-            lv_temp_file_path = os.path.join(mv_temp_file_storage_dir,mv_file_name)
-            with open(lv_temp_file_path,"wb") as lv_file:
-                lv_file.write(mv_pdf_input_file.getbuffer())
-            print("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)
-            mv_processing_message.text("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)
-            # -- Extracting PDF Text
-            lv_pdf_content = fn_read_pdf(lv_temp_file_path, mv_processing_message)

import streamlit as st
import os
+import requests
+import time
+import sys

+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

+from langchain.prompts.prompt import PromptTemplate
+from langchain_community.llms import CTransformers
+from langchain.chains import RetrievalQA

+# Upload pdf file into 'pdf-data' folder if it does not exist
+def fn_upload_pdf(mv_pdf_input_file, mv_processing_message):
+    """Upload pdf file into 'pdf-data' folder if it does not exist"""

+    lv_file_name = mv_pdf_input_file.name

+    if not os.path.exists("pdf-data"):
+        os.makedirs("pdf-data")
+
+    lv_temp_file_path = os.path.join("pdf-data",lv_file_name)
+
+    if os.path.exists(lv_temp_file_path):
+        print("File already available")
+        fn_display_user_messages("File already available","Warning", mv_processing_message)
+    else:
+        with open(lv_temp_file_path,"wb") as lv_file:
+            lv_file.write(mv_pdf_input_file.getbuffer())
+
+        print("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)
+        fn_display_user_messages("Step1: PDF uploaded successfully at -> " + lv_temp_file_path, "Info", mv_processing_message)

+# Create Vector DB of uploaded PDF
+def fn_create_vector_db(mv_pdf_input_file, mv_processing_message):
+    """Create Vector DB of uploaded PDF"""
+
+    lv_file_name = mv_pdf_input_file.name[:-4] + ".vectorstore"
+
+    if not os.path.exists("vectordb/fiaas"):
+        os.makedirs("vectordb/fiaas")
+
+    lv_temp_file_path = os.path.join("vectordb/fiaas",lv_file_name)
    lv_embeddings = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-mpnet-base-v2",
+        model_kwargs={'device': 'cpu'}
+    )
+
+    if os.path.exists(lv_temp_file_path):
+        print("VectorDB already available for uploaded file")
+        fn_display_user_messages("VectorDB already available for uploaded file","Warning", mv_processing_message)
+
+        lv_vector_store = FAISS.load_local(lv_temp_file_path, lv_embeddings,allow_dangerous_deserialization=True)
+        return lv_vector_store
+    else:
+        lv_temp_pdf_file_path = os.path.join("pdf-data",mv_pdf_input_file.name)
+
+        # -- Loading PDF Data
+        lv_pdf_loader = PyPDFLoader(lv_temp_pdf_file_path)
+        lv_pdf_content = lv_pdf_loader.load()
+        print("Step2: PDF content extracted")
+        fn_display_user_messages("Step2: PDF content extracted", "Info", mv_processing_message)
+
+        # -- Chunking PDF Data
+        lv_text_splitter = CharacterTextSplitter(
+            separator="\n",
+            chunk_size=300,
+            chunk_overlap=30,
+            length_function=len
+        )
+        lv_pdf_chunk_documents = lv_text_splitter.split_documents(lv_pdf_content)
+        print("Step3: PDF content chunked and document object created")
+        fn_display_user_messages("Step3: PDF content chunked and document object created", "Info", mv_processing_message)
+
+        # -- Creating FAISS Vector Store
+        lv_vector_store = FAISS.from_documents(lv_pdf_chunk_documents, lv_embeddings)
+        print("Step4: Vector store created")
+        fn_display_user_messages("Step4: Vector store created", "Info", mv_processing_message)
+        lv_vector_store.save_local(lv_temp_file_path)

+        return lv_vector_store
+
+# Display user Error, Warning or Success Message
+def fn_display_user_messages(lv_text, lv_type, mv_processing_message):
+    """Display user Info, Error, Warning or Success Message"""
+
+    if lv_type == "Success":
+        with mv_processing_message.container():
+            st.success(lv_text)
+    elif lv_type == "Error":
+        with mv_processing_message.container():
+            st.error(lv_text)
+    elif lv_type == "Warning":
+        with mv_processing_message.container():
+            st.warning(lv_text)
+    else:
+        with mv_processing_message.container():
+            st.info(lv_text)
+
+# Download TheBloke Models
+def fn_download_llm_models(mv_selected_model, mv_processing_message):
+    """Download TheBloke Models"""
+
+    lv_download_url = ""
+
+    print("Downloading TheBloke of "+mv_selected_model)
+    fn_display_user_messages("Downloading TheBloke of "+mv_selected_model, "Info", mv_processing_message)
+
+    if mv_selected_model == 'microsoft/phi-2':
+        lv_download_url = "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf"
+    elif mv_selected_model == 'mistralai/Mistral-7B-Instruct-v0.2':
+        lv_download_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf"
+
+    if not os.path.exists("model"):
+        os.makedirs("model")
+
+    lv_filename = os.path.basename(lv_download_url)
+    lv_temp_file_path = os.path.join("model",lv_filename)
+
+    if os.path.exists(lv_temp_file_path):
+        print("Model already available")
+        fn_display_user_messages("Model already available","Warning", mv_processing_message)
+    else:
+        lv_response = requests.get(lv_download_url, stream=True)
+        if lv_response.status_code == 200:
+            with open(lv_temp_file_path, 'wb') as f:
+                for chunk in lv_response.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+
+            print("Download completed")
+            fn_display_user_messages("Model download completed","Info", mv_processing_message)
+        else:
+            print(f"Model download failed with status code {lv_response.status_code}")
+            fn_display_user_messages(f"Model download failed with status code {lv_response.status_code}","Error", mv_processing_message)

# Function return QA Response using Vector Store
def fn_generate_QnA_response(mv_selected_model, mv_user_question, lv_vector_store, mv_processing_message):
    """Returns QA Response using Vector Store"""

+    lv_model_path = ""
+    lv_model_type = ""
+    lv_template = """Instruction:
+    You are an AI assistant for answering questions about the provided context.
+    You are given the following extracted parts of a long document and a question. Provide a detailed answer.
+    If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+    =======
+    {context}
+    =======
+    Question: {question}
+    Output:\n"""
+    lv_qa_prompt = PromptTemplate(
+        template=lv_template,
+        input_variables=["question", "context"]
+    )

+    if mv_selected_model == 'microsoft/phi-2':
+        lv_model_path = "model/phi-2.Q2_K.gguf"
+        lv_model_type = "pi"
+    elif mv_selected_model == 'mistralai/Mistral-7B-Instruct-v0.2':
+        lv_model_path = "model/mistral-7b-instruct-v0.2.Q2_K.gguf"
+        lv_model_type = "mistral"
+
+    print("Model Absolute location -" +lv_model_path)

    print("Step4: Generating LLM response")
+    fn_display_user_messages("Step4: Generating LLM response","Info", mv_processing_message)

+    lv_model = CTransformers(
+        model=lv_model_path,
+        model_type=lv_model_type,
+        max_new_tokens=2048,
+        temperature=0.00
+    )
+    lv_retriever = lv_vector_store.as_retriever(search_kwargs={'k': 2})
+    lv_qa_chain = RetrievalQA.from_chain_type(llm=lv_model,
+                                              chain_type='stuff',
+                                              retriever=lv_retriever,
+                                              return_source_documents=True,
+                                              chain_type_kwargs={'prompt': lv_qa_prompt}
+                                              )

+    lv_response = lv_qa_chain({"query": mv_user_question})

+    print("Step5: LLM response generated")
+    fn_display_user_messages("Step5: LLM response generated","Info", mv_processing_message)

+    return lv_response['result']

# Main Function
def main():

    # -- Streamlit Settings
    st.set_page_config(layout='wide')
+    col1, col2, col3 = st.columns(3)
+    col2.title("Chat with PDF")
+    st.text("")
+
    # -- Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []

+    # -- Display Supported Models
    col1, col2, col3 = st.columns(3)
+    mv_selected_model = col3.selectbox('Select Model',
+                                       [
+                                           'microsoft/phi-2',
+                                           'mistralai/Mistral-7B-Instruct-v0.2'
+                                       ]
+                                      )

+    # -- Display Supported Vector Stores
    col1, col2, col3 = st.columns(3)
+    mv_selected_vector_db = col3.selectbox('Select Vector DB', ['FAISS'])
    st.text("")

    # -- Reading PDF File
+    col1, col2, col3 = st.columns(3)
    mv_pdf_input_file = col2.file_uploader("Choose a PDF file:", type=["pdf"])

+    # -- Display Processing Details
    st.text("")
+    col1, col2, col3 = st.columns(3)
+    mv_processing_message = col2.empty()
    st.text("")

+    # -- Downloading Model Files
+    fn_download_llm_models(mv_selected_model, mv_processing_message)
+
+    # -- Processing PDF
    if (mv_pdf_input_file is not None):

+        # -- Upload PDF
+        fn_upload_pdf(mv_pdf_input_file, mv_processing_message)
+
+        # -- Create Vector Index
+        lv_vector_store = fn_create_vector_db(mv_pdf_input_file, mv_processing_message)

+        # -- Perform RAG
        col1, col2, col3 = st.columns(3)
+        st.text("")
        lv_chat_history = col2.chat_message
+        st.text("")

        if mv_user_question := col2.chat_input("Chat on PDF Data"):
            # -- Add user message to chat history
@@ -153,11 +255,14 @@ def main():
            # -- Adding assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": lv_response})

+        # -- Display chat messages from history on app rerun
+        for message in st.session_state.messages:
+            with lv_chat_history(message["role"]):
+                st.markdown(message["content"])
+
+        # -- Validate Data

+        # -- Get Web Response

# Calling Main Function
if __name__ == '__main__':