rahgadda committed · verified
Commit 021440c · 1 Parent(s): 20b3e18

Initial Draft

Files changed (1)
  1. app.py +204 -99
app.py CHANGED
@@ -1,147 +1,249 @@
  import streamlit as st
- import tempfile
  import os
- import re
- import torch

- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextIteratorStreamer
- from langchain_community.document_loaders import PyPDFLoader
  from langchain.vectorstores.faiss import FAISS
- from langchain.chains import ConversationalRetrievalChain
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

- # Function return langchain document object of PDF pages
- def fn_read_pdf(lv_temp_file_path, mv_processing_message):
-     """Returns langchain document object of PDF pages"""
-
-     lv_pdf_loader = PyPDFLoader(lv_temp_file_path)
-     lv_pdf_content = lv_pdf_loader.load()
-     print("Step2: PDF content extracted")
-     mv_processing_message.text("Step2: PDF content extracted")
-
-     return lv_pdf_content

- # Function return FAISS Vector store
- def fn_create_faiss_vector_store(lv_pdf_content, mv_processing_message):
-     """Returns FAISS vector store index of PDF Content"""

      lv_embeddings = HuggingFaceEmbeddings(
-         model_name="sentence-transformers/msmarco-distilbert-base-v4",
-         model_kwargs={'device': 'cpu'},
-         encode_kwargs={'normalize_embeddings': False}
-     )
-     lv_vector_store = FAISS.from_documents(lv_pdf_content, lv_embeddings)
-     print("Step3: Vector store created")
-     mv_processing_message.text("Step3: Vector store created")

-     return lv_vector_store

  # Function return QA Response using Vector Store
  def fn_generate_QnA_response(mv_selected_model, mv_user_question, lv_vector_store, mv_processing_message):
      """Returns QA Response using Vector Store"""

-     lv_chat_history = []

-     if "chat_history" not in st.session_state:
-         st.session_state.chat_history = []
-     else:
-         lv_chat_history = st.session_state.chat_history

      print("Step4: Generating LLM response")
-     mv_processing_message.text("Step4: Generating LLM response")

-     lv_tokenizer = AutoTokenizer.from_pretrained(
-         mv_selected_model,
-         model_max_length=2048,
-         trust_remote_code=True
-     )
-     lv_model = AutoModelForCausalLM.from_pretrained(
-         mv_selected_model,
-         device_map="cpu",
-         trust_remote_code=True
-     )
-     lv_ms_phi2_pipeline = pipeline(
-         "text-generation", tokenizer=lv_tokenizer, model=lv_model,
-         pad_token_id=lv_tokenizer.eos_token_id, eos_token_id=lv_tokenizer.eos_token_id,
-         device_map="cpu", max_new_tokens=2048, return_full_text=True
-     )
-
-     lv_hf_phi2_pipeline = HuggingFacePipeline(pipeline=lv_ms_phi2_pipeline)
-     lv_chain = ConversationalRetrievalChain.from_llm(lv_hf_phi2_pipeline, lv_vector_store.as_retriever(), return_source_documents=True)
-     lv_response = lv_chain({"question": mv_user_question, 'chat_history': lv_chat_history})
-
-     lv_chat_history += [(mv_user_question, lv_response["answer"])]
-     st.session_state.chat_history = lv_chat_history

-     print("Step5: LLM response generated")
-     mv_processing_message.text("Step5: LLM response generated")

-     return lv_response['answer']

  # Main Function
  def main():

      # -- Streamlit Settings
      st.set_page_config(layout='wide')
-
      # -- Initialize chat history
      if "messages" not in st.session_state:
          st.session_state.messages = []

      col1, col2, col3 = st.columns(3)
-     col2.title("Chat with your PDF")
-     st.text("")

      col1, col2, col3 = st.columns(3)
-     mv_selected_model=col3.selectbox('Select Model',['microsoft/phi-2'])
-     st.text("")
      st.text("")
-     st.text("")
-     col1, col2, col3 = st.columns(3)

      # -- Reading PDF File
      mv_pdf_input_file = col2.file_uploader("Choose a PDF file:", type=["pdf"])
-
-     if 'mv_temp_file_storage_dir' not in st.session_state:
-         mv_temp_file_storage_dir = tempfile.mkdtemp()
-         st.session_state.mv_temp_file_storage_dir = mv_temp_file_storage_dir
-     else:
-         mv_temp_file_storage_dir = st.session_state.mv_temp_file_storage_dir

-     mv_processing_message = col2.empty()
-     st.text("")
      st.text("")
      st.text("")

-     mv_vector_storage_dir = "/workspace/knowledge-base/01-ML/01-dev/adhoc/Talk2PDF/vector_store"
-
      if (mv_pdf_input_file is not None):
-         mv_file_name = mv_pdf_input_file.name
-         # mv_vectorstore_file_name = os.path.join(mv_vector_storage_dir, mv_file_name[:-4] + ".vectorstore")
-         # mv_metadata_file_name = os.path.join(mv_vector_storage_dir, mv_file_name[:-4] + ".metadata")
-
-         if 'lv_vector_store' not in st.session_state:
-             # -- Storing Uploaded PDF locally
-             lv_temp_file_path = os.path.join(mv_temp_file_storage_dir,mv_file_name)
-             with open(lv_temp_file_path,"wb") as lv_file:
-                 lv_file.write(mv_pdf_input_file.getbuffer())
-             print("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)
-             mv_processing_message.text("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)
-
-             # -- Extracting PDF Text
-             lv_pdf_content = fn_read_pdf(lv_temp_file_path, mv_processing_message)

-             # -- Creating FAISS Vector Store
-             lv_vector_store = fn_create_faiss_vector_store(lv_pdf_content, mv_processing_message)
-             st.session_state.lv_vector_store = lv_vector_store
-         else:
-             lv_vector_store = st.session_state.lv_vector_store

-         # -- Taking input question and generate answer
          col1, col2, col3 = st.columns(3)
          lv_chat_history = col2.chat_message

          if mv_user_question := col2.chat_input("Chat on PDF Data"):
              # -- Add user message to chat history
@@ -153,11 +255,14 @@ def main():
              # -- Adding assistant response to chat history
              st.session_state.messages.append({"role": "assistant", "content": lv_response})

-             # -- Display chat messages from history on app rerun
-             for message in st.session_state.messages:
-                 with lv_chat_history(message["role"]):
-                     st.markdown(message["content"])


  # Calling Main Function
  if __name__ == '__main__':
 
  import streamlit as st
  import os
+ import requests
+ import time
+ import sys

+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
  from langchain.vectorstores.faiss import FAISS

+ from langchain.prompts.prompt import PromptTemplate
+ from langchain_community.llms import CTransformers
+ from langchain.chains import RetrievalQA

+ # Upload pdf file into 'pdf-data' folder if it does not exist
+ def fn_upload_pdf(mv_pdf_input_file, mv_processing_message):
+     """Upload pdf file into 'pdf-data' folder if it does not exist"""
+
+     lv_file_name = mv_pdf_input_file.name
+
+     if not os.path.exists("pdf-data"):
+         os.makedirs("pdf-data")
+
+     lv_temp_file_path = os.path.join("pdf-data",lv_file_name)
+
+     if os.path.exists(lv_temp_file_path):
+         print("File already available")
+         fn_display_user_messages("File already available","Warning", mv_processing_message)
+     else:
+         with open(lv_temp_file_path,"wb") as lv_file:
+             lv_file.write(mv_pdf_input_file.getbuffer())
+
+         print("Step1: PDF uploaded successfully at -> " + lv_temp_file_path)
+         fn_display_user_messages("Step1: PDF uploaded successfully at -> " + lv_temp_file_path, "Info", mv_processing_message)
+ # Create Vector DB of uploaded PDF
+ def fn_create_vector_db(mv_pdf_input_file, mv_processing_message):
+     """Create Vector DB of uploaded PDF"""
+
+     lv_file_name = mv_pdf_input_file.name[:-4] + ".vectorstore"
+
+     if not os.path.exists("vectordb/faiss"):
+         os.makedirs("vectordb/faiss")
+
+     lv_temp_file_path = os.path.join("vectordb/faiss",lv_file_name)
      lv_embeddings = HuggingFaceEmbeddings(
+         model_name="sentence-transformers/all-mpnet-base-v2",
+         model_kwargs={'device': 'cpu'}
+     )
+
+     if os.path.exists(lv_temp_file_path):
+         print("VectorDB already available for uploaded file")
+         fn_display_user_messages("VectorDB already available for uploaded file","Warning", mv_processing_message)
+
+         lv_vector_store = FAISS.load_local(lv_temp_file_path, lv_embeddings, allow_dangerous_deserialization=True)
+         return lv_vector_store
+     else:
+         lv_temp_pdf_file_path = os.path.join("pdf-data",mv_pdf_input_file.name)
+
+         # -- Loading PDF Data
+         lv_pdf_loader = PyPDFLoader(lv_temp_pdf_file_path)
+         lv_pdf_content = lv_pdf_loader.load()
+         print("Step2: PDF content extracted")
+         fn_display_user_messages("Step2: PDF content extracted", "Info", mv_processing_message)
+
+         # -- Chunking PDF Data
+         lv_text_splitter = CharacterTextSplitter(
+             separator="\n",
+             chunk_size=300,
+             chunk_overlap=30,
+             length_function=len
+         )
+         lv_pdf_chunk_documents = lv_text_splitter.split_documents(lv_pdf_content)
+         print("Step3: PDF content chunked and document object created")
+         fn_display_user_messages("Step3: PDF content chunked and document object created", "Info", mv_processing_message)
+
+         # -- Creating FAISS Vector Store
+         lv_vector_store = FAISS.from_documents(lv_pdf_chunk_documents, lv_embeddings)
+         print("Step4: Vector store created")
+         fn_display_user_messages("Step4: Vector store created", "Info", mv_processing_message)
+         lv_vector_store.save_local(lv_temp_file_path)

+         return lv_vector_store
+
+ # Display user Error, Warning or Success Message
+ def fn_display_user_messages(lv_text, lv_type, mv_processing_message):
+     """Display user Info, Error, Warning or Success Message"""
+
+     if lv_type == "Success":
+         with mv_processing_message.container():
+             st.success(lv_text)
+     elif lv_type == "Error":
+         with mv_processing_message.container():
+             st.error(lv_text)
+     elif lv_type == "Warning":
+         with mv_processing_message.container():
+             st.warning(lv_text)
+     else:
+         with mv_processing_message.container():
+             st.info(lv_text)
+
+ # Download TheBloke Models
+ def fn_download_llm_models(mv_selected_model, mv_processing_message):
+     """Download TheBloke Models"""
+
+     lv_download_url = ""
+
+     print("Downloading TheBloke of " + mv_selected_model)
+     fn_display_user_messages("Downloading TheBloke of " + mv_selected_model, "Info", mv_processing_message)
+
+     if mv_selected_model == 'microsoft/phi-2':
+         lv_download_url = "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf"
+     elif mv_selected_model == 'mistralai/Mistral-7B-Instruct-v0.2':
+         lv_download_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf"
+
+     if not os.path.exists("model"):
+         os.makedirs("model")
+
+     lv_filename = os.path.basename(lv_download_url)
+     lv_temp_file_path = os.path.join("model",lv_filename)
+
+     if os.path.exists(lv_temp_file_path):
+         print("Model already available")
+         fn_display_user_messages("Model already available","Warning", mv_processing_message)
+     else:
+         lv_response = requests.get(lv_download_url, stream=True)
+         if lv_response.status_code == 200:
+             with open(lv_temp_file_path, 'wb') as f:
+                 for chunk in lv_response.iter_content(chunk_size=1024):
+                     if chunk:
+                         f.write(chunk)
+
+             print("Download completed")
+             fn_display_user_messages("Model download completed","Info", mv_processing_message)
+         else:
+             print(f"Model download failed with status {lv_response.status_code}")
+             fn_display_user_messages(f"Model download failed with status {lv_response.status_code}","Error", mv_processing_message)

  # Function return QA Response using Vector Store
  def fn_generate_QnA_response(mv_selected_model, mv_user_question, lv_vector_store, mv_processing_message):
      """Returns QA Response using Vector Store"""

+     lv_model_path = ""
+     lv_model_type = ""
+     lv_template = """Instruction:
+     You are an AI assistant for answering questions about the provided context.
+     You are given the following extracted parts of a long document and a question. Provide a detailed answer.
+     If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+     =======
+     {context}
+     =======
+     Question: {question}
+     Output:\n"""
+     lv_qa_prompt = PromptTemplate(
+         template=lv_template,
+         input_variables=["question", "context"]
+     )
+
+     if mv_selected_model == 'microsoft/phi-2':
+         lv_model_path = "model/phi-2.Q2_K.gguf"
+         lv_model_type = "phi"
+     elif mv_selected_model == 'mistralai/Mistral-7B-Instruct-v0.2':
+         lv_model_path = "model/mistral-7b-instruct-v0.2.Q2_K.gguf"
+         lv_model_type = "mistral"
+
+     print("Model Absolute location - " + lv_model_path)

      print("Step4: Generating LLM response")
+     fn_display_user_messages("Step4: Generating LLM response","Info", mv_processing_message)

+     lv_model = CTransformers(
+         model=lv_model_path,
+         model_type=lv_model_type,
+         max_new_tokens=2048,
+         temperature=0.00
+     )
+     lv_retriever = lv_vector_store.as_retriever(search_kwargs={'k': 2})
+     lv_qa_chain = RetrievalQA.from_chain_type(
+         llm=lv_model,
+         chain_type='stuff',
+         retriever=lv_retriever,
+         return_source_documents=True,
+         chain_type_kwargs={'prompt': lv_qa_prompt}
+     )

+     lv_response = lv_qa_chain({"query": mv_user_question})

+     print("Step5: LLM response generated")
+     fn_display_user_messages("Step5: LLM response generated","Info", mv_processing_message)

+     return lv_response['result']

  # Main Function
  def main():

      # -- Streamlit Settings
      st.set_page_config(layout='wide')
+     col1, col2, col3 = st.columns(3)
+     col2.title("Chat with PDF")
+     st.text("")
+
      # -- Initialize chat history
      if "messages" not in st.session_state:
          st.session_state.messages = []

+     # -- Display Supported Models
      col1, col2, col3 = st.columns(3)
+     mv_selected_model = col3.selectbox('Select Model',
+                                        [
+                                            'microsoft/phi-2',
+                                            'mistralai/Mistral-7B-Instruct-v0.2'
+                                        ]
+                                       )

+     # -- Display Supported Vector Stores
      col1, col2, col3 = st.columns(3)
+     mv_selected_vector_db = col3.selectbox('Select Vector DB', ['FAISS'])
      st.text("")

      # -- Reading PDF File
+     col1, col2, col3 = st.columns(3)
      mv_pdf_input_file = col2.file_uploader("Choose a PDF file:", type=["pdf"])

+     # -- Display Processing Details
      st.text("")
+     col1, col2, col3 = st.columns(3)
+     mv_processing_message = col2.empty()
      st.text("")

+     # -- Downloading Model Files
+     fn_download_llm_models(mv_selected_model, mv_processing_message)
+
+     # -- Processing PDF
      if (mv_pdf_input_file is not None):

+         # -- Upload PDF
+         fn_upload_pdf(mv_pdf_input_file, mv_processing_message)
+
+         # -- Create Vector Index
+         lv_vector_store = fn_create_vector_db(mv_pdf_input_file, mv_processing_message)

+         # -- Perform RAG
          col1, col2, col3 = st.columns(3)
+         st.text("")
          lv_chat_history = col2.chat_message
+         st.text("")

          if mv_user_question := col2.chat_input("Chat on PDF Data"):
              # -- Add user message to chat history

              # -- Adding assistant response to chat history
              st.session_state.messages.append({"role": "assistant", "content": lv_response})

+             # -- Display chat messages from history on app rerun
+             for message in st.session_state.messages:
+                 with lv_chat_history(message["role"]):
+                     st.markdown(message["content"])
+
+     # -- Validate Data
+
+     # -- Get Web Response

  # Calling Main Function
  if __name__ == '__main__':
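
For reference, the retrieval flow introduced in this commit can also be exercised outside Streamlit. The sketch below is illustrative and not part of the commit: it assumes fn_create_vector_db has already saved an index for a file named sample.pdf under vectordb/faiss/, that fn_download_llm_models has placed the Mistral GGUF file under model/, and that the ctransformers package is installed. The question string is made up, and the CTransformers generation settings are passed via the config dict documented for the LangChain wrapper rather than the keyword style used in the app code.

# standalone_query.py - minimal sketch of the same FAISS + CTransformers + RetrievalQA pattern
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA

# Same embedding model the app uses to build and reload the index
lv_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'}
)

# Folder written by fn_create_vector_db for an uploaded "sample.pdf" (assumed to exist)
lv_vector_store = FAISS.load_local(
    "vectordb/faiss/sample.vectorstore",
    lv_embeddings,
    allow_dangerous_deserialization=True
)

# GGUF file downloaded by fn_download_llm_models (assumed to exist)
lv_model = CTransformers(
    model="model/mistral-7b-instruct-v0.2.Q2_K.gguf",
    model_type="mistral",
    config={'max_new_tokens': 512, 'temperature': 0.0}
)

# Retrieve the 2 most similar chunks and answer with the "stuff" chain, as in the app
lv_qa_chain = RetrievalQA.from_chain_type(
    llm=lv_model,
    chain_type="stuff",
    retriever=lv_vector_store.as_retriever(search_kwargs={'k': 2})
)

print(lv_qa_chain({"query": "What is this document about?"})["result"])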