demoPOC committed
Commit 2ec1a1d · 1 parent: 288a785

Update app.py

Files changed (1): app.py (+28 −6)
app.py CHANGED

@@ -178,15 +178,37 @@ def getRAGChain(customerName,customerDistrict, custDetailsPresent,vectordb):
     )
     return chain
 
-def createVectorDB(documents):
+def createVectorDB(documents, embeddingModelID):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
-    texts = text_splitter.split_documents(documents)
+    texts = []
+    for document in documents:
+        tokenized_input = tokenizer.tokenize(document.page_content)
+        print("Token Count::::::::::" + str(len(tokenized_input)))
+        if (len(tokenized_input) > 1000):
+            print("Splitting Content using RTS")
+            splitted_doc = text_splitter.split_documents([document])
+            texts.extend(splitted_doc)
+            # for text in texts:
+            #     print("splitted content:" + str(len(text.page_content)))
+            #     print(text.page_content)
+        elif (len(tokenized_input) < 1000 and len(tokenized_input) > 1):
+            texts.append(document)
+    # texts = text_splitter.split_documents(documents)
     print("All chunk List START ***********************\n\n")
     pretty_print_docs(texts)
     print("All chunk List END ***********************\n\n")
-    embeddings = getEmbeddingModel(0)
-    vectordb = Chroma.from_documents(texts, embeddings)
+    embeddings = getEmbeddingModel(embeddingModelID)
+    print("Embedding Started >>>>>>>>>>>>>>>>>>", datetime.now().strftime("%H:%M:%S"))
+    vectordb = Chroma.from_documents(texts, embeddings, collection_metadata={"hnsw:space": "cosine"})
+    print("Vector Store Creation Completed*********************************\n\n")
     return vectordb
+    # texts = text_splitter.split_documents(documents)
+    # print("All chunk List START ***********************\n\n")
+    # pretty_print_docs(texts)
+    # print("All chunk List END ***********************\n\n")
+    # embeddings = getEmbeddingModel(0)
+    # vectordb = Chroma.from_documents(texts, embeddings)
+    # return vectordb
 
 def createPrompt(cName, cCity, custDetailsPresent):
     cProfile = "Customer's Name is " + cName + "\nCustomer's lives in or customer's Resident State or Customer's place is " + cCity + "\n"
@@ -213,7 +235,7 @@ def createPrompt(cName, cCity, custDetailsPresent):
     PROMPT = PromptTemplate(template=prompt_template, input_variables=["history", "context", "question"])
     return PROMPT
 
-vectordb = createVectorDB(loadKB(False, False, uploads_dir, None))
+vectordb = createVectorDB(loadKB(False, False, uploads_dir, None), defaultEmbeddingModelID)
 
 @app.route('/', methods=['GET'])
 def test():
@@ -287,7 +309,7 @@ def file_Upload():
     embeddingModelID = int(request.form.getlist('embeddingModelID')[0])
     global vectordb
     vectordb = createVectorDB(documents, embeddingModelID)
-    vectordb=createVectorDB(documents)
+    #vectordb=createVectorDB(documents)
     return render_template("index.html")
 
 if __name__ == '__main__':
 
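For reference, the sketch below restates the chunking and indexing strategy this commit introduces, outside the Flask app: documents longer than 1000 tokens are split with RecursiveCharacterTextSplitter (chunk_size=1500, chunk_overlap=150), shorter non-empty documents are indexed whole, and the Chroma collection is built with cosine distance. It is a minimal sketch, not the app's code: the tokenizer and embedding model names are assumptions, standing in for app.py's own global tokenizer and getEmbeddingModel(embeddingModelID), which are not shown in this diff.

# Minimal sketch of the commit's token-count-gated chunking and cosine Chroma index.
# Model names are illustrative assumptions; import paths vary by langchain version.
from datetime import datetime

from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Assumed tokenizer; app.py defines its own `tokenizer` elsewhere.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

def create_vector_db(documents):
    texts = []
    for document in documents:
        token_count = len(tokenizer.tokenize(document.page_content))
        if token_count > 1000:
            # Long document: split into overlapping character chunks.
            texts.extend(text_splitter.split_documents([document]))
        elif token_count > 1:
            # Short but non-empty document: index it as a single chunk.
            texts.append(document)
    # Assumed embedding model; app.py resolves this via getEmbeddingModel(embeddingModelID).
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("Embedding started", datetime.now().strftime("%H:%M:%S"))
    # Use cosine distance for the HNSW index instead of Chroma's default squared L2.
    return Chroma.from_documents(texts, embeddings, collection_metadata={"hnsw:space": "cosine"})

if __name__ == "__main__":
    docs = [
        Document(page_content="Example knowledge-base article. " * 200),  # ~>1000 tokens, gets split
        Document(page_content="A short note."),                           # kept whole
    ]
    vectordb = create_vector_db(docs)
    print(vectordb.similarity_search("example query", k=2))

The one behavioral point worth noting is the collection_metadata={"hnsw:space": "cosine"} argument: without it, Chroma scores neighbors by squared L2 distance, so retrieval results (and any relevance-score thresholds) can differ from what a cosine-similarity-based prompt pipeline expects.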