hanzla committed on
Commit
f2473b4
·
1 Parent(s): e15ad80
Files changed (3) hide show
  1. src/app.py +1 -1
  2. src/interface.py +1 -1
  3. src/pdfchatbot.py +11 -4
src/app.py CHANGED
@@ -5,7 +5,7 @@ demo, chat_history, show_img, txt, submit_button, uploaded_pdf, slider_chunk_siz
5
 
6
  pdf_chatbot = PDFChatBot()
7
  with demo:
8
- uploaded_pdf.upload(pdf_chatbot.render_file, inputs=[uploaded_pdf,slider_chunk_size], outputs=[show_img])
9
 
10
  submit_button.click(pdf_chatbot.add_text, inputs=[chat_history, txt], outputs=[chat_history], queue=False).\
11
  success(pdf_chatbot.generate_response, inputs=[chat_history, txt, uploaded_pdf], outputs=[chat_history,txt]).\
 
5
 
6
  pdf_chatbot = PDFChatBot()
7
  with demo:
8
+ uploaded_pdf.upload(pdf_chatbot.render_file, inputs=[uploaded_pdf,slider_chunk_size,slider_overlap_percentage,slider_temp,slider_k], outputs=[show_img])
9
 
10
  submit_button.click(pdf_chatbot.add_text, inputs=[chat_history, txt], outputs=[chat_history], queue=False).\
11
  success(pdf_chatbot.generate_response, inputs=[chat_history, txt, uploaded_pdf], outputs=[chat_history,txt]).\
src/interface.py CHANGED
@@ -28,7 +28,7 @@ def create_demo():
28
  )
29
  with gr.Row():
30
  slider_overlap_percentage = gr.Slider(
31
- minimum=0, maximum=100, value=50, label="Chunk Overlap Percentage", elem_id='slider2'
32
  )
33
  with gr.Row():
34
  slider_temp = gr.Slider(
 
28
  )
29
  with gr.Row():
30
  slider_overlap_percentage = gr.Slider(
31
+ minimum=0, maximum=99, value=50, label="Chunk Overlap Percentage", elem_id='slider2'
32
  )
33
  with gr.Row():
34
  slider_temp = gr.Slider(
src/pdfchatbot.py CHANGED
@@ -34,7 +34,10 @@ class PDFChatBot:
34
  self.pipeline = None
35
  self.chain = None
36
  self.chunk_size = None
 
 
37
  self.current_context = None
 
38
  self.format_seperator="""\n\n--\n\n"""
39
  self.pipe = None
40
  #self.chunk_size_slider = chunk_size_slider
@@ -45,9 +48,10 @@ class PDFChatBot:
45
  print("Embedding model loaded")
46
 
47
  def load_vectordb(self):
 
48
  text_splitter = RecursiveCharacterTextSplitter(
49
- chunk_size=256,
50
- chunk_overlap=100,
51
  length_function=len,
52
  add_start_index=True,
53
  )
@@ -69,7 +73,7 @@ class PDFChatBot:
69
  print("Model pipeline loaded")
70
 
71
  def get_organic_context(self, query):
72
- documents = self.vectordb.similarity_search_with_relevance_scores(query, k=3)
73
  context = self.format_seperator.join([doc.page_content for doc, score in documents])
74
  self.current_context = context
75
  print("Context Ready")
@@ -135,11 +139,14 @@ class PDFChatBot:
135
  history[-1][-1] += char
136
  return history,""
137
 
138
- def render_file(self, file,chunk_size):
139
  print(chunk_size)
140
  doc = fitz.open(file.name)
141
  page = doc[self.page]
142
  self.chunk_size = chunk_size
 
 
 
143
  pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
144
  image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
145
  return image
 
34
  self.pipeline = None
35
  self.chain = None
36
  self.chunk_size = None
37
+ self.overlap_percentage = None
38
+ self.max_chunks_in_context = None
39
  self.current_context = None
40
+ self.model_temperatue = None
41
  self.format_seperator="""\n\n--\n\n"""
42
  self.pipe = None
43
  #self.chunk_size_slider = chunk_size_slider
 
48
  print("Embedding model loaded")
49
 
50
  def load_vectordb(self):
51
+ overlap = (self.overlap_percentage/100) * self.chunk_size
52
  text_splitter = RecursiveCharacterTextSplitter(
53
+ chunk_size=self.chunk_size,
54
+ chunk_overlap=overlap,
55
  length_function=len,
56
  add_start_index=True,
57
  )
 
73
  print("Model pipeline loaded")
74
 
75
  def get_organic_context(self, query):
76
+ documents = self.vectordb.similarity_search_with_relevance_scores(query, k=self.max_chunks_in_context)
77
  context = self.format_seperator.join([doc.page_content for doc, score in documents])
78
  self.current_context = context
79
  print("Context Ready")
 
139
  history[-1][-1] += char
140
  return history,""
141
 
142
+ def render_file(self, file,chunk_size,chunk_overlap_percentage,model_temperature,max_chunks_in_context):
143
  print(chunk_size)
144
  doc = fitz.open(file.name)
145
  page = doc[self.page]
146
  self.chunk_size = chunk_size
147
+ self.overlap_percentage = chunk_overlap_percentage
148
+ self.model_temperatue = model_temperature
149
+ self.max_chunks_in_context = max_chunks_in_context
150
  pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
151
  image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
152
  return image