ValakiJay1706 committed on
Commit
47c4d2b
·
verified ·
1 Parent(s): 2fce29d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -14
app.py CHANGED
@@ -167,11 +167,14 @@ def extract_text_from_pdf(pdf_path):
167
 
168
  for page_index in range(len(pdf_file)):
169
  page = pdf_file.load_page(page_index)
170
- text = page.get_text()
171
- all_text += text + "\n"
 
172
 
173
  pdf_file.close()
174
- return all_text
 
 
175
 
176
  except Exception as e:
177
  print(f"Error extracting text from PDF: {e}")
@@ -196,6 +199,8 @@ def extract_images_from_pdf(pdf_path):
196
  images.append(image)
197
 
198
  pdf_file.close()
 
 
199
  return images
200
 
201
  except Exception as e:
@@ -212,9 +217,11 @@ def recognize_text(image):
212
  recognized_text = ""
213
  for (bbox, text, prob) in result:
214
  if prob > 0.2:
215
- recognized_text += f'{text}\n'
216
-
217
- return recognized_text
 
 
218
 
219
  except Exception as e:
220
  print(f"Error recognizing text from image: {e}")
@@ -227,15 +234,24 @@ def ocr_text_from_pdf(pdf_path):
227
 
228
  for image in images:
229
  text = recognize_text(image)
230
- all_text += text
 
231
 
232
- return all_text
 
 
233
 
234
  def extract_all_text_from_pdf(pdf_path):
235
  """Extract both direct text and OCR text from a PDF."""
236
  direct_text = extract_text_from_pdf(pdf_path)
237
  ocr_text = ocr_text_from_pdf(pdf_path)
238
- return direct_text + "\n" + ocr_text
 
 
 
 
 
 
239
 
240
 
241
 
@@ -634,12 +650,7 @@ def main():
634
  file = st.file_uploader("Upload PDF Files")
635
  if file is not None:
636
  try:
637
- # pdf_path = "path/to/your/pdf_file.pdf"
638
-
639
- # Extract text from the PDF
640
  text = extract_all_text_from_pdf(file)
641
- # print(extracted_text)
642
-
643
  # text = get_pdf_text(file)
644
  except Exception as e:
645
  st.error(f"Error reading PDF file: {str(e)}")
 
167
 
168
  for page_index in range(len(pdf_file)):
169
  page = pdf_file.load_page(page_index)
170
+ text = page.get_text("text")
171
+ if text.strip(): # Check if the text is not empty
172
+ all_text += text.replace('\n', ' ') + " "
173
 
174
  pdf_file.close()
175
+ if not all_text.strip():
176
+ print("No direct text found in the PDF.")
177
+ return all_text.strip() # Strip any leading/trailing whitespace
178
 
179
  except Exception as e:
180
  print(f"Error extracting text from PDF: {e}")
 
199
  images.append(image)
200
 
201
  pdf_file.close()
202
+ if not images:
203
+ print("No images found in the PDF.")
204
  return images
205
 
206
  except Exception as e:
 
217
  recognized_text = ""
218
  for (bbox, text, prob) in result:
219
  if prob > 0.2:
220
+ recognized_text += f'{text} '
221
+
222
+ if not recognized_text.strip():
223
+ print("No text recognized from the image.")
224
+ return recognized_text.strip() # Strip any leading/trailing whitespace
225
 
226
  except Exception as e:
227
  print(f"Error recognizing text from image: {e}")
 
234
 
235
  for image in images:
236
  text = recognize_text(image)
237
+ if text.strip(): # Check if the recognized text is not empty
238
+ all_text += text + " "
239
 
240
+ if not all_text.strip():
241
+ print("No OCR text found in the PDF images.")
242
+ return all_text.strip() # Strip any leading/trailing whitespace
243
 
244
  def extract_all_text_from_pdf(pdf_path):
245
  """Extract both direct text and OCR text from a PDF."""
246
  direct_text = extract_text_from_pdf(pdf_path)
247
  ocr_text = ocr_text_from_pdf(pdf_path)
248
+ all_text = direct_text + " " + ocr_text + " "
249
+ if not all_text.strip():
250
+ print("No text extracted from the PDF.")
251
+ return all_text.strip() # Strip any leading/trailing whitespace
252
+
253
+
254
+
255
 
256
 
257
 
 
650
  file = st.file_uploader("Upload PDF Files")
651
  if file is not None:
652
  try:
 
 
 
653
  text = extract_all_text_from_pdf(file)
 
 
654
  # text = get_pdf_text(file)
655
  except Exception as e:
656
  st.error(f"Error reading PDF file: {str(e)}")