Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -167,11 +167,14 @@ def extract_text_from_pdf(pdf_path):
|
|
167 |
|
168 |
for page_index in range(len(pdf_file)):
|
169 |
page = pdf_file.load_page(page_index)
|
170 |
-
text = page.get_text()
|
171 |
-
|
|
|
172 |
|
173 |
pdf_file.close()
|
174 |
-
|
|
|
|
|
175 |
|
176 |
except Exception as e:
|
177 |
print(f"Error extracting text from PDF: {e}")
|
@@ -196,6 +199,8 @@ def extract_images_from_pdf(pdf_path):
|
|
196 |
images.append(image)
|
197 |
|
198 |
pdf_file.close()
|
|
|
|
|
199 |
return images
|
200 |
|
201 |
except Exception as e:
|
@@ -212,9 +217,11 @@ def recognize_text(image):
|
|
212 |
recognized_text = ""
|
213 |
for (bbox, text, prob) in result:
|
214 |
if prob > 0.2:
|
215 |
-
recognized_text += f'{text}
|
216 |
-
|
217 |
-
|
|
|
|
|
218 |
|
219 |
except Exception as e:
|
220 |
print(f"Error recognizing text from image: {e}")
|
@@ -227,15 +234,24 @@ def ocr_text_from_pdf(pdf_path):
|
|
227 |
|
228 |
for image in images:
|
229 |
text = recognize_text(image)
|
230 |
-
|
|
|
231 |
|
232 |
-
|
|
|
|
|
233 |
|
234 |
def extract_all_text_from_pdf(pdf_path):
|
235 |
"""Extract both direct text and OCR text from a PDF."""
|
236 |
direct_text = extract_text_from_pdf(pdf_path)
|
237 |
ocr_text = ocr_text_from_pdf(pdf_path)
|
238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
|
241 |
|
@@ -634,12 +650,7 @@ def main():
|
|
634 |
file = st.file_uploader("Upload PDF Files")
|
635 |
if file is not None:
|
636 |
try:
|
637 |
-
# pdf_path = "path/to/your/pdf_file.pdf"
|
638 |
-
|
639 |
-
# Extract text from the PDF
|
640 |
text = extract_all_text_from_pdf(file)
|
641 |
-
# print(extracted_text)
|
642 |
-
|
643 |
# text = get_pdf_text(file)
|
644 |
except Exception as e:
|
645 |
st.error(f"Error reading PDF file: {str(e)}")
|
|
|
167 |
|
168 |
for page_index in range(len(pdf_file)):
|
169 |
page = pdf_file.load_page(page_index)
|
170 |
+
text = page.get_text("text")
|
171 |
+
if text.strip(): # Check if the text is not empty
|
172 |
+
all_text += text.replace('\n', ' ') + " "
|
173 |
|
174 |
pdf_file.close()
|
175 |
+
if not all_text.strip():
|
176 |
+
print("No direct text found in the PDF.")
|
177 |
+
return all_text.strip() # Strip any leading/trailing whitespace
|
178 |
|
179 |
except Exception as e:
|
180 |
print(f"Error extracting text from PDF: {e}")
|
|
|
199 |
images.append(image)
|
200 |
|
201 |
pdf_file.close()
|
202 |
+
if not images:
|
203 |
+
print("No images found in the PDF.")
|
204 |
return images
|
205 |
|
206 |
except Exception as e:
|
|
|
217 |
recognized_text = ""
|
218 |
for (bbox, text, prob) in result:
|
219 |
if prob > 0.2:
|
220 |
+
recognized_text += f'{text} '
|
221 |
+
|
222 |
+
if not recognized_text.strip():
|
223 |
+
print("No text recognized from the image.")
|
224 |
+
return recognized_text.strip() # Strip any leading/trailing whitespace
|
225 |
|
226 |
except Exception as e:
|
227 |
print(f"Error recognizing text from image: {e}")
|
|
|
234 |
|
235 |
for image in images:
|
236 |
text = recognize_text(image)
|
237 |
+
if text.strip(): # Check if the recognized text is not empty
|
238 |
+
all_text += text + " "
|
239 |
|
240 |
+
if not all_text.strip():
|
241 |
+
print("No OCR text found in the PDF images.")
|
242 |
+
return all_text.strip() # Strip any leading/trailing whitespace
|
243 |
|
244 |
def extract_all_text_from_pdf(pdf_path):
    """Extract both direct text and OCR text from a PDF.

    Calls ``extract_text_from_pdf`` and ``ocr_text_from_pdf`` on the same
    input and joins whatever text they return with a single space.

    Args:
        pdf_path: The PDF to read; passed unchanged to both helpers.

    Returns:
        The combined text with leading/trailing whitespace removed, or an
        empty string when neither helper produced any text.
    """
    direct_text = extract_text_from_pdf(pdf_path)
    ocr_text = ocr_text_from_pdf(pdf_path)
    # extract_text_from_pdf prints and swallows extraction errors; what it
    # returns on that path is presumably None -- TODO confirm.  Skipping
    # empty/None parts keeps the other helper's text instead of raising
    # TypeError on ``None + " "``.
    all_text = " ".join(
        part for part in (direct_text, ocr_text) if part
    ).strip()
    if not all_text:
        print("No text extracted from the PDF.")
    return all_text
|
252 |
+
|
253 |
+
|
254 |
+
|
255 |
|
256 |
|
257 |
|
|
|
650 |
file = st.file_uploader("Upload PDF Files")
|
651 |
if file is not None:
|
652 |
try:
|
|
|
|
|
|
|
653 |
text = extract_all_text_from_pdf(file)
|
|
|
|
|
654 |
# text = get_pdf_text(file)
|
655 |
except Exception as e:
|
656 |
st.error(f"Error reading PDF file: {str(e)}")
|