MCQGen

Sleeping

App Files Files Community

ValakiJay1706 committed on Jul 10, 2024

Commit

47c4d2b

verified ·

1 Parent(s): 2fce29d

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -14

app.py CHANGED Viewed

@@ -167,11 +167,14 @@ def extract_text_from_pdf(pdf_path):
         for page_index in range(len(pdf_file)):
             page = pdf_file.load_page(page_index)
-            text = page.get_text()
-            all_text += text + "\n"
         pdf_file.close()
-        return all_text
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
@@ -196,6 +199,8 @@ def extract_images_from_pdf(pdf_path):
                 images.append(image)
         pdf_file.close()
         return images
     except Exception as e:
@@ -212,9 +217,11 @@ def recognize_text(image):
         recognized_text = ""
         for (bbox, text, prob) in result:
             if prob > 0.2:
-                recognized_text += f'{text}\n'
-        return recognized_text
     except Exception as e:
         print(f"Error recognizing text from image: {e}")
@@ -227,15 +234,24 @@ def ocr_text_from_pdf(pdf_path):
     for image in images:
         text = recognize_text(image)
-        all_text += text
-    return all_text
 def extract_all_text_from_pdf(pdf_path):
     """Extract both direct text and OCR text from a PDF."""
     direct_text = extract_text_from_pdf(pdf_path)
     ocr_text = ocr_text_from_pdf(pdf_path)
-    return direct_text + "\n" + ocr_text
@@ -634,12 +650,7 @@ def main():
         file = st.file_uploader("Upload PDF Files")
         if file is not None:
             try:
-                # pdf_path = "path/to/your/pdf_file.pdf"
-                # Extract text from the PDF
                 text = extract_all_text_from_pdf(file)
-                # print(extracted_text)
                 # text = get_pdf_text(file)
             except Exception as e:
                 st.error(f"Error reading PDF file: {str(e)}")

         for page_index in range(len(pdf_file)):
             page = pdf_file.load_page(page_index)
+            text = page.get_text("text")
+            if text.strip():  # Check if the text is not empty
+                all_text += text.replace('\n', ' ') + " "
         pdf_file.close()
+        if not all_text.strip():
+            print("No direct text found in the PDF.")
+        return all_text.strip()  # Strip any leading/trailing whitespace
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
                 images.append(image)
         pdf_file.close()
+        if not images:
+            print("No images found in the PDF.")
         return images
     except Exception as e:
         recognized_text = ""
         for (bbox, text, prob) in result:
             if prob > 0.2:
+                recognized_text += f'{text} '
+        if not recognized_text.strip():
+            print("No text recognized from the image.")
+        return recognized_text.strip()  # Strip any leading/trailing whitespace
     except Exception as e:
         print(f"Error recognizing text from image: {e}")
     for image in images:
         text = recognize_text(image)
+        if text.strip():  # Check if the recognized text is not empty
+            all_text += text + " "
+    if not all_text.strip():
+        print("No OCR text found in the PDF images.")
+    return all_text.strip()  # Strip any leading/trailing whitespace
 def extract_all_text_from_pdf(pdf_path):
     """Extract both direct text and OCR text from a PDF."""
     direct_text = extract_text_from_pdf(pdf_path)
     ocr_text = ocr_text_from_pdf(pdf_path)
+    all_text = direct_text + " " + ocr_text + " "
+    if not all_text.strip():
+        print("No text extracted from the PDF.")
+    return all_text.strip()  # Strip any leading/trailing whitespace
         file = st.file_uploader("Upload PDF Files")
         if file is not None:
             try:
                 text = extract_all_text_from_pdf(file)
                 # text = get_pdf_text(file)
             except Exception as e:
                 st.error(f"Error reading PDF file: {str(e)}")