Spaces:

salvinjose
/

HNTAI

Paused

Joyna-Joy committed on Jun 12

Commit

e0770c1

1 Parent(s): 095d042

Updated text_extractor

{
"error": "No valid text extracted from short_medical_report.pdf for NER processing."
}

Former-commit-id: 6f42972349e9dd85af5ba3741a1163cc739b6738

Files changed (1) hide show

ai_med_extract/agents/text_extractor.py +155 -155

ai_med_extract/agents/text_extractor.py CHANGED Viewed

@@ -1,183 +1,183 @@
-# import pdfplumber
-# import pytesseract
-# import cv2
-# import pandas as pd
-# from PIL import Image
-# from docx import Document
-# import tempfile
-# import os
-# import logging
-# class TextExtractorAgent:
-#     @staticmethod
-#     def extract_text(filepath, ext):
-#         try:
-#             if ext == "pdf":
-#                 return TextExtractorAgent.extract_text_from_pdf(filepath)
-#             elif ext in {"jpg", "jpeg", "png"}:
-#                 return TextExtractorAgent.extract_text_from_image(filepath)
-#             elif ext == "docx":
-#                 return TextExtractorAgent.extract_text_from_docx(filepath)
-#             elif ext in {"xlsx", "xls"}:
-#                 return TextExtractorAgent.extract_text_from_excel(filepath)
-#             return None
-#         except Exception as e:
-#             logging.error(f"Text extraction failed: {e}")
-#             return None
-#     @staticmethod
-#     def extract_text_from_pdf(filepath, password=None):
-#         text = ""
-#         with pdfplumber.open(filepath) as pdf:
-#             for page in pdf.pages:
-#                 page_text = page.extract_text()
-#                 if page_text:
-#                     text += page_text + "\n"
-#         return text.strip() or None
-#     @staticmethod
-#     def extract_text_from_image(filepath):
-#         image = cv2.imread(filepath)
-#         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-#         _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-#         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
-#             processed_path = temp_file.name
-#         cv2.imwrite(processed_path, processed)
-#         text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
-#         os.remove(processed_path)
-#         return text.strip() or None
-#     @staticmethod
-#     def extract_text_from_docx(filepath):
-#         doc = Document(filepath)
-#         text = "\n".join([para.text for para in doc.paragraphs])
-#         return text.strip() or None
-#     @staticmethod
-#     def extract_text_from_excel(filepath):
-#         dfs = pd.read_excel(filepath, sheet_name=None)
-#         text = "\n".join([
-#             "\n".join([
-#                 " ".join(map(str, df[col].dropna()))
-#                 for col in df.columns
-#             ])
-#             for df in dfs.values()
-#         ])
-#         return text.strip() or None
 import pytesseract
 import cv2
 from PIL import Image
 from docx import Document
-from PyPDF2 import PdfReader
-from pdf2image import convert_from_path
-from concurrent.futures import ThreadPoolExecutor
 import tempfile
 import os
 import logging
-import numpy as np
-logger = logging.getLogger(__name__)
 class TextExtractorAgent:
     @staticmethod
-    def extract_text(filepath, ext, password=None):
         try:
-            ext = ext.lower()
             if ext == "pdf":
-                return TextExtractorAgent.extract_text_from_pdf(filepath, password)
             elif ext in {"jpg", "jpeg", "png"}:
                 return TextExtractorAgent.extract_text_from_image(filepath)
             elif ext == "docx":
                 return TextExtractorAgent.extract_text_from_docx(filepath)
             return None
         except Exception as e:
-            logger.error(f"Text extraction failed: {e}")
             return None
     @staticmethod
-    def is_blurred(image_path, variance_threshold=150):
-        try:
-            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
-            if image is None:
-                logger.error(f"Unable to read image: {image_path}")
-                return True
-            laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
-            edges = cv2.Canny(image, 50, 150)
-            edge_density = np.mean(edges)
-            logger.info(f"Laplacian: {laplacian_var:.2f}, Edge Density: {edge_density:.2f}")
-            is_blurry = laplacian_var < variance_threshold and edge_density < 10
-            if is_blurry:
-                logger.warning(f"Image '{image_path}' flagged as blurry.")
-            return is_blurry
-        except Exception as e:
-            logger.exception(f"Error checking blur for '{image_path}': {e}")
-            return True
     @staticmethod
     def extract_text_from_image(filepath):
-        try:
-            if TextExtractorAgent.is_blurred(filepath):
-                logger.warning(f"OCR skipped: '{filepath}' is too blurry.")
-                return "Image is too blurry, OCR failed."
-            image = cv2.imread(filepath)
-            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-            gray = cv2.GaussianBlur(gray, (5, 5), 0)
-            gray = cv2.adaptiveThreshold(
-                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
-            )
-            gray = cv2.dilate(gray, np.ones((2, 2), np.uint8), iterations=1)
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
-                processed_path = temp_file.name
-            cv2.imwrite(processed_path, gray)
-            text = pytesseract.image_to_string(Image.open(processed_path), lang="eng").strip()
-            os.remove(processed_path)
-            if len(text.split()) < 5:
-                logger.warning(f"Too little OCR output from '{filepath}'.")
-                return "OCR failed to extract meaningful text."
-            return text
-        except Exception as e:
-            logger.exception(f"OCR failed for image '{filepath}': {e}")
-            return "Failed to extract text"
     @staticmethod
-    def extract_text_from_pdf(filepath, password=None):
-        try:
-            reader = PdfReader(filepath)
-            if reader.is_encrypted:
-                if not password:
-                    return {"error": "File is password-protected."}, 401
-                if reader.decrypt(password) == 0:
-                    return {"error": "Invalid password."}, 403
-            text = "\n".join([page.extract_text() or "" for page in reader.pages])
-            if text.strip():
-                return text.strip(), 200
-            logger.info("Falling back to OCR for PDF.")
-            images = convert_from_path(filepath)
-            with ThreadPoolExecutor(max_workers=5) as pool:
-                ocr_text = list(pool.map(lambda img: pytesseract.image_to_string(img, lang="eng"), images))
-            full_text = "\n".join(ocr_text).strip()
-            return (full_text, 200) if full_text else ("No text found", 415)
-        except Exception as e:
-            logger.exception(f"PDF processing error: {filepath}")
-            return "Failed to extract text"
     @staticmethod
-    def extract_text_from_docx(filepath):
-        try:
-            doc = Document(filepath)
-            text = "\n".join([para.text for para in doc.paragraphs])
-            return text.strip() or None
-        except Exception as e:
-            logger.exception(f"Failed to extract text from DOCX: {filepath}")
-            return None

+import pdfplumber
 import pytesseract
 import cv2
+import pandas as pd
 from PIL import Image
 from docx import Document
 import tempfile
 import os
 import logging
 class TextExtractorAgent:
     @staticmethod
+    def extract_text(filepath, ext):
         try:
             if ext == "pdf":
+                return TextExtractorAgent.extract_text_from_pdf(filepath)
             elif ext in {"jpg", "jpeg", "png"}:
                 return TextExtractorAgent.extract_text_from_image(filepath)
             elif ext == "docx":
                 return TextExtractorAgent.extract_text_from_docx(filepath)
+            elif ext in {"xlsx", "xls"}:
+                return TextExtractorAgent.extract_text_from_excel(filepath)
             return None
         except Exception as e:
+            logging.error(f"Text extraction failed: {e}")
             return None
     @staticmethod
+    def extract_text_from_pdf(filepath, password=None):
+        text = ""
+        with pdfplumber.open(filepath) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+        return text.strip() or None
     @staticmethod
     def extract_text_from_image(filepath):
+        image = cv2.imread(filepath)
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+            processed_path = temp_file.name
+        cv2.imwrite(processed_path, processed)
+        text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
+        os.remove(processed_path)
+        return text.strip() or None
     @staticmethod
+    def extract_text_from_docx(filepath):
+        doc = Document(filepath)
+        text = "\n".join([para.text for para in doc.paragraphs])
+        return text.strip() or None
     @staticmethod
+    def extract_text_from_excel(filepath):
+        dfs = pd.read_excel(filepath, sheet_name=None)
+        text = "\n".join([
+            "\n".join([
+                " ".join(map(str, df[col].dropna()))
+                for col in df.columns
+            ])
+            for df in dfs.values()
+        ])
+        return text.strip() or None
+# import pytesseract
+# import cv2
+# from PIL import Image
+# from docx import Document
+# from PyPDF2 import PdfReader
+# from pdf2image import convert_from_path
+# from concurrent.futures import ThreadPoolExecutor
+# import tempfile
+# import os
+# import logging
+# import numpy as np
+# logger = logging.getLogger(__name__)
+# class TextExtractorAgent:
+#     @staticmethod
+#     def extract_text(filepath, ext, password=None):
+#         try:
+#             ext = ext.lower()
+#             if ext == "pdf":
+#                 return TextExtractorAgent.extract_text_from_pdf(filepath, password)
+#             elif ext in {"jpg", "jpeg", "png"}:
+#                 return TextExtractorAgent.extract_text_from_image(filepath)
+#             elif ext == "docx":
+#                 return TextExtractorAgent.extract_text_from_docx(filepath)
+#             return None
+#         except Exception as e:
+#             logger.error(f"Text extraction failed: {e}")
+#             return None
+#     @staticmethod
+#     def is_blurred(image_path, variance_threshold=150):
+#         try:
+#             image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+#             if image is None:
+#                 logger.error(f"Unable to read image: {image_path}")
+#                 return True
+#             laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
+#             edges = cv2.Canny(image, 50, 150)
+#             edge_density = np.mean(edges)
+#             logger.info(f"Laplacian: {laplacian_var:.2f}, Edge Density: {edge_density:.2f}")
+#             is_blurry = laplacian_var < variance_threshold and edge_density < 10
+#             if is_blurry:
+#                 logger.warning(f"Image '{image_path}' flagged as blurry.")
+#             return is_blurry
+#         except Exception as e:
+#             logger.exception(f"Error checking blur for '{image_path}': {e}")
+#             return True
+#     @staticmethod
+#     def extract_text_from_image(filepath):
+#         try:
+#             if TextExtractorAgent.is_blurred(filepath):
+#                 logger.warning(f"OCR skipped: '{filepath}' is too blurry.")
+#                 return "Image is too blurry, OCR failed."
+#             image = cv2.imread(filepath)
+#             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+#             gray = cv2.GaussianBlur(gray, (5, 5), 0)
+#             gray = cv2.adaptiveThreshold(
+#                 gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
+#             )
+#             gray = cv2.dilate(gray, np.ones((2, 2), np.uint8), iterations=1)
+#             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+#                 processed_path = temp_file.name
+#             cv2.imwrite(processed_path, gray)
+#             text = pytesseract.image_to_string(Image.open(processed_path), lang="eng").strip()
+#             os.remove(processed_path)
+#             if len(text.split()) < 5:
+#                 logger.warning(f"Too little OCR output from '{filepath}'.")
+#                 return "OCR failed to extract meaningful text."
+#             return text
+#         except Exception as e:
+#             logger.exception(f"OCR failed for image '{filepath}': {e}")
+#             return "Failed to extract text"
+#     @staticmethod
+#     def extract_text_from_pdf(filepath, password=None):
+#         try:
+#             reader = PdfReader(filepath)
+#             if reader.is_encrypted:
+#                 if not password:
+#                     return {"error": "File is password-protected."}, 401
+#                 if reader.decrypt(password) == 0:
+#                     return {"error": "Invalid password."}, 403
+#             text = "\n".join([page.extract_text() or "" for page in reader.pages])
+#             if text.strip():
+#                 return text.strip(), 200
+#             logger.info("Falling back to OCR for PDF.")
+#             images = convert_from_path(filepath)
+#             with ThreadPoolExecutor(max_workers=5) as pool:
+#                 ocr_text = list(pool.map(lambda img: pytesseract.image_to_string(img, lang="eng"), images))
+#             full_text = "\n".join(ocr_text).strip()
+#             return (full_text, 200) if full_text else ("No text found", 415)
+#         except Exception as e:
+#             logger.exception(f"PDF processing error: {filepath}")
+#             return "Failed to extract text"
+#     @staticmethod
+#     def extract_text_from_docx(filepath):
+#         try:
+#             doc = Document(filepath)
+#             text = "\n".join([para.text for para in doc.paragraphs])
+#             return text.strip() or None
+#         except Exception as e:
+#             logger.exception(f"Failed to extract text from DOCX: {filepath}")
+#             return None