Joyna-Joy committed on
Commit
e0770c1
·
1 Parent(s): 095d042

Updated text_extractor

Browse files

{
"error": "No valid text extracted from short_medical_report.pdf for NER processing."
}


Former-commit-id: 6f42972349e9dd85af5ba3741a1163cc739b6738

Files changed (1) hide show
  1. ai_med_extract/agents/text_extractor.py +155 -155
ai_med_extract/agents/text_extractor.py CHANGED
@@ -1,183 +1,183 @@
1
- # import pdfplumber
2
- # import pytesseract
3
- # import cv2
4
- # import pandas as pd
5
- # from PIL import Image
6
- # from docx import Document
7
- # import tempfile
8
- # import os
9
- # import logging
10
-
11
- # class TextExtractorAgent:
12
- # @staticmethod
13
- # def extract_text(filepath, ext):
14
- # try:
15
- # if ext == "pdf":
16
- # return TextExtractorAgent.extract_text_from_pdf(filepath)
17
- # elif ext in {"jpg", "jpeg", "png"}:
18
- # return TextExtractorAgent.extract_text_from_image(filepath)
19
- # elif ext == "docx":
20
- # return TextExtractorAgent.extract_text_from_docx(filepath)
21
- # elif ext in {"xlsx", "xls"}:
22
- # return TextExtractorAgent.extract_text_from_excel(filepath)
23
- # return None
24
- # except Exception as e:
25
- # logging.error(f"Text extraction failed: {e}")
26
- # return None
27
-
28
- # @staticmethod
29
- # def extract_text_from_pdf(filepath, password=None):
30
- # text = ""
31
- # with pdfplumber.open(filepath) as pdf:
32
- # for page in pdf.pages:
33
- # page_text = page.extract_text()
34
- # if page_text:
35
- # text += page_text + "\n"
36
- # return text.strip() or None
37
-
38
- # @staticmethod
39
- # def extract_text_from_image(filepath):
40
- # image = cv2.imread(filepath)
41
- # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
42
- # _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
43
- # with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
44
- # processed_path = temp_file.name
45
- # cv2.imwrite(processed_path, processed)
46
- # text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
47
- # os.remove(processed_path)
48
- # return text.strip() or None
49
-
50
- # @staticmethod
51
- # def extract_text_from_docx(filepath):
52
- # doc = Document(filepath)
53
- # text = "\n".join([para.text for para in doc.paragraphs])
54
- # return text.strip() or None
55
-
56
- # @staticmethod
57
- # def extract_text_from_excel(filepath):
58
- # dfs = pd.read_excel(filepath, sheet_name=None)
59
- # text = "\n".join([
60
- # "\n".join([
61
- # " ".join(map(str, df[col].dropna()))
62
- # for col in df.columns
63
- # ])
64
- # for df in dfs.values()
65
- # ])
66
- # return text.strip() or None
67
-
68
  import pytesseract
69
  import cv2
 
70
  from PIL import Image
71
  from docx import Document
72
- from PyPDF2 import PdfReader
73
- from pdf2image import convert_from_path
74
- from concurrent.futures import ThreadPoolExecutor
75
  import tempfile
76
  import os
77
  import logging
78
- import numpy as np
79
-
80
- logger = logging.getLogger(__name__)
81
 
82
  class TextExtractorAgent:
83
  @staticmethod
84
- def extract_text(filepath, ext, password=None):
85
  try:
86
- ext = ext.lower()
87
  if ext == "pdf":
88
- return TextExtractorAgent.extract_text_from_pdf(filepath, password)
89
  elif ext in {"jpg", "jpeg", "png"}:
90
  return TextExtractorAgent.extract_text_from_image(filepath)
91
  elif ext == "docx":
92
  return TextExtractorAgent.extract_text_from_docx(filepath)
 
 
93
  return None
94
  except Exception as e:
95
- logger.error(f"Text extraction failed: {e}")
96
  return None
97
 
98
  @staticmethod
99
- def is_blurred(image_path, variance_threshold=150):
100
- try:
101
- image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
102
- if image is None:
103
- logger.error(f"Unable to read image: {image_path}")
104
- return True
105
-
106
- laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
107
- edges = cv2.Canny(image, 50, 150)
108
- edge_density = np.mean(edges)
109
-
110
- logger.info(f"Laplacian: {laplacian_var:.2f}, Edge Density: {edge_density:.2f}")
111
- is_blurry = laplacian_var < variance_threshold and edge_density < 10
112
-
113
- if is_blurry:
114
- logger.warning(f"Image '{image_path}' flagged as blurry.")
115
- return is_blurry
116
- except Exception as e:
117
- logger.exception(f"Error checking blur for '{image_path}': {e}")
118
- return True
119
 
120
  @staticmethod
121
  def extract_text_from_image(filepath):
122
- try:
123
- if TextExtractorAgent.is_blurred(filepath):
124
- logger.warning(f"OCR skipped: '{filepath}' is too blurry.")
125
- return "Image is too blurry, OCR failed."
126
-
127
- image = cv2.imread(filepath)
128
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
129
- gray = cv2.GaussianBlur(gray, (5, 5), 0)
130
- gray = cv2.adaptiveThreshold(
131
- gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
132
- )
133
- gray = cv2.dilate(gray, np.ones((2, 2), np.uint8), iterations=1)
134
-
135
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
136
- processed_path = temp_file.name
137
- cv2.imwrite(processed_path, gray)
138
-
139
- text = pytesseract.image_to_string(Image.open(processed_path), lang="eng").strip()
140
- os.remove(processed_path)
141
-
142
- if len(text.split()) < 5:
143
- logger.warning(f"Too little OCR output from '{filepath}'.")
144
- return "OCR failed to extract meaningful text."
145
-
146
- return text
147
- except Exception as e:
148
- logger.exception(f"OCR failed for image '{filepath}': {e}")
149
- return "Failed to extract text"
150
 
151
  @staticmethod
152
- def extract_text_from_pdf(filepath, password=None):
153
- try:
154
- reader = PdfReader(filepath)
155
- if reader.is_encrypted:
156
- if not password:
157
- return {"error": "File is password-protected."}, 401
158
- if reader.decrypt(password) == 0:
159
- return {"error": "Invalid password."}, 403
160
-
161
- text = "\n".join([page.extract_text() or "" for page in reader.pages])
162
- if text.strip():
163
- return text.strip(), 200
164
-
165
- logger.info("Falling back to OCR for PDF.")
166
- images = convert_from_path(filepath)
167
- with ThreadPoolExecutor(max_workers=5) as pool:
168
- ocr_text = list(pool.map(lambda img: pytesseract.image_to_string(img, lang="eng"), images))
169
- full_text = "\n".join(ocr_text).strip()
170
- return (full_text, 200) if full_text else ("No text found", 415)
171
- except Exception as e:
172
- logger.exception(f"PDF processing error: {filepath}")
173
- return "Failed to extract text"
174
 
175
  @staticmethod
176
- def extract_text_from_docx(filepath):
177
- try:
178
- doc = Document(filepath)
179
- text = "\n".join([para.text for para in doc.paragraphs])
180
- return text.strip() or None
181
- except Exception as e:
182
- logger.exception(f"Failed to extract text from DOCX: {filepath}")
183
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pytesseract
3
  import cv2
4
+ import pandas as pd
5
  from PIL import Image
6
  from docx import Document
 
 
 
7
  import tempfile
8
  import os
9
  import logging
 
 
 
10
 
11
  class TextExtractorAgent:
12
  @staticmethod
13
+ def extract_text(filepath, ext):
14
  try:
 
15
  if ext == "pdf":
16
+ return TextExtractorAgent.extract_text_from_pdf(filepath)
17
  elif ext in {"jpg", "jpeg", "png"}:
18
  return TextExtractorAgent.extract_text_from_image(filepath)
19
  elif ext == "docx":
20
  return TextExtractorAgent.extract_text_from_docx(filepath)
21
+ elif ext in {"xlsx", "xls"}:
22
+ return TextExtractorAgent.extract_text_from_excel(filepath)
23
  return None
24
  except Exception as e:
25
+ logging.error(f"Text extraction failed: {e}")
26
  return None
27
 
28
  @staticmethod
29
+ def extract_text_from_pdf(filepath, password=None):
30
+ text = ""
31
+ with pdfplumber.open(filepath) as pdf:
32
+ for page in pdf.pages:
33
+ page_text = page.extract_text()
34
+ if page_text:
35
+ text += page_text + "\n"
36
+ return text.strip() or None
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  @staticmethod
39
  def extract_text_from_image(filepath):
40
+ image = cv2.imread(filepath)
41
+ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
42
+ _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
43
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
44
+ processed_path = temp_file.name
45
+ cv2.imwrite(processed_path, processed)
46
+ text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
47
+ os.remove(processed_path)
48
+ return text.strip() or None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  @staticmethod
51
+ def extract_text_from_docx(filepath):
52
+ doc = Document(filepath)
53
+ text = "\n".join([para.text for para in doc.paragraphs])
54
+ return text.strip() or None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  @staticmethod
57
+ def extract_text_from_excel(filepath):
58
+ dfs = pd.read_excel(filepath, sheet_name=None)
59
+ text = "\n".join([
60
+ "\n".join([
61
+ " ".join(map(str, df[col].dropna()))
62
+ for col in df.columns
63
+ ])
64
+ for df in dfs.values()
65
+ ])
66
+ return text.strip() or None
67
+
68
+ # import pytesseract
69
+ # import cv2
70
+ # from PIL import Image
71
+ # from docx import Document
72
+ # from PyPDF2 import PdfReader
73
+ # from pdf2image import convert_from_path
74
+ # from concurrent.futures import ThreadPoolExecutor
75
+ # import tempfile
76
+ # import os
77
+ # import logging
78
+ # import numpy as np
79
+
80
+ # logger = logging.getLogger(__name__)
81
+
82
+ # class TextExtractorAgent:
83
+ # @staticmethod
84
+ # def extract_text(filepath, ext, password=None):
85
+ # try:
86
+ # ext = ext.lower()
87
+ # if ext == "pdf":
88
+ # return TextExtractorAgent.extract_text_from_pdf(filepath, password)
89
+ # elif ext in {"jpg", "jpeg", "png"}:
90
+ # return TextExtractorAgent.extract_text_from_image(filepath)
91
+ # elif ext == "docx":
92
+ # return TextExtractorAgent.extract_text_from_docx(filepath)
93
+ # return None
94
+ # except Exception as e:
95
+ # logger.error(f"Text extraction failed: {e}")
96
+ # return None
97
+
98
+ # @staticmethod
99
+ # def is_blurred(image_path, variance_threshold=150):
100
+ # try:
101
+ # image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
102
+ # if image is None:
103
+ # logger.error(f"Unable to read image: {image_path}")
104
+ # return True
105
+
106
+ # laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
107
+ # edges = cv2.Canny(image, 50, 150)
108
+ # edge_density = np.mean(edges)
109
+
110
+ # logger.info(f"Laplacian: {laplacian_var:.2f}, Edge Density: {edge_density:.2f}")
111
+ # is_blurry = laplacian_var < variance_threshold and edge_density < 10
112
+
113
+ # if is_blurry:
114
+ # logger.warning(f"Image '{image_path}' flagged as blurry.")
115
+ # return is_blurry
116
+ # except Exception as e:
117
+ # logger.exception(f"Error checking blur for '{image_path}': {e}")
118
+ # return True
119
+
120
+ # @staticmethod
121
+ # def extract_text_from_image(filepath):
122
+ # try:
123
+ # if TextExtractorAgent.is_blurred(filepath):
124
+ # logger.warning(f"OCR skipped: '{filepath}' is too blurry.")
125
+ # return "Image is too blurry, OCR failed."
126
+
127
+ # image = cv2.imread(filepath)
128
+ # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
129
+ # gray = cv2.GaussianBlur(gray, (5, 5), 0)
130
+ # gray = cv2.adaptiveThreshold(
131
+ # gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
132
+ # )
133
+ # gray = cv2.dilate(gray, np.ones((2, 2), np.uint8), iterations=1)
134
+
135
+ # with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
136
+ # processed_path = temp_file.name
137
+ # cv2.imwrite(processed_path, gray)
138
+
139
+ # text = pytesseract.image_to_string(Image.open(processed_path), lang="eng").strip()
140
+ # os.remove(processed_path)
141
+
142
+ # if len(text.split()) < 5:
143
+ # logger.warning(f"Too little OCR output from '{filepath}'.")
144
+ # return "OCR failed to extract meaningful text."
145
+
146
+ # return text
147
+ # except Exception as e:
148
+ # logger.exception(f"OCR failed for image '{filepath}': {e}")
149
+ # return "Failed to extract text"
150
+
151
+ # @staticmethod
152
+ # def extract_text_from_pdf(filepath, password=None):
153
+ # try:
154
+ # reader = PdfReader(filepath)
155
+ # if reader.is_encrypted:
156
+ # if not password:
157
+ # return {"error": "File is password-protected."}, 401
158
+ # if reader.decrypt(password) == 0:
159
+ # return {"error": "Invalid password."}, 403
160
+
161
+ # text = "\n".join([page.extract_text() or "" for page in reader.pages])
162
+ # if text.strip():
163
+ # return text.strip(), 200
164
+
165
+ # logger.info("Falling back to OCR for PDF.")
166
+ # images = convert_from_path(filepath)
167
+ # with ThreadPoolExecutor(max_workers=5) as pool:
168
+ # ocr_text = list(pool.map(lambda img: pytesseract.image_to_string(img, lang="eng"), images))
169
+ # full_text = "\n".join(ocr_text).strip()
170
+ # return (full_text, 200) if full_text else ("No text found", 415)
171
+ # except Exception as e:
172
+ # logger.exception(f"PDF processing error: {filepath}")
173
+ # return "Failed to extract text"
174
+
175
+ # @staticmethod
176
+ # def extract_text_from_docx(filepath):
177
+ # try:
178
+ # doc = Document(filepath)
179
+ # text = "\n".join([para.text for para in doc.paragraphs])
180
+ # return text.strip() or None
181
+ # except Exception as e:
182
+ # logger.exception(f"Failed to extract text from DOCX: {filepath}")
183
+ # return None