Kevin Hu committed on
Commit
7b6220c
·
1 Parent(s): e758781

bigger resolution for OCR (#2919)

Browse files

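### What problem does this PR solve?

OCR now runs on page images rendered at twice the previous resolution (`72 * zoomin * 2` instead of `72 * zoomin`); the per-page height computation and the zoom factor passed to the OCR step are scaled by 2 to match.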

### Type of change

- [x] Performance Improvement

Files changed (1) hide show
  1. deepdoc/parser/pdf_parser.py +5 -3
deepdoc/parser/pdf_parser.py CHANGED
@@ -957,6 +957,8 @@ class RAGFlowPdfParser:
957
  fnm, str) else pdfplumber.open(BytesIO(fnm))
958
  self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
959
  enumerate(self.pdf.pages[page_from:page_to])]
 
 
960
  self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
961
  self.pdf.pages[page_from:page_to]]
962
  self.total_page = len(self.pdf.pages)
@@ -992,7 +994,7 @@ class RAGFlowPdfParser:
992
  self.is_english = False
993
 
994
  st = timer()
995
- for i, img in enumerate(self.page_images):
996
  chars = self.page_chars[i] if not self.is_english else []
997
  self.mean_height.append(
998
  np.median(sorted([c["height"] for c in chars])) if chars else 0
@@ -1000,7 +1002,7 @@ class RAGFlowPdfParser:
1000
  self.mean_width.append(
1001
  np.median(sorted([c["width"] for c in chars])) if chars else 8
1002
  )
1003
- self.page_cum_height.append(img.size[1] / zoomin)
1004
  j = 0
1005
  while j + 1 < len(chars):
1006
  if chars[j]["text"] and chars[j + 1]["text"] \
@@ -1010,7 +1012,7 @@ class RAGFlowPdfParser:
1010
  chars[j]["text"] += " "
1011
  j += 1
1012
 
1013
- self.__ocr(i + 1, img, chars, zoomin)
1014
  if callback and i % 6 == 5:
1015
  callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
1016
  # print("OCR:", timer()-st)
 
957
  fnm, str) else pdfplumber.open(BytesIO(fnm))
958
  self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
959
  enumerate(self.pdf.pages[page_from:page_to])]
960
+ self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
961
+ enumerate(self.pdf.pages[page_from:page_to])]
962
  self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
963
  self.pdf.pages[page_from:page_to]]
964
  self.total_page = len(self.pdf.pages)
 
994
  self.is_english = False
995
 
996
  st = timer()
997
+ for i, img in enumerate(self.page_images_x2):
998
  chars = self.page_chars[i] if not self.is_english else []
999
  self.mean_height.append(
1000
  np.median(sorted([c["height"] for c in chars])) if chars else 0
 
1002
  self.mean_width.append(
1003
  np.median(sorted([c["width"] for c in chars])) if chars else 8
1004
  )
1005
+ self.page_cum_height.append(img.size[1] / zoomin/2)
1006
  j = 0
1007
  while j + 1 < len(chars):
1008
  if chars[j]["text"] and chars[j + 1]["text"] \
 
1012
  chars[j]["text"] += " "
1013
  j += 1
1014
 
1015
+ self.__ocr(i + 1, img, chars, zoomin*2)
1016
  if callback and i % 6 == 5:
1017
  callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
1018
  # print("OCR:", timer()-st)