Spaces:

retopara
/

ragflow

Build error

App Files Files Community

Kevin Hu committed on Oct 21, 2024

Commit

7b6220c

1 Parent(s): e758781

bigger resolution for OCR (#2919)

Browse files

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement

Files changed (1) hide show

deepdoc/parser/pdf_parser.py +5 -3

deepdoc/parser/pdf_parser.py CHANGED Viewed

@@ -957,6 +957,8 @@ class RAGFlowPdfParser:
                 fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                 enumerate(self.pdf.pages[page_from:page_to])]
             self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
                                self.pdf.pages[page_from:page_to]]
             self.total_page = len(self.pdf.pages)
@@ -992,7 +994,7 @@ class RAGFlowPdfParser:
             self.is_english = False
         st = timer()
-        for i, img in enumerate(self.page_images):
             chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
                 np.median(sorted([c["height"] for c in chars])) if chars else 0
@@ -1000,7 +1002,7 @@ class RAGFlowPdfParser:
             self.mean_width.append(
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
-            self.page_cum_height.append(img.size[1] / zoomin)
             j = 0
             while j + 1 < len(chars):
                 if chars[j]["text"] and chars[j + 1]["text"] \
@@ -1010,7 +1012,7 @@ class RAGFlowPdfParser:
                     chars[j]["text"] += " "
                 j += 1
-            self.__ocr(i + 1, img, chars, zoomin)
             if callback and i % 6 == 5:
                 callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
         # print("OCR:", timer()-st)

                 fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                 enumerate(self.pdf.pages[page_from:page_to])]
+            self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
+                                enumerate(self.pdf.pages[page_from:page_to])]
             self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
                                self.pdf.pages[page_from:page_to]]
             self.total_page = len(self.pdf.pages)
             self.is_english = False
         st = timer()
+        for i, img in enumerate(self.page_images_x2):
             chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
                 np.median(sorted([c["height"] for c in chars])) if chars else 0
             self.mean_width.append(
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
+            self.page_cum_height.append(img.size[1] / zoomin/2)
             j = 0
             while j + 1 < len(chars):
                 if chars[j]["text"] and chars[j + 1]["text"] \
                     chars[j]["text"] += " "
                 j += 1
+            self.__ocr(i + 1, img, chars, zoomin*2)
             if callback and i % 6 == 5:
                 callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
         # print("OCR:", timer()-st)