H
Kevin Hu
committed on
Commit
·
dda4c86
1
Parent(s):
971f83c
Fix docx parser line bug (#1715)
Browse files
### What problem does this PR solve?
#1704
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
---------
Co-authored-by: Kevin Hu <[email protected]>
- deepdoc/parser/pdf_parser.py +1 -1
- rag/app/naive.py +4 -1
deepdoc/parser/pdf_parser.py
CHANGED
@@ -952,7 +952,7 @@ class RAGFlowPdfParser:
|
|
952 |
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
953 |
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
954 |
enumerate(self.pdf.pages[page_from:page_to])]
|
955 |
-
self.page_chars = [[{**c, 'top':
|
956 |
self.pdf.pages[page_from:page_to]]
|
957 |
self.total_page = len(self.pdf.pages)
|
958 |
except Exception as e:
|
|
|
952 |
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
953 |
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
954 |
enumerate(self.pdf.pages[page_from:page_to])]
|
955 |
+
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
|
956 |
self.pdf.pages[page_from:page_to]]
|
957 |
self.total_page = len(self.pdf.pages)
|
958 |
except Exception as e:
|
rag/app/naive.py
CHANGED
@@ -23,6 +23,8 @@ from rag.utils import num_tokens_from_string
|
|
23 |
from PIL import Image
|
24 |
from functools import reduce
|
25 |
from markdown import markdown
|
|
|
|
|
26 |
class Docx(DocxParser):
|
27 |
def __init__(self):
|
28 |
pass
|
@@ -81,7 +83,8 @@ class Docx(DocxParser):
|
|
81 |
continue
|
82 |
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
|
83 |
pn += 1
|
84 |
-
new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
|
|
|
85 |
tbls = []
|
86 |
for tb in self.doc.tables:
|
87 |
html= "<table>"
|
|
|
23 |
from PIL import Image
|
24 |
from functools import reduce
|
25 |
from markdown import markdown
|
26 |
+
|
27 |
+
|
28 |
class Docx(DocxParser):
|
29 |
def __init__(self):
|
30 |
pass
|
|
|
83 |
continue
|
84 |
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
|
85 |
pn += 1
|
86 |
+
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
|
87 |
+
|
88 |
tbls = []
|
89 |
for tb in self.doc.tables:
|
90 |
html= "<table>"
|