加帆
committed on
Commit
·
f77c02e
1
Parent(s):
700dff9
Bug fix pdf parse index out of range (#440)
Browse files
### What problem does this PR solve?
Fix a bug that occurs when parsing some PDF files (#436).
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
deepdoc/parser/pdf_parser.py
CHANGED
@@ -830,10 +830,13 @@ class HuParser:
|
|
830 |
pn = [bx["page_number"]]
|
831 |
top = bx["top"] - self.page_cum_height[pn[0] - 1]
|
832 |
bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
|
833 |
-
|
|
|
834 |
while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
|
835 |
bott -= self.page_images[pn[-1] - 1].size[1] / ZM
|
836 |
pn.append(pn[-1] + 1)
|
|
|
|
|
837 |
|
838 |
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
|
839 |
.format("-".join([str(p) for p in pn]),
|
|
|
830 |
pn = [bx["page_number"]]
|
831 |
top = bx["top"] - self.page_cum_height[pn[0] - 1]
|
832 |
bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
|
833 |
+
page_images_cnt = len(self.page_images)
|
834 |
+
if pn[-1] - 1 >= page_images_cnt: return ""
|
835 |
while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
|
836 |
bott -= self.page_images[pn[-1] - 1].size[1] / ZM
|
837 |
pn.append(pn[-1] + 1)
|
838 |
+
if pn[-1] - 1 >= page_images_cnt:
|
839 |
+
return ""
|
840 |
|
841 |
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
|
842 |
.format("-".join([str(p) for p in pn]),
|