加帆 committed on
Commit
f77c02e
·
1 Parent(s): 700dff9

Bug fix: PDF parse index out of range (#440)

Browse files

### What problem does this PR solve?

Fix an index-out-of-range bug that occurs when parsing some PDF files (#436).

### Type of change

- [☑️ ] Bug Fix (non-breaking change which fixes an issue)

Files changed (1) hide show
  1. deepdoc/parser/pdf_parser.py +4 -1
deepdoc/parser/pdf_parser.py CHANGED
@@ -830,10 +830,13 @@ class HuParser:
830
  pn = [bx["page_number"]]
831
  top = bx["top"] - self.page_cum_height[pn[0] - 1]
832
  bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
833
- if pn[-1] - 1 >= len(self.page_images): return ""
 
834
  while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
835
  bott -= self.page_images[pn[-1] - 1].size[1] / ZM
836
  pn.append(pn[-1] + 1)
 
 
837
 
838
  return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
839
  .format("-".join([str(p) for p in pn]),
 
830
  pn = [bx["page_number"]]
831
  top = bx["top"] - self.page_cum_height[pn[0] - 1]
832
  bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
833
+ page_images_cnt = len(self.page_images)
834
+ if pn[-1] - 1 >= page_images_cnt: return ""
835
  while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
836
  bott -= self.page_images[pn[-1] - 1].size[1] / ZM
837
  pn.append(pn[-1] + 1)
838
+ if pn[-1] - 1 >= page_images_cnt:
839
+ return ""
840
 
841
  return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
842
  .format("-".join([str(p) for p in pn]),