H committed on
Commit
ece4f03
·
1 Parent(s): d389389

fix pdf_parser content confusion (#1458)

Browse files

### What problem does this PR solve?

#1407

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (1) hide show
  1. deepdoc/parser/pdf_parser.py +13 -0
deepdoc/parser/pdf_parser.py CHANGED
@@ -286,6 +286,19 @@ class RAGFlowPdfParser:
286
  self.mean_height[-1] / 3
287
  )
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  # merge chars in the same rect
290
  for c in Recognizer.sort_X_firstly(
291
  chars, self.mean_width[pagenum - 1] // 4):
 
286
  self.mean_height[-1] / 3
287
  )
288
 
289
+ # solve char content confusion
290
+ record_error_length = 0
291
+ for c in chars[0:128]:
292
+ ii = Recognizer.find_overlapped(c, bxs)
293
+ if ii is None:
294
+ continue
295
+ record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["bottom"] - c["bottom"] - c["top"]) / 2)
296
+
297
+ record_error_length = record_error_length / 128
298
+ for char in chars:
299
+ char["top"] -= record_error_length
300
+ char["bottom"] -= record_error_length
301
+
302
  # merge chars in the same rect
303
  for c in Recognizer.sort_X_firstly(
304
  chars, self.mean_width[pagenum - 1] // 4):