H committed on
Commit
1164cba
·
1 Parent(s): ece4f03

fix pdf_parser char content confusion (#1462)

Browse files

### What problem does this PR solve?

#1407

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (1) hide show
  1. deepdoc/parser/pdf_parser.py +4 -3
deepdoc/parser/pdf_parser.py CHANGED
@@ -287,14 +287,15 @@ class RAGFlowPdfParser:
287
  )
288
 
289
  # solve char content confusion
290
- record_error_length = 0
291
  for c in chars[0:128]:
292
  ii = Recognizer.find_overlapped(c, bxs)
293
  if ii is None:
294
  continue
295
- record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["bottom"] - c["bottom"] - c["top"]) / 2)
 
296
 
297
- record_error_length = record_error_length / 128
298
  for char in chars:
299
  char["top"] -= record_error_length
300
  char["bottom"] -= record_error_length
 
287
  )
288
 
289
  # solve char content confusion
290
+ record_error_length, ct = 0, 1
291
  for c in chars[0:128]:
292
  ii = Recognizer.find_overlapped(c, bxs)
293
  if ii is None:
294
  continue
295
+ record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["top"] - c["bottom"] - c["top"]) / 2)
296
+ ct += 1
297
 
298
+ record_error_length = record_error_length / ct
299
  for char in chars:
300
  char["top"] -= record_error_length
301
  char["bottom"] -= record_error_length