kuschzzp Kevin Hu committed on
Commit
ff43695
·
1 Parent(s): 5fb0114

Fix #3230: when a docx file is parsed with the Book parsing method, to_page is always -1, so the block count is 0 even if parsing succeeds (#3249)

Browse files

### What problem does this PR solve?

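When a docx file is parsed with the Book parsing method, `to_page` is
always -1, so the resulting block count is 0 even when parsing succeeds.
This PR raises the default `to_page` to 100000000 in both the `Task`
model and `RAGFlowDocxParser.__call__`.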

Fixes #3230

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: Kevin Hu <[email protected]>

api/db/db_models.py CHANGED
@@ -840,7 +840,7 @@ class Task(DataBaseModel):
840
  doc_id = CharField(max_length=32, null=False, index=True)
841
  from_page = IntegerField(default=0)
842
 
843
- to_page = IntegerField(default=-1)
844
 
845
  begin_at = DateTimeField(null=True, index=True)
846
  process_duation = FloatField(default=0)
 
840
  doc_id = CharField(max_length=32, null=False, index=True)
841
  from_page = IntegerField(default=0)
842
 
843
+ to_page = IntegerField(default=100000000)
844
 
845
  begin_at = DateTimeField(null=True, index=True)
846
  process_duation = FloatField(default=0)
deepdoc/parser/docx_parser.py CHANGED
@@ -110,7 +110,7 @@ class RAGFlowDocxParser:
110
  return lines
111
  return ["\n".join(lines)]
112
 
113
- def __call__(self, fnm, from_page=0, to_page=100000):
114
  self.doc = Document(fnm) if isinstance(
115
  fnm, str) else Document(BytesIO(fnm))
116
  pn = 0 # parsed page
@@ -130,7 +130,7 @@ class RAGFlowDocxParser:
130
  if 'lastRenderedPageBreak' in run._element.xml:
131
  pn += 1
132
 
133
- secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
134
 
135
  tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
136
  return secs, tbls
 
110
  return lines
111
  return ["\n".join(lines)]
112
 
113
+ def __call__(self, fnm, from_page=0, to_page=100000000):
114
  self.doc = Document(fnm) if isinstance(
115
  fnm, str) else Document(BytesIO(fnm))
116
  pn = 0 # parsed page
 
130
  if 'lastRenderedPageBreak' in run._element.xml:
131
  pn += 1
132
 
133
+ secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
134
 
135
  tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
136
  return secs, tbls