KevinHuSh commited on
Commit
9aa975e
·
1 Parent(s): d923a42

fix exception raised by overly long queries (#1195)

Browse files

### What problem does this PR solve?

Resolves issue #1161.
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

deepdoc/parser/docx_parser.py CHANGED
@@ -113,19 +113,24 @@ class RAGFlowDocxParser:
113
  def __call__(self, fnm, from_page=0, to_page=100000):
114
  self.doc = Document(fnm) if isinstance(
115
  fnm, str) else Document(BytesIO(fnm))
116
- pn = 0
117
- secs = []
118
  for p in self.doc.paragraphs:
119
  if pn > to_page:
120
  break
121
- if from_page <= pn < to_page and p.text.strip():
122
- secs.append((p.text, p.style.name))
123
  for run in p.runs:
124
- if 'lastRenderedPageBreak' in run._element.xml:
125
- pn += 1
126
- continue
127
- if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
 
 
 
128
  pn += 1
129
 
 
 
130
  tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
131
  return secs, tbls
 
113
  def __call__(self, fnm, from_page=0, to_page=100000):
114
  self.doc = Document(fnm) if isinstance(
115
  fnm, str) else Document(BytesIO(fnm))
116
+ pn = 0 # parsed page
117
+ secs = [] # parsed contents
118
  for p in self.doc.paragraphs:
119
  if pn > to_page:
120
  break
121
+
122
+ runs_within_single_paragraph = [] # save runs within the range of pages
123
  for run in p.runs:
124
+ if pn > to_page:
125
+ break
126
+ if from_page <= pn < to_page and p.text.strip():
127
+ runs_within_single_paragraph.append(run.text) # append run.text first
128
+
129
+ # wrap page break checker into a static method
130
+ if RAGFlowDocxParser.has_page_break(run._element.xml):
131
  pn += 1
132
 
133
+ secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
134
+
135
  tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
136
  return secs, tbls
rag/app/qa.py CHANGED
@@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
145
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
146
  return d
147
 
 
148
  def mdQuestionLevel(s):
149
  match = re.match(r'#*', s)
150
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
@@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
244
  break
245
  txt += l
246
  lines = txt.split("\n")
247
- comma, tab = 0, 0
248
  last_question, last_answer = "", ""
249
  question_stack, level_stack = [], []
250
  code_block = False
@@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
262
  last_answer = f'{last_answer}\n{l}'
263
  else: # is a question
264
  if last_answer:
265
- sum_question = ('\n').join(question_stack)
266
  if sum_question:
267
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
268
  last_answer = ''
@@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
274
  question_stack.append(question)
275
  level_stack.append(question_level)
276
  if last_answer:
277
- sum_question = ('\n').join(question_stack)
278
  if sum_question:
279
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
280
  return res
281
 
282
-
283
  raise NotImplementedError(
284
  "Excel, csv(txt), pdf and markdown format files are supported.")
285
 
 
145
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
146
  return d
147
 
148
+
149
  def mdQuestionLevel(s):
150
  match = re.match(r'#*', s)
151
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
 
245
  break
246
  txt += l
247
  lines = txt.split("\n")
 
248
  last_question, last_answer = "", ""
249
  question_stack, level_stack = [], []
250
  code_block = False
 
262
  last_answer = f'{last_answer}\n{l}'
263
  else: # is a question
264
  if last_answer:
265
+ sum_question = '\n'.join(question_stack)
266
  if sum_question:
267
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
268
  last_answer = ''
 
274
  question_stack.append(question)
275
  level_stack.append(question_level)
276
  if last_answer:
277
+ sum_question = '\n'.join(question_stack)
278
  if sum_question:
279
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
280
  return res
281
 
 
282
  raise NotImplementedError(
283
  "Excel, csv(txt), pdf and markdown format files are supported.")
284
 
rag/nlp/query.py CHANGED
@@ -110,6 +110,7 @@ class EsQueryer:
110
  sm = []
111
 
112
  keywords.append(re.sub(r"[ \\\"']+", "", tk))
 
113
 
114
  tk_syns = self.syn.lookup(tk)
115
  tk = EsQueryer.subSpecialChar(tk)
 
110
  sm = []
111
 
112
  keywords.append(re.sub(r"[ \\\"']+", "", tk))
113
+ if len(keywords) >= 12: break
114
 
115
  tk_syns = self.syn.lookup(tk)
116
  tk = EsQueryer.subSpecialChar(tk)
rag/nlp/search.py CHANGED
@@ -98,7 +98,7 @@ class Dealer:
98
  if not qst:
99
  if not req.get("sort"):
100
  s = s.sort(
101
- {"create_time": {"order": "desc", "unmapped_type": "date"}},
102
  {"create_timestamp_flt": {
103
  "order": "desc", "unmapped_type": "float"}}
104
  )
@@ -108,7 +108,7 @@ class Dealer:
108
  "mode": "avg", "numeric_type": "double"}},
109
  {"top_int": {"order": "asc", "unmapped_type": "float",
110
  "mode": "avg", "numeric_type": "double"}},
111
- {"create_time": {"order": "desc", "unmapped_type": "date"}},
112
  {"create_timestamp_flt": {
113
  "order": "desc", "unmapped_type": "float"}}
114
  )
 
98
  if not qst:
99
  if not req.get("sort"):
100
  s = s.sort(
101
+ #{"create_time": {"order": "desc", "unmapped_type": "date"}},
102
  {"create_timestamp_flt": {
103
  "order": "desc", "unmapped_type": "float"}}
104
  )
 
108
  "mode": "avg", "numeric_type": "double"}},
109
  {"top_int": {"order": "asc", "unmapped_type": "float",
110
  "mode": "avg", "numeric_type": "double"}},
111
+ #{"create_time": {"order": "desc", "unmapped_type": "date"}},
112
  {"create_timestamp_flt": {
113
  "order": "desc", "unmapped_type": "float"}}
114
  )