KevinHuSh committed
Commit · 9aa975e · 1 Parent(s): d923a42

fix too long query exception (#1195)
### What problem does this PR solve?

#1161

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Files changed:

- deepdoc/parser/docx_parser.py (+13 -8)
- rag/app/qa.py (+3 -4)
- rag/nlp/query.py (+1 -0)
- rag/nlp/search.py (+2 -2)
deepdoc/parser/docx_parser.py CHANGED

```diff
@@ -113,19 +113,24 @@ class RAGFlowDocxParser:
     def __call__(self, fnm, from_page=0, to_page=100000):
         self.doc = Document(fnm) if isinstance(
             fnm, str) else Document(BytesIO(fnm))
-        pn = 0
-        secs = []
+        pn = 0  # parsed page
+        secs = []  # parsed contents
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                secs.append((p.text, p.style.name))
+
+            runs_within_single_paragraph = []  # save runs within the range of pages
             for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
-                    pn += 1
-                    continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if pn > to_page:
+                    break
+                if from_page <= pn < to_page and p.text.strip():
+                    runs_within_single_paragraph.append(run.text)  # append run.text first
+
+                # wrap page break checker into a static method
+                if RAGFlowDocxParser.has_page_break(run._element.xml):
                     pn += 1
 
+            secs.append(("".join(runs_within_single_paragraph), p.style.name))  # then concat run.text as part of the paragraph
+
         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
         return secs, tbls
```
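The refactor above collects `run.text` fragments per paragraph and bumps the page counter through a new `RAGFlowDocxParser.has_page_break` static method that is not part of this hunk. A minimal sketch of what such a checker could look like, assuming it simply scans the run's raw XML (`run._element.xml`, as used in the diff) for Word's page-break markers; the marker strings here are an assumption, not taken from this commit:

```python
# Hypothetical stand-in for the has_page_break static method referenced above.
# Word serializes an explicit page break as <w:br w:type="page"/> and a layout
# hint as <w:lastRenderedPageBreak/>; either one means a new page starts here.
def has_page_break(run_xml: str) -> bool:
    return "lastRenderedPageBreak" in run_xml or 'w:br w:type="page"' in run_xml


# Usage against a python-docx run (run._element.xml yields the raw XML string):
# if has_page_break(run._element.xml):
#     pn += 1
```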
rag/app/qa.py CHANGED

```diff
@@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     return d
 
+
 def mdQuestionLevel(s):
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
@@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                         break
                     txt += l
         lines = txt.split("\n")
-        comma, tab = 0, 0
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
         code_block = False
@@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 last_answer = f'{last_answer}\n{l}'
             else:  # is a question
                 if last_answer:
-                    sum_question =
+                    sum_question = '\n'.join(question_stack)
                     if sum_question:
                         res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
                     last_answer = ''
@@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 question_stack.append(question)
                 level_stack.append(question_level)
         if last_answer:
-            sum_question =
+            sum_question = '\n'.join(question_stack)
             if sum_question:
                 res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
         return res
 
-
     raise NotImplementedError(
         "Excel, csv(txt), pdf and markdown format files are supported.")
 
```
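The two `'\n'.join(question_stack)` lines are the substantive qa.py change: the full markdown heading path accumulated on `question_stack` becomes the question text handed to `beAdoc`. An illustrative snippet (not repository code) of that stack-to-question step:

```python
# Illustration of how a markdown heading path collapses into one question string.
question_stack = []
for heading in ["# Install", "## Docker", "### GPU image"]:  # sample headings
    question_stack.append(heading.lstrip('#').lstrip())

sum_question = '\n'.join(question_stack)
print(sum_question)
# Install
# Docker
# GPU image
```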
rag/nlp/query.py CHANGED

```diff
@@ -110,6 +110,7 @@ class EsQueryer:
                 sm = []
 
                 keywords.append(re.sub(r"[ \\\"']+", "", tk))
+                if len(keywords) >= 12: break
 
                 tk_syns = self.syn.lookup(tk)
                 tk = EsQueryer.subSpecialChar(tk)
```
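This one-line cap is the fix named in the commit title: keyword extraction stops at 12 terms, so a very long question can no longer expand into an oversized Elasticsearch query string. A standalone sketch of the same idea; the names `collect_keywords` and `MAX_KEYWORDS` are illustrative, not from the repository:

```python
import re

MAX_KEYWORDS = 12  # mirrors the `>= 12` check added in EsQueryer


def collect_keywords(tokens):
    """Clean tokens and stop once the keyword budget is reached."""
    keywords = []
    for tk in tokens:
        keywords.append(re.sub(r"[ \\\"']+", "", tk))
        if len(keywords) >= MAX_KEYWORDS:
            break  # bound the size of the resulting full-text query
    return keywords


print(len(collect_keywords([f"term{i}" for i in range(40)])))  # -> 12
```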
rag/nlp/search.py CHANGED

```diff
@@ -98,7 +98,7 @@ class Dealer:
         if not qst:
             if not req.get("sort"):
                 s = s.sort(
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
@@ -108,7 +108,7 @@ class Dealer:
                         "mode": "avg", "numeric_type": "double"}},
                     {"top_int": {"order": "asc", "unmapped_type": "float",
                                  "mode": "avg", "numeric_type": "double"}},
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
```
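Both hunks only comment out the `create_time` sort clause, leaving `create_timestamp_flt` as the sole recency criterion. A hedged sketch of the resulting sort built with elasticsearch_dsl; the index name is illustrative, only the field names come from the diff:

```python
from elasticsearch_dsl import Search

s = Search(index="ragflow_chunks")  # illustrative index name
s = s.sort(
    # {"create_time": {"order": "desc", "unmapped_type": "date"}},  # disabled by this commit
    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}},
)
print(s.to_dict()["sort"])  # only the float timestamp clause remains
```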