KevinHuSh commited on
Commit
9aa975e
·
1 Parent(s): d923a42

fix exception raised by overly long queries (#1195)

Browse files

### What problem does this PR solve?

Resolves issue #1161.
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

deepdoc/parser/docx_parser.py CHANGED
@@ -113,19 +113,24 @@ class RAGFlowDocxParser:
113
  def __call__(self, fnm, from_page=0, to_page=100000):
114
  self.doc = Document(fnm) if isinstance(
115
  fnm, str) else Document(BytesIO(fnm))
116
- pn = 0
117
- secs = []
118
  for p in self.doc.paragraphs:
119
  if pn > to_page:
120
  break
121
- if from_page <= pn < to_page and p.text.strip():
122
- secs.append((p.text, p.style.name))
123
  for run in p.runs:
124
- if 'lastRenderedPageBreak' in run._element.xml:
125
- pn += 1
126
- continue
127
- if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
 
 
 
128
  pn += 1
129
 
 
 
130
  tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
131
  return secs, tbls
 
113
  def __call__(self, fnm, from_page=0, to_page=100000):
114
  self.doc = Document(fnm) if isinstance(
115
  fnm, str) else Document(BytesIO(fnm))
116
+ pn = 0 # parsed page
117
+ secs = [] # parsed contents
118
  for p in self.doc.paragraphs:
119
  if pn > to_page:
120
  break
121
+
122
+ runs_within_single_paragraph = [] # save runs within the range of pages
123
  for run in p.runs:
124
+ if pn > to_page:
125
+ break
126
+ if from_page <= pn < to_page and p.text.strip():
127
+ runs_within_single_paragraph.append(run.text) # append run.text first
128
+
129
+ # wrap page break checker into a static method
130
+ if RAGFlowDocxParser.has_page_break(run._element.xml):
131
  pn += 1
132
 
133
+ secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
134
+
135
  tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
136
  return secs, tbls
rag/app/qa.py CHANGED
@@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
145
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
146
  return d
147
 
 
148
  def mdQuestionLevel(s):
149
  match = re.match(r'#*', s)
150
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
@@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
244
  break
245
  txt += l
246
  lines = txt.split("\n")
247
- comma, tab = 0, 0
248
  last_question, last_answer = "", ""
249
  question_stack, level_stack = [], []
250
  code_block = False
@@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
262
  last_answer = f'{last_answer}\n{l}'
263
  else: # is a question
264
  if last_answer:
265
- sum_question = ('\n').join(question_stack)
266
  if sum_question:
267
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
268
  last_answer = ''
@@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
274
  question_stack.append(question)
275
  level_stack.append(question_level)
276
  if last_answer:
277
- sum_question = ('\n').join(question_stack)
278
  if sum_question:
279
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
280
  return res
281
 
282
-
283
  raise NotImplementedError(
284
  "Excel, csv(txt), pdf and markdown format files are supported.")
285
 
 
145
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
146
  return d
147
 
148
+
149
  def mdQuestionLevel(s):
150
  match = re.match(r'#*', s)
151
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
 
245
  break
246
  txt += l
247
  lines = txt.split("\n")
 
248
  last_question, last_answer = "", ""
249
  question_stack, level_stack = [], []
250
  code_block = False
 
262
  last_answer = f'{last_answer}\n{l}'
263
  else: # is a question
264
  if last_answer:
265
+ sum_question = '\n'.join(question_stack)
266
  if sum_question:
267
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
268
  last_answer = ''
 
274
  question_stack.append(question)
275
  level_stack.append(question_level)
276
  if last_answer:
277
+ sum_question = '\n'.join(question_stack)
278
  if sum_question:
279
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
280
  return res
281
 
 
282
  raise NotImplementedError(
283
  "Excel, csv(txt), pdf and markdown format files are supported.")
284
 
rag/nlp/query.py CHANGED
@@ -110,6 +110,7 @@ class EsQueryer:
110
  sm = []
111
 
112
  keywords.append(re.sub(r"[ \\\"']+", "", tk))
 
113
 
114
  tk_syns = self.syn.lookup(tk)
115
  tk = EsQueryer.subSpecialChar(tk)
 
110
  sm = []
111
 
112
  keywords.append(re.sub(r"[ \\\"']+", "", tk))
113
+ if len(keywords) >= 12: break
114
 
115
  tk_syns = self.syn.lookup(tk)
116
  tk = EsQueryer.subSpecialChar(tk)
rag/nlp/search.py CHANGED
@@ -98,7 +98,7 @@ class Dealer:
98
  if not qst:
99
  if not req.get("sort"):
100
  s = s.sort(
101
- {"create_time": {"order": "desc", "unmapped_type": "date"}},
102
  {"create_timestamp_flt": {
103
  "order": "desc", "unmapped_type": "float"}}
104
  )
@@ -108,7 +108,7 @@ class Dealer:
108
  "mode": "avg", "numeric_type": "double"}},
109
  {"top_int": {"order": "asc", "unmapped_type": "float",
110
  "mode": "avg", "numeric_type": "double"}},
111
- {"create_time": {"order": "desc", "unmapped_type": "date"}},
112
  {"create_timestamp_flt": {
113
  "order": "desc", "unmapped_type": "float"}}
114
  )
 
98
  if not qst:
99
  if not req.get("sort"):
100
  s = s.sort(
101
+ #{"create_time": {"order": "desc", "unmapped_type": "date"}},
102
  {"create_timestamp_flt": {
103
  "order": "desc", "unmapped_type": "float"}}
104
  )
 
108
  "mode": "avg", "numeric_type": "double"}},
109
  {"top_int": {"order": "asc", "unmapped_type": "float",
110
  "mode": "avg", "numeric_type": "double"}},
111
+ #{"create_time": {"order": "desc", "unmapped_type": "date"}},
112
  {"create_timestamp_flt": {
113
  "order": "desc", "unmapped_type": "float"}}
114
  )