KevinHuSh
committed on
Commit
·
89444d3
1
Parent(s):
7d85666
fix github account login issue (#132)
Browse files- api/apps/user_app.py +2 -0
- deepdoc/parser/pdf_parser.py +1 -1
- rag/app/manual.py +1 -4
- rag/app/qa.py +9 -3
- rag/nlp/__init__.py +1 -0
- rag/nlp/query.py +1 -1
api/apps/user_app.py
CHANGED
@@ -106,7 +106,9 @@ def github_callback():
|
|
106 |
stat_logger.exception(e)
|
107 |
return redirect("/?error=%s"%str(e))
|
108 |
user = users[0]
|
|
|
109 |
login_user(user)
|
|
|
110 |
return redirect("/?auth=%s" % user.get_id())
|
111 |
|
112 |
|
|
|
106 |
stat_logger.exception(e)
|
107 |
return redirect("/?error=%s"%str(e))
|
108 |
user = users[0]
|
109 |
+
user.access_token = get_uuid()
|
110 |
login_user(user)
|
111 |
+
user.save()
|
112 |
return redirect("/?auth=%s" % user.get_id())
|
113 |
|
114 |
|
deepdoc/parser/pdf_parser.py
CHANGED
@@ -639,7 +639,7 @@ class HuParser:
|
|
639 |
mink = ""
|
640 |
minv = 1000000000
|
641 |
for k, bxs in tbls.items():
|
642 |
-
for b in bxs
|
643 |
if b.get("layout_type", "").find("caption") >= 0:
|
644 |
continue
|
645 |
y_dis = self._y_dis(c, b)
|
|
|
639 |
mink = ""
|
640 |
minv = 1000000000
|
641 |
for k, bxs in tbls.items():
|
642 |
+
for b in bxs:
|
643 |
if b.get("layout_type", "").find("caption") >= 0:
|
644 |
continue
|
645 |
y_dis = self._y_dis(c, b)
|
rag/app/manual.py
CHANGED
@@ -62,9 +62,6 @@ class Pdf(PdfParser):
|
|
62 |
for b in self.boxes:
|
63 |
b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
|
64 |
|
65 |
-
# merge chunks with the same bullets
|
66 |
-
self._merge_with_same_bullet()
|
67 |
-
|
68 |
# set pivot using the most frequent type of title,
|
69 |
# then merge between 2 pivot
|
70 |
bull = bullets_category([b["text"] for b in self.boxes])
|
@@ -79,7 +76,7 @@ class Pdf(PdfParser):
|
|
79 |
|
80 |
sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
|
81 |
for (img, rows), poss in tbls:
|
82 |
-
sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
|
83 |
|
84 |
chunks = []
|
85 |
last_sid = -2
|
|
|
62 |
for b in self.boxes:
|
63 |
b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
|
64 |
|
|
|
|
|
|
|
65 |
# set pivot using the most frequent type of title,
|
66 |
# then merge between 2 pivot
|
67 |
bull = bullets_category([b["text"] for b in self.boxes])
|
|
|
76 |
|
77 |
sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
|
78 |
for (img, rows), poss in tbls:
|
79 |
+
sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
|
80 |
|
81 |
chunks = []
|
82 |
last_sid = -2
|
rag/app/qa.py
CHANGED
@@ -11,6 +11,7 @@
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import re
|
|
|
14 |
from io import BytesIO
|
15 |
from nltk import word_tokenize
|
16 |
from openpyxl import load_workbook
|
@@ -93,12 +94,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
93 |
All the deformed lines will be ignored.
|
94 |
Every pair of Q&A will be treated as a chunk.
|
95 |
"""
|
|
|
96 |
res = []
|
|
|
|
|
|
|
|
|
97 |
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
98 |
callback(0.1, "Start to parse.")
|
99 |
excel_parser = Excel()
|
100 |
for q, a in excel_parser(filename, binary, callback):
|
101 |
-
res.append(beAdoc(
|
102 |
return res
|
103 |
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
|
104 |
callback(0.1, "Start to parse.")
|
@@ -113,14 +119,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
113 |
break
|
114 |
txt += l
|
115 |
lines = txt.split("\n")
|
116 |
-
|
117 |
fails = []
|
118 |
for i, line in enumerate(lines):
|
119 |
arr = [l for l in line.split("\t") if len(l) > 1]
|
120 |
if len(arr) != 2:
|
121 |
fails.append(str(i))
|
122 |
continue
|
123 |
-
res.append(beAdoc(
|
124 |
if len(res) % 999 == 0:
|
125 |
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
126 |
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import re
|
14 |
+
from copy import deepcopy
|
15 |
from io import BytesIO
|
16 |
from nltk import word_tokenize
|
17 |
from openpyxl import load_workbook
|
|
|
94 |
All the deformed lines will be ignored.
|
95 |
Every pair of Q&A will be treated as a chunk.
|
96 |
"""
|
97 |
+
eng = lang.lower() == "english"
|
98 |
res = []
|
99 |
+
doc = {
|
100 |
+
"docnm_kwd": filename,
|
101 |
+
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
102 |
+
}
|
103 |
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
104 |
callback(0.1, "Start to parse.")
|
105 |
excel_parser = Excel()
|
106 |
for q, a in excel_parser(filename, binary, callback):
|
107 |
+
res.append(beAdoc(deepcopy(doc), q, a, eng))
|
108 |
return res
|
109 |
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
|
110 |
callback(0.1, "Start to parse.")
|
|
|
119 |
break
|
120 |
txt += l
|
121 |
lines = txt.split("\n")
|
122 |
+
#is_english([rmPrefix(l) for l in lines[:100]])
|
123 |
fails = []
|
124 |
for i, line in enumerate(lines):
|
125 |
arr = [l for l in line.split("\t") if len(l) > 1]
|
126 |
if len(arr) != 2:
|
127 |
fails.append(str(i))
|
128 |
continue
|
129 |
+
res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
|
130 |
if len(res) % 999 == 0:
|
131 |
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
132 |
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
rag/nlp/__init__.py
CHANGED
@@ -76,6 +76,7 @@ def is_english(texts):
|
|
76 |
|
77 |
def tokenize(d, t, eng):
|
78 |
d["content_with_weight"] = t
|
|
|
79 |
if eng:
|
80 |
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
|
81 |
d["content_ltks"] = " ".join([stemmer.stem(w)
|
|
|
76 |
|
77 |
def tokenize(d, t, eng):
|
78 |
d["content_with_weight"] = t
|
79 |
+
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
80 |
if eng:
|
81 |
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
|
82 |
d["content_ltks"] = " ".join([stemmer.stem(w)
|
rag/nlp/query.py
CHANGED
@@ -29,7 +29,7 @@ class EsQueryer:
|
|
29 |
for t in arr:
|
30 |
if not re.match(r"[a-zA-Z]+$", t):
|
31 |
e += 1
|
32 |
-
return e * 1. / len(arr) >= 0.
|
33 |
|
34 |
@staticmethod
|
35 |
def rmWWW(txt):
|
|
|
29 |
for t in arr:
|
30 |
if not re.match(r"[a-zA-Z]+$", t):
|
31 |
e += 1
|
32 |
+
return e * 1. / len(arr) >= 0.7
|
33 |
|
34 |
@staticmethod
|
35 |
def rmWWW(txt):
|