KevinHuSh committed
Commit 328b4c9 · Parent(s): 31c7dca

fix plainPdf bugs (#152)
api/apps/conversation_app.py CHANGED
@@ -183,9 +183,7 @@ def chat(dialog, messages, **kwargs):
     ## try to use sql if field mapping is good to go
     if field_map:
         chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
-        markdown_tbl, chunks = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
-        if markdown_tbl:
-            return {"answer": markdown_tbl, "reference": {"chunks": chunks, "doc_aggs": []}}
+        return use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
 
     prompt_config = dialog.prompt_config
     for p in prompt_config["parameters"]:
@@ -311,7 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]
 
     # compose markdown table
-    clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
+    clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|")
     line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
     rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
     if not docid_idx or not docnm_idx:
@@ -322,4 +320,8 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
-    return "\n".join([clmns, line, rows]), [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]]
+    return {
+        "answer": "\n".join([clmns, line, rows]),
+        "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
+                      "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
+    }
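
Note: with `use_sql` now building the whole response, the SQL branch of `chat` returns directly instead of checking the markdown table and assembling references itself. A minimal sketch of the new response shape (the helper name and inputs are illustrative, not from the repo):

    # Sketch: the answer is a markdown table; the reference block carries one
    # chunk and one doc aggregate per source row, as in the new use_sql.
    def make_table_answer(headers, rows, docs):
        # headers: list[str]; rows: list[list[str]];
        # docs: list of (doc_id, doc_name) tuples aligned with rows
        clmns = "|" + "|".join(headers) + "|"
        line = "|" + "|".join("------" for _ in headers) + "|"
        body = ["|" + "|".join(r) + "|" for r in rows]
        return {
            "answer": "\n".join([clmns, line] + body),
            "reference": {
                "chunks": [{"doc_id": i, "docnm_kwd": n} for i, n in docs],
                "doc_aggs": [{"doc_id": i, "doc_name": n, "count": 1} for i, n in docs],
            },
        }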
deepdoc/parser/pdf_parser.py CHANGED
@@ -996,7 +996,7 @@ class HuParser:
             if need_position: return None, None
             return
 
-        max_width = np.max([right - left for (_, left, right, _, _) in poss])
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
         poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
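
Note: the only change here is a floor on the crop width. A degenerate set of position boxes, as a plain-text parse can produce, may have coinciding left and right edges, and a zero `max_width` would yield an empty crop downstream. A tiny sketch of the guard (reading the floor of 6 as a minimum pixel width, matching the `GAP` constant, is my interpretation):

    import numpy as np

    # Sketch: never let the computed crop width collapse to zero.
    def safe_max_width(poss, floor=6):
        # poss: (page_numbers, left, right, top, bottom) tuples
        return max(np.max([right - left for (_, left, right, _, _) in poss]), floor)

    print(safe_max_width([([1], 10.0, 10.0, 0.0, 5.0)]))  # -> 6, not 0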
deepdoc/vision/ocr.py CHANGED
@@ -14,9 +14,6 @@
 import copy
 import time
 import os
-
-from huggingface_hub import snapshot_download
-
 from .operators import *
 import numpy as np
 import onnxruntime as ort
@@ -24,7 +21,6 @@ import onnxruntime as ort
 from .postprocess import build_post_process
 from rag.settings import cron_logger
 
-
 def transform(data, ops=None):
     """ transform """
     if ops is None:
@@ -82,7 +78,7 @@ class TextRecognizer(object):
         self.rec_batch_num = 16
         postprocess_params = {
             'name': 'CTCLabelDecode',
-            "character_dict_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "ocr.res"),
+            "character_dict_path": os.path.join(model_dir, "ocr.res"),
             "use_space_char": True
         }
         self.postprocess_op = build_post_process(postprocess_params)
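
Note: the now-unused `snapshot_download` import is dropped, and the CTC character dictionary is read from the same directory as the model weights rather than from the installed package. A sketch of the lookup, assuming `model_dir` is the directory handed to `TextRecognizer` (its exact plumbing isn't shown in this hunk):

    import os

    # Sketch: resolve the recognizer's character dictionary next to its
    # weights. "ocr.res" is named in the diff; the guard is illustrative.
    def character_dict_path(model_dir: str) -> str:
        path = os.path.join(model_dir, "ocr.res")
        if not os.path.exists(path):
            raise FileNotFoundError(f"missing character dict: {path}")
        return path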
deepdoc/vision/table_structure_recognizer.py CHANGED
@@ -16,6 +16,7 @@ import re
 from collections import Counter
 
 import numpy as np
+from huggingface_hub import snapshot_download
 
 from api.utils.file_utils import get_project_base_directory
 from rag.nlp import huqie
@@ -33,7 +34,8 @@ class TableStructureRecognizer(Recognizer):
     ]
 
     def __init__(self):
-        super().__init__(self.labels, "tsr",os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
+        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+        super().__init__(self.labels, "tsr", model_dir)
 
     def __call__(self, images, thr=0.2):
         tbls = super().__call__(images, thr)
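
Note: the table-structure model now comes from the Hugging Face Hub instead of a path inside the project tree. `snapshot_download` caches under the standard Hugging Face cache directory, so only the first construction pays the download cost. A sketch of the pattern (the wrapper function is mine; the repo id is from the diff):

    from huggingface_hub import snapshot_download

    # Sketch: the first call downloads and caches; later calls resolve locally.
    def deepdoc_model_dir() -> str:
        return snapshot_download(repo_id="InfiniFlow/deepdoc")

    model_dir = deepdoc_model_dir()  # handed to the Recognizer base class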
rag/app/laws.py CHANGED
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
 
         callback(0.8, "Text extraction finished")
 
-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
+        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         for txt, poss in pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback):
+                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
            sections.append(txt + poss)
 
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
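
Note: `Pdf.__call__` now returns a `(sections, tables)` pair, matching `PlainParser`, so every call site can uniformly index `[0]` whichever parser was chosen; the same one-line indexing fix appears in rag/app/paper.py below. A sketch of the convention (the class bodies are illustrative stand-ins for the repo's Pdf/PlainParser):

    # Sketch: both parser flavors share one return shape, so callers can
    # take [0] without caring which was instantiated.
    class LayoutPdf:
        def __call__(self, fnm, **kwargs):
            return [("line of text", "position tag")], []   # (sections, tables)

    class PlainPdf:
        def __call__(self, fnm, **kwargs):
            return [("line of text", "")], None              # no tables in plain mode

    for parser in (LayoutPdf(), PlainPdf()):
        sections = parser("a.pdf")[0]  # uniform access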
rag/app/paper.py CHANGED
@@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             "title": filename,
             "authors": " ",
             "abstract": "",
-            "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page),
+            "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
             "tables": []
         }
     else:
rag/app/presentation.py CHANGED
@@ -66,7 +66,7 @@ class Pdf(PdfParser):
 
 class PlainPdf(PlainParser):
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
-        self.pdf = pdf2_read(filename if not binary else BytesIO(filename))
+        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
         page_txt = []
         for page in self.pdf.pages[from_page: to_page]:
             page_txt.append(page.extract_text())
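
Note: this is the headline plainPdf bug. When the document arrives as in-memory bytes, the old code wrapped the *filename* in `BytesIO`, so the reader tried to parse a path string as PDF data; rag/app/resume.py below carries a companion fix for `binary` now being raw bytes. A sketch of the corrected dual-input pattern, using pypdf's `PdfReader` (which accepts a path or a binary stream; `pdf2_read` in the repo looks like an alias for such a reader):

    from io import BytesIO
    from pypdf import PdfReader

    # Sketch: accept either a filesystem path or raw bytes.
    def open_pdf(filename, binary=None):
        return PdfReader(filename if not binary else BytesIO(binary))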
rag/app/resume.py CHANGED
@@ -40,7 +40,7 @@ def remote_call(filename, binary):
             "encrypt_type": "base64",
             "filename": filename,
             "langtype": '',
-            "fileori": base64.b64encode(binary.stream.read()).decode('utf-8')
+            "fileori": base64.b64encode(binary).decode('utf-8')
         },
         "c": "resume_parse_module",
         "m": "resume_parse"
rag/llm/embedding_model.py CHANGED
@@ -20,10 +20,10 @@ from openai import OpenAI
 from FlagEmbedding import FlagModel
 import torch
 import numpy as np
-
+from huggingface_hub import snapshot_download
 from rag.utils import num_tokens_from_string
 
-flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
+flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
                        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                        use_fp16=torch.cuda.is_available())
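Note: passing a bare repo id let `FlagModel` trigger its own download logic; resolving the id through `snapshot_download(..., local_files_only=True)` hands it a local path, so weights pre-fetched elsewhere are used without touching the network at import time. The flag only *resolves* from cache; a never-downloaded model raises instead, which surfaces the problem at startup rather than mid-request. A sketch:

    from huggingface_hub import snapshot_download

    # Sketch: resolve a cached snapshot offline; fail fast if it is absent.
    try:
        path = snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True)
    except Exception as e:  # LocalEntryNotFoundError in practice
        raise RuntimeError("embedding model not downloaded yet") from e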
 
rag/nlp/query.py CHANGED
@@ -53,7 +53,7 @@ class EsQueryer:
 
         if not self.isChinese(txt):
             tks = huqie.qie(txt).split(" ")
-            q = tks
+            q = copy.deepcopy(tks)
             for i in range(1, len(tks)):
                 q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
             if not q:
@@ -138,7 +138,7 @@ class EsQueryer:
 
         def toDict(tks):
             d = {}
-            if isinstance(tks, type("")):
+            if isinstance(tks, str):
                 tks = tks.split(" ")
             for t, c in self.tw.weights(tks):
                 if t not in d:
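
Note: two small but real fixes here. `isinstance(tks, str)` replaces the awkward `type("")` idiom, and `q = copy.deepcopy(tks)` breaks an aliasing bug: appending boosted bigram phrases to `q` silently grew `tks` too, since both names bound the same list. A runnable illustration (a shallow `list(tks)` would also suffice for a list of strings; the commit uses `deepcopy`):

    import copy

    tks = ["quick", "brown", "fox"]
    q = tks                      # buggy: same list object
    q.append('"quick brown"^2')
    assert len(tks) == 4         # source tokens polluted

    tks = ["quick", "brown", "fox"]
    q = copy.deepcopy(tks)       # fixed: independent copy
    q.append('"quick brown"^2')
    assert len(tks) == 3         # source tokens intact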
rag/nlp/search.py CHANGED
@@ -234,13 +234,13 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
 
-        chunks_tks = [huqie.qie(ck).split(" ") for ck in chunks]
+        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
         cites = {}
         for i, a in enumerate(pieces_):
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             huqie.qie(
-                                                                pieces_[i]).split(" "),
+                                                                self.qryr.rmWWW(pieces_[i])).split(" "),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
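
Note: both sides of the citation-matching similarity now pass through `self.qryr.rmWWW` before tokenizing; the name suggests it strips question/filler words so they cannot inflate token overlap between an answer piece and a chunk. A sketch of the idea (the stop list and similarity here are illustrative, not the repo's implementation):

    import re

    STOP = {"what", "who", "where", "when", "how", "is", "the", "a"}

    def rm_www(text):
        return " ".join(w for w in re.split(r"\W+", text.lower())
                        if w and w not in STOP)

    def token_sim(a, b):
        ta, tb = set(rm_www(a).split()), set(rm_www(b).split())
        return len(ta & tb) / max(1, len(ta | tb))

    print(token_sim("What is vector search?", "vector search finds neighbors"))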
rag/nlp/term_weight.py CHANGED
@@ -150,9 +150,10 @@ class Dealer:
             return 6
 
         def ner(t):
+            if re.match(r"[0-9,.]{2,}$", t): return 2
+            if re.match(r"[a-z]{1,2}$", t): return 0.01
             if not self.ne or t not in self.ne:
                 return 1
-            if re.match(r"[0-9,.]+$", t): return 2
             m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
                  "firstnm": 1}
             return m[self.ne[t]]
@@ -170,11 +171,11 @@ class Dealer:
             return 1
 
         def freq(t):
-            if re.match(r"[0-9\. -]+$", t):
-                return 10000
+            if re.match(r"[0-9. -]{2,}$", t):
+                return 3
             s = huqie.freq(t)
-            if not s and re.match(r"[a-z\. -]+$", t):
-                return 10
+            if not s and re.match(r"[a-z. -]+$", t):
+                return 300
             if not s:
                 s = 0
 
@@ -188,12 +189,12 @@ class Dealer:
             return max(s, 10)
 
         def df(t):
-            if re.match(r"[0-9\. -]+$", t):
-                return 100000
+            if re.match(r"[0-9. -]{2,}$", t):
+                return 5
             if t in self.df:
                 return self.df[t] + 3
-            elif re.match(r"[a-z\. -]+$", t):
-                return 3
+            elif re.match(r"[a-z. -]+$", t):
+                return 300
             elif len(t) >= 4:
                 s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
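
Note: the rebalanced constants invert how purely numeric and purely Latin tokens are treated. Numbers drop from `df` 100000 to 5 (rare, hence heavily weighted in an idf-style score) while generic a-z tokens rise to 300 (common, hence damped), and `ner` now short-circuits on those patterns before the named-entity lookup. A sketch of why a smaller `df` means a heavier term (the log form and corpus size are illustrative, not the repo's exact formula):

    import math

    N = 1_000_000  # illustrative corpus size

    def idf(df):
        return math.log10(N / (df + 1))

    print(round(idf(5), 2))    # numeric token after the commit: ~5.22
    print(round(idf(300), 2))  # plain a-z token: ~3.52, weighted less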
rag/svr/task_broker.py CHANGED
@@ -87,7 +87,9 @@ def dispatch():
         if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
         if r["parser_id"] == "one": page_size = 1000000000
         if not do_layout: page_size = 1000000000
-        for s,e in r["parser_config"].get("pages", [(1, 100000)]):
+        page_ranges = r["parser_config"].get("pages")
+        if not page_ranges: page_ranges = [(1, 100000)]
+        for s,e in page_ranges:
            s -= 1
            s = max(0, s)
            e = min(e-1, pages)
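
Note: the root cause is that `dict.get(key, default)` applies the default only when the key is *missing*. A `parser_config` persisted with `"pages": None` (or `[]`) returns that falsy value, and the tuple-unpacking loop then crashes; hence the explicit falsy check. In miniature (the `or` idiom below is an equivalent one-liner, not the commit's exact form):

    cfg = {"pages": None}                      # key present, value falsy

    broken = cfg.get("pages", [(1, 100000)])   # -> None; `for s, e in ...` raises
    fixed = cfg.get("pages") or [(1, 100000)]  # -> [(1, 100000)]

    for s, e in fixed:
        print(s, e)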