KevinHuSh committed
Commit 328b4c9 · Parent(s): 31c7dca

fix plainPdf bugs (#152)

Files changed:
- api/apps/conversation_app.py +7 -5
- deepdoc/parser/pdf_parser.py +1 -1
- deepdoc/vision/ocr.py +1 -5
- deepdoc/vision/table_structure_recognizer.py +3 -1
- rag/app/laws.py +2 -2
- rag/app/paper.py +1 -1
- rag/app/presentation.py +1 -1
- rag/app/resume.py +1 -1
- rag/llm/embedding_model.py +2 -2
- rag/nlp/query.py +2 -2
- rag/nlp/search.py +2 -2
- rag/nlp/term_weight.py +10 -9
- rag/svr/task_broker.py +3 -1
api/apps/conversation_app.py
CHANGED

@@ -183,9 +183,7 @@ def chat(dialog, messages, **kwargs):
     ## try to use sql if field mapping is good to go
     if field_map:
         chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
-        markdown_tbl, chunks = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
-        if markdown_tbl:
-            return {"answer": markdown_tbl, "reference": {"chunks": chunks, "doc_aggs": []}}
+        return use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
 
     prompt_config = dialog.prompt_config
     for p in prompt_config["parameters"]:

@@ -311,7 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]
 
     # compose markdown table
-    clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("
+    clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|")
     line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
     rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
     if not docid_idx or not docnm_idx:

@@ -322,4 +320,8 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
-    return
+    return {
+        "answer": "\n".join([clmns, line, rows]),
+        "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
+                      "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
+    }
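With this change, `use_sql` becomes the single exit point for the SQL retrieval path: it returns the finished chat payload (`answer` plus `reference`) instead of a `(markdown, chunks)` pair for the caller to assemble. A minimal, self-contained sketch of the markdown-table composition it performs, using hypothetical sample data and a stand-in for `rmSpace`:

```python
import re

def rm_space(txt):
    # stand-in for rag.utils.rmSpace used in the diff above
    return re.sub(r"[ \t]+", " ", str(txt)).strip()

# hypothetical SQL result in the same shape as tbl in use_sql
tbl = {"columns": [{"name": "title"}, {"name": "price"}],
       "rows": [["Widget", 3], ["Gadget", None]]}
clmn_idx = range(len(tbl["columns"]))

clmns = "|" + "|".join(tbl["columns"][i]["name"] for i in clmn_idx) + "|"
line = "|" + "|".join("------" for _ in clmn_idx) + "|"
rows = "\n".join(
    "|" + "|".join(rm_space(r[i]) for i in clmn_idx).replace("None", " ") + "|"
    for r in tbl["rows"])
print("\n".join([clmns, line, rows]))
# |title|price|
# |------|------|
# |Widget|3|
# |Gadget| |
```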
deepdoc/parser/pdf_parser.py
CHANGED

@@ -996,7 +996,7 @@ class HuParser:
             if need_position: return None, None
             return
 
-        max_width = np.max([right - left for (_, left, right, _, _) in poss])
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
         poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
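The only change here gives `max_width` a floor of 6 (the `GAP` constant), so a set of zero-width boxes can no longer yield a degenerate crop region. A small sketch of the failure mode, with a hypothetical `poss` entry:

```python
import numpy as np

# poss entries follow the (page_numbers, left, right, top, bottom) layout
# used above; this one is a hypothetical zero-width box
poss = [([0], 10.0, 10.0, 5.0, 8.0)]

old = np.max([right - left for (_, left, right, _, _) in poss])          # 0.0
new = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)  # 6
assert old == 0.0 and new == 6
```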
deepdoc/vision/ocr.py
CHANGED

@@ -14,9 +14,6 @@
 import copy
 import time
 import os
-
-from huggingface_hub import snapshot_download
-
 from .operators import *
 import numpy as np
 import onnxruntime as ort

@@ -24,7 +21,6 @@ import onnxruntime as ort
 from .postprocess import build_post_process
 from rag.settings import cron_logger
 
-
 def transform(data, ops=None):
     """ transform """
     if ops is None:

@@ -82,7 +78,7 @@ class TextRecognizer(object):
         self.rec_batch_num = 16
         postprocess_params = {
             'name': 'CTCLabelDecode',
-            "character_dict_path": os.path.join(
+            "character_dict_path": os.path.join(model_dir, "ocr.res"),
             "use_space_char": True
         }
         self.postprocess_op = build_post_process(postprocess_params)
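With `snapshot_download` dropped from this module, `TextRecognizer` now expects its caller to resolve `model_dir` and derives the dictionary path from it. A sketch of that wiring, reusing the `InfiniFlow/deepdoc` repo id that appears in table_structure_recognizer.py below (the exact call site is an assumption):

```python
import os
from huggingface_hub import snapshot_download

# resolve the model directory once, outside the OCR module...
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
# ...then the recognizer only needs to join paths against it
character_dict_path = os.path.join(model_dir, "ocr.res")
print(character_dict_path)
```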
deepdoc/vision/table_structure_recognizer.py
CHANGED

@@ -16,6 +16,7 @@ import re
 from collections import Counter
 
 import numpy as np
+from huggingface_hub import snapshot_download
 
 from api.utils.file_utils import get_project_base_directory
 from rag.nlp import huqie

@@ -33,7 +34,8 @@ class TableStructureRecognizer(Recognizer):
     ]
 
     def __init__(self):
-        super().__init__(self.labels, "tsr", os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
+        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+        super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
 
     def __call__(self, images, thr=0.2):
         tbls = super().__call__(images, thr)
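Moving model resolution into `__init__` works because `snapshot_download` is effectively idempotent: the first call populates the local Hugging Face cache, and later calls just return the cached directory, so repeated construction stays cheap after the first run. A quick sketch:

```python
from huggingface_hub import snapshot_download

# second call is a cache hit and returns the same local directory
a = snapshot_download(repo_id="InfiniFlow/deepdoc")
b = snapshot_download(repo_id="InfiniFlow/deepdoc")
assert a == b
```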
rag/app/laws.py
CHANGED

@@ -68,7 +68,7 @@ class Pdf(PdfParser):
 
         callback(0.8, "Text extraction finished")
 
-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
+        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):

@@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         for txt, poss in pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback):
+                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
             sections.append(txt + poss)
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
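Both hunks serve one convention: a PDF parser call now yields a `(sections, tables)` tuple, so the layout-aware `Pdf` and `PlainParser` become interchangeable and call sites uniformly take `[0]`. A toy sketch of the convention with stand-in parser classes:

```python
# stand-ins for Pdf and PlainParser; only the return shape matters here
class LayoutPdf:
    def __call__(self, fnm, **kwargs):
        return [("some text", "position tag")], None   # (sections, tables)

class PlainPdfStub:
    def __call__(self, fnm, **kwargs):
        return [("some text", "")], None

for pdf_parser in (LayoutPdf(), PlainPdfStub()):
    sections = pdf_parser("doc.pdf")[0]   # callers index [0], as in chunk()
    assert all(isinstance(t, tuple) for t in sections)
```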
rag/app/paper.py
CHANGED

@@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             "title": filename,
             "authors": " ",
             "abstract": "",
-            "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page),
+            "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
             "tables": []
         }
     else:
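Same `(sections, tables)` convention as in laws.py; here indexing with `[0]` keeps the parser call inline inside the dict literal, where tuple unpacking would need a separate statement. Both forms, sketched with a stand-in parser:

```python
def pdf_parser(fnm, **kwargs):            # stand-in with the new return shape
    return [("section text", "tag")], []

article = {"sections": pdf_parser("x.pdf")[0]}   # inline, as in the diff
sections, tables = pdf_parser("x.pdf")           # unpacked alternative
assert article["sections"] == sections
```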
rag/app/presentation.py
CHANGED

@@ -66,7 +66,7 @@ class Pdf(PdfParser):
 
 class PlainPdf(PlainParser):
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
-        self.pdf = pdf2_read(filename if not binary else BytesIO(
+        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
         page_txt = []
         for page in self.pdf.pages[from_page: to_page]:
             page_txt.append(page.extract_text())
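This is the core plainPdf bug of the commit title: when the document arrives as in-memory bytes, the bytes themselves must be wrapped in `BytesIO`, not passed by name. A sketch, using `pypdf`'s `PdfReader` as an assumed stand-in for the `pdf2_read` alias:

```python
from io import BytesIO
from pypdf import PdfReader as pdf2_read  # stand-in for ragflow's pdf2_read alias

def open_pdf(filename, binary=None):
    # the fix: wrap the *bytes* in BytesIO when binary content is supplied
    return pdf2_read(filename if not binary else BytesIO(binary))
```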
rag/app/resume.py
CHANGED

@@ -40,7 +40,7 @@ def remote_call(filename, binary):
             "encrypt_type": "base64",
             "filename": filename,
             "langtype": '',
-            "fileori": base64.b64encode(binary
+            "fileori": base64.b64encode(binary).decode('utf-8')
         },
         "c": "resume_parse_module",
         "m": "resume_parse"
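`base64.b64encode` returns `bytes`, which JSON serialization of the remote-call payload cannot handle; decoding to `str` is what makes the field transportable. For example:

```python
import base64, json

binary = b"%PDF-1.4 ..."                          # hypothetical file content
payload = {"fileori": base64.b64encode(binary).decode('utf-8')}
json.dumps(payload)                               # fine: value is str
# json.dumps({"fileori": base64.b64encode(binary)})  # TypeError: bytes
```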
rag/llm/embedding_model.py
CHANGED

@@ -20,10 +20,10 @@ from openai import OpenAI
 from FlagEmbedding import FlagModel
 import torch
 import numpy as np
-
+from huggingface_hub import snapshot_download
 from rag.utils import num_tokens_from_string
 
-flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
+flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
                        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                        use_fp16=torch.cuda.is_available())
 
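Passing `local_files_only=True` makes this import-time model load resolve strictly from the local Hugging Face cache: it returns the cached snapshot path if present and raises instead of silently downloading while the module is imported. A sketch of the behavior (the fallback shown is illustrative, not part of this commit):

```python
from huggingface_hub import snapshot_download

try:
    model_path = snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True)
except Exception:
    # cache miss: let FlagModel resolve the hub id itself (illustrative only)
    model_path = "BAAI/bge-large-zh-v1.5"
```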
rag/nlp/query.py
CHANGED

@@ -53,7 +53,7 @@ class EsQueryer:
 
         if not self.isChinese(txt):
             tks = huqie.qie(txt).split(" ")
-            q = tks
+            q = copy.deepcopy(tks)
            for i in range(1, len(tks)):
                 q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
             if not q:

@@ -138,7 +138,7 @@ class EsQueryer:
 
         def toDict(tks):
             d = {}
-            if isinstance(tks,
+            if isinstance(tks, str):
                 tks = tks.split(" ")
             for t, c in self.tw.weights(tks):
                 if t not in d:
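The `q = tks` fix addresses list aliasing: `q` and `tks` were the same object, so appending boosted bigram phrases to `q` silently grew `tks` as well. `copy.deepcopy` decouples them (a shallow `list(tks)` would also do for a flat list of strings):

```python
import copy

tks = ["hello", "world"]
q = tks                        # aliasing: same list object
q.append('"hello world"^2')
assert len(tks) == 3           # tks mutated too -- the bug

tks = ["hello", "world"]
q = copy.deepcopy(tks)         # the fix: q grows independently
q.append('"hello world"^2')
assert len(tks) == 2
```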
rag/nlp/search.py
CHANGED

@@ -234,13 +234,13 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
 
-        chunks_tks = [huqie.qie(ck).split(" ") for ck in chunks]
+        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
         cites = {}
         for i, a in enumerate(pieces_):
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             huqie.qie(
-                                                                pieces_[i]).split(" "),
+                                                                self.qryr.rmWWW(pieces_[i])).split(" "),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
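Both sides of the citation similarity are now passed through `self.qryr.rmWWW` before tokenization, so question-style filler words no longer inflate token overlap. A rough sketch of what such a filter does (the pattern is illustrative; `EsQueryer.rmWWW` has its own word lists):

```python
import re

def rm_www(txt):
    # illustrative stand-in: drop common interrogatives before matching
    return re.sub(r"\b(what|who|which|where|when|why|how|is|the|of)\b",
                  " ", txt, flags=re.IGNORECASE).strip()

print(rm_www("What is the revenue of Apple"))   # -> "revenue   Apple" (modulo spaces)
```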
rag/nlp/term_weight.py
CHANGED

@@ -150,9 +150,10 @@ class Dealer:
             return 6
 
         def ner(t):
+            if re.match(r"[0-9,.]{2,}$", t): return 2
+            if re.match(r"[a-z]{1,2}$", t): return 0.01
             if not self.ne or t not in self.ne:
                 return 1
-            if re.match(r"[0-9,.]+$", t): return 2
             m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
                  "firstnm": 1}
             return m[self.ne[t]]

@@ -170,11 +171,11 @@ class Dealer:
             return 1
 
         def freq(t):
-            if re.match(r"[0-9
-                return
+            if re.match(r"[0-9. -]{2,}$", t):
+                return 3
             s = huqie.freq(t)
-            if not s and re.match(r"[a-z
-                return
+            if not s and re.match(r"[a-z. -]+$", t):
+                return 300
             if not s:
                 s = 0
 

@@ -188,12 +189,12 @@ class Dealer:
             return max(s, 10)
 
         def df(t):
-            if re.match(r"[0-9
-                return
+            if re.match(r"[0-9. -]{2,}$", t):
+                return 5
             if t in self.df:
                 return self.df[t] + 3
-            elif re.match(r"[a-z
-                return
+            elif re.match(r"[a-z. -]+$", t):
+                return 300
             elif len(t) >= 4:
                 s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
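The tightened patterns change weighting at the margins: a lone digit no longer counts as a number, one- or two-letter Latin fragments are strongly down-weighted, and purely numeric or purely alphabetic tokens get fixed frequency/df scores instead of falling through. Since `re.match` anchors only at the start, the trailing `$` is what makes these full-token tests:

```python
import re

assert not re.match(r"[0-9,.]{2,}$", "7")      # single digit: no ner boost
assert re.match(r"[0-9,.]{2,}$", "3.14")       # real number: ner weight 2
assert re.match(r"[a-z]{1,2}$", "ab")          # short fragment: weight 0.01
assert not re.match(r"[a-z]{1,2}$", "abc")     # 3+ letters: falls through
assert re.match(r"[a-z. -]+$", "u.s. army")    # alpha token: freq/df 300
```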
rag/svr/task_broker.py
CHANGED

@@ -87,7 +87,9 @@ def dispatch():
             if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
             if r["parser_id"] == "one": page_size = 1000000000
             if not do_layout: page_size = 1000000000
-            for s,e in r["parser_config"].get("pages", [(1, 100000)]):
+            page_ranges = r["parser_config"].get("pages")
+            if not page_ranges: page_ranges = [(1, 100000)]
+            for s,e in page_ranges:
                 s -= 1
                 s = max(0, s)
                 e = min(e-1, pages)
|