KevinHuSh committed
Commit 328b4c9 · Parent(s): 31c7dca

fix plainPdf bugs (#152)
api/apps/conversation_app.py CHANGED
@@ -183,9 +183,7 @@ def chat(dialog, messages, **kwargs):
     ## try to use sql if field mapping is good to go
     if field_map:
         chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
-        markdown_tbl, chunks = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
-        if markdown_tbl:
-            return {"answer": markdown_tbl, "reference": {"chunks": chunks, "doc_aggs": []}}
+        return use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
 
     prompt_config = dialog.prompt_config
     for p in prompt_config["parameters"]:
@@ -311,7 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]
 
     # compose markdown table
-    clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
+    clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|")
     line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
     rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
     if not docid_idx or not docnm_idx:
@@ -322,4 +320,8 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
-    return "\n".join([clmns, line, rows]), [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]]
+    return {
+        "answer": "\n".join([clmns, line, rows]),
+        "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
+                      "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
+    }
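
Note: with `use_sql` now building the whole response, the SQL branch of `chat` returns directly instead of checking the markdown table and assembling references itself. A minimal sketch of the new response shape (the helper name and inputs are illustrative, not from the repo):

    # Sketch: the answer is a markdown table; the reference block carries one
    # chunk and one doc aggregate per source row, as in the new use_sql.
    def make_table_answer(headers, rows, docs):
        # headers: list[str]; rows: list[list[str]];
        # docs: list of (doc_id, doc_name) tuples aligned with rows
        clmns = "|" + "|".join(headers) + "|"
        line = "|" + "|".join("------" for _ in headers) + "|"
        body = ["|" + "|".join(r) + "|" for r in rows]
        return {
            "answer": "\n".join([clmns, line] + body),
            "reference": {
                "chunks": [{"doc_id": i, "docnm_kwd": n} for i, n in docs],
                "doc_aggs": [{"doc_id": i, "doc_name": n, "count": 1} for i, n in docs],
            },
        }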
deepdoc/parser/pdf_parser.py CHANGED
@@ -996,7 +996,7 @@ class HuParser:
             if need_position: return None, None
             return
 
-        max_width = np.max([right - left for (_, left, right, _, _) in poss])
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
         poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
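
Note: the only change here is a floor on the crop width. A degenerate set of position boxes, as a plain-text parse can produce, may have coinciding left and right edges, and a zero `max_width` would yield an empty crop downstream. A tiny sketch of the guard (reading the floor of 6 as a minimum pixel width, matching the `GAP` constant, is my interpretation):

    import numpy as np

    # Sketch: never let the computed crop width collapse to zero.
    def safe_max_width(poss, floor=6):
        # poss: (page_numbers, left, right, top, bottom) tuples
        return max(np.max([right - left for (_, left, right, _, _) in poss]), floor)

    print(safe_max_width([([1], 10.0, 10.0, 0.0, 5.0)]))  # -> 6, not 0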
deepdoc/vision/ocr.py CHANGED
@@ -14,9 +14,6 @@
 import copy
 import time
 import os
-
-from huggingface_hub import snapshot_download
-
 from .operators import *
 import numpy as np
 import onnxruntime as ort
@@ -24,7 +21,6 @@ import onnxruntime as ort
 from .postprocess import build_post_process
 from rag.settings import cron_logger
 
-
 def transform(data, ops=None):
     """ transform """
     if ops is None:
@@ -82,7 +78,7 @@ class TextRecognizer(object):
         self.rec_batch_num = 16
         postprocess_params = {
             'name': 'CTCLabelDecode',
-            "character_dict_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "ocr.res"),
+            "character_dict_path": os.path.join(model_dir, "ocr.res"),
             "use_space_char": True
         }
         self.postprocess_op = build_post_process(postprocess_params)
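
Note: the now-unused `snapshot_download` import is dropped, and the CTC character dictionary is read from the same directory as the model weights rather than from the installed package. A sketch of the lookup, assuming `model_dir` is the directory handed to `TextRecognizer` (its exact plumbing isn't shown in this hunk):

    import os

    # Sketch: resolve the recognizer's character dictionary next to its
    # weights. "ocr.res" is named in the diff; the guard is illustrative.
    def character_dict_path(model_dir: str) -> str:
        path = os.path.join(model_dir, "ocr.res")
        if not os.path.exists(path):
            raise FileNotFoundError(f"missing character dict: {path}")
        return path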
deepdoc/vision/table_structure_recognizer.py CHANGED
@@ -16,6 +16,7 @@ import re
 from collections import Counter
 
 import numpy as np
+from huggingface_hub import snapshot_download
 
 from api.utils.file_utils import get_project_base_directory
 from rag.nlp import huqie
@@ -33,7 +34,8 @@ class TableStructureRecognizer(Recognizer):
     ]
 
     def __init__(self):
-        super().__init__(self.labels, "tsr",os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
+        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+        super().__init__(self.labels, "tsr", model_dir)
 
     def __call__(self, images, thr=0.2):
         tbls = super().__call__(images, thr)
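
Note: the table-structure model now comes from the Hugging Face Hub instead of a path inside the project tree. `snapshot_download` caches under the standard Hugging Face cache directory, so only the first construction pays the download cost. A sketch of the pattern (the wrapper function is mine; the repo id is from the diff):

    from huggingface_hub import snapshot_download

    # Sketch: the first call downloads and caches; later calls resolve locally.
    def deepdoc_model_dir() -> str:
        return snapshot_download(repo_id="InfiniFlow/deepdoc")

    model_dir = deepdoc_model_dir()  # handed to the Recognizer base class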
rag/app/laws.py CHANGED
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
 
         callback(0.8, "Text extraction finished")
 
-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
+        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         for txt, poss in pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback):
+                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
            sections.append(txt + poss)
 
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
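
Note: `Pdf.__call__` now returns a `(sections, tables)` pair, matching `PlainParser`, so every call site can uniformly index `[0]` whichever parser was chosen; the same one-line indexing fix appears in rag/app/paper.py below. A sketch of the convention (the class bodies are illustrative stand-ins for the repo's Pdf/PlainParser):

    # Sketch: both parser flavors share one return shape, so callers can
    # take [0] without caring which was instantiated.
    class LayoutPdf:
        def __call__(self, fnm, **kwargs):
            return [("line of text", "position tag")], []   # (sections, tables)

    class PlainPdf:
        def __call__(self, fnm, **kwargs):
            return [("line of text", "")], None              # no tables in plain mode

    for parser in (LayoutPdf(), PlainPdf()):
        sections = parser("a.pdf")[0]  # uniform access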
rag/app/paper.py CHANGED
@@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             "title": filename,
             "authors": " ",
             "abstract": "",
-            "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page),
+            "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
             "tables": []
         }
     else:
rag/app/presentation.py CHANGED
@@ -66,7 +66,7 @@ class Pdf(PdfParser):
 
 class PlainPdf(PlainParser):
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
-        self.pdf = pdf2_read(filename if not binary else BytesIO(filename))
+        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
         page_txt = []
         for page in self.pdf.pages[from_page: to_page]:
             page_txt.append(page.extract_text())
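
Note: this is the headline plainPdf bug. When the document arrives as in-memory bytes, the old code wrapped the *filename* in `BytesIO`, so the reader tried to parse a path string as PDF data; rag/app/resume.py below carries a companion fix for `binary` now being raw bytes. A sketch of the corrected dual-input pattern, using pypdf's `PdfReader` (which accepts a path or a binary stream; `pdf2_read` in the repo looks like an alias for such a reader):

    from io import BytesIO
    from pypdf import PdfReader

    # Sketch: accept either a filesystem path or raw bytes.
    def open_pdf(filename, binary=None):
        return PdfReader(filename if not binary else BytesIO(binary))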
rag/app/resume.py CHANGED
@@ -40,7 +40,7 @@ def remote_call(filename, binary):
             "encrypt_type": "base64",
             "filename": filename,
             "langtype": '',
-            "fileori": base64.b64encode(binary.stream.read()).decode('utf-8')
+            "fileori": base64.b64encode(binary).decode('utf-8')
         },
         "c": "resume_parse_module",
         "m": "resume_parse"
rag/llm/embedding_model.py CHANGED
@@ -20,10 +20,10 @@ from openai import OpenAI
 from FlagEmbedding import FlagModel
 import torch
 import numpy as np
-
+from huggingface_hub import snapshot_download
 from rag.utils import num_tokens_from_string
 
-flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
+flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
                        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                        use_fp16=torch.cuda.is_available())
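Note: passing a bare repo id let `FlagModel` trigger its own download logic; resolving the id through `snapshot_download(..., local_files_only=True)` hands it a local path, so weights pre-fetched elsewhere are used without touching the network at import time. The flag only *resolves* from cache; a never-downloaded model raises instead, which surfaces the problem at startup rather than mid-request. A sketch:

    from huggingface_hub import snapshot_download

    # Sketch: resolve a cached snapshot offline; fail fast if it is absent.
    try:
        path = snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True)
    except Exception as e:  # LocalEntryNotFoundError in practice
        raise RuntimeError("embedding model not downloaded yet") from e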
 
rag/nlp/query.py CHANGED
@@ -53,7 +53,7 @@ class EsQueryer:
 
         if not self.isChinese(txt):
             tks = huqie.qie(txt).split(" ")
-            q = tks
+            q = copy.deepcopy(tks)
             for i in range(1, len(tks)):
                 q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
             if not q:
@@ -138,7 +138,7 @@ class EsQueryer:
 
         def toDict(tks):
             d = {}
-            if isinstance(tks, type("")):
+            if isinstance(tks, str):
                 tks = tks.split(" ")
             for t, c in self.tw.weights(tks):
                 if t not in d:
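
Note: two small but real fixes here. `isinstance(tks, str)` replaces the awkward `type("")` idiom, and `q = copy.deepcopy(tks)` breaks an aliasing bug: appending boosted bigram phrases to `q` silently grew `tks` too, since both names bound the same list. A runnable illustration (a shallow `list(tks)` would also suffice for a list of strings; the commit uses `deepcopy`):

    import copy

    tks = ["quick", "brown", "fox"]
    q = tks                      # buggy: same list object
    q.append('"quick brown"^2')
    assert len(tks) == 4         # source tokens polluted

    tks = ["quick", "brown", "fox"]
    q = copy.deepcopy(tks)       # fixed: independent copy
    q.append('"quick brown"^2')
    assert len(tks) == 3         # source tokens intact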
rag/nlp/search.py CHANGED
@@ -234,13 +234,13 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
 
-        chunks_tks = [huqie.qie(ck).split(" ") for ck in chunks]
+        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
         cites = {}
         for i, a in enumerate(pieces_):
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             huqie.qie(
-                                                                pieces_[i]).split(" "),
+                                                                self.qryr.rmWWW(pieces_[i])).split(" "),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
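
Note: both sides of the citation-matching similarity now pass through `self.qryr.rmWWW` before tokenizing; the name suggests it strips question/filler words so they cannot inflate token overlap between an answer piece and a chunk. A sketch of the idea (the stop list and similarity here are illustrative, not the repo's implementation):

    import re

    STOP = {"what", "who", "where", "when", "how", "is", "the", "a"}

    def rm_www(text):
        return " ".join(w for w in re.split(r"\W+", text.lower())
                        if w and w not in STOP)

    def token_sim(a, b):
        ta, tb = set(rm_www(a).split()), set(rm_www(b).split())
        return len(ta & tb) / max(1, len(ta | tb))

    print(token_sim("What is vector search?", "vector search finds neighbors"))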
rag/nlp/term_weight.py CHANGED
@@ -150,9 +150,10 @@ class Dealer:
             return 6
 
         def ner(t):
+            if re.match(r"[0-9,.]{2,}$", t): return 2
+            if re.match(r"[a-z]{1,2}$", t): return 0.01
             if not self.ne or t not in self.ne:
                 return 1
-            if re.match(r"[0-9,.]+$", t): return 2
             m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
                  "firstnm": 1}
             return m[self.ne[t]]
@@ -170,11 +171,11 @@ class Dealer:
             return 1
 
         def freq(t):
-            if re.match(r"[0-9\. -]+$", t):
-                return 10000
+            if re.match(r"[0-9. -]{2,}$", t):
+                return 3
             s = huqie.freq(t)
-            if not s and re.match(r"[a-z\. -]+$", t):
-                return 10
+            if not s and re.match(r"[a-z. -]+$", t):
+                return 300
             if not s:
                 s = 0
 
@@ -188,12 +189,12 @@ class Dealer:
             return max(s, 10)
 
         def df(t):
-            if re.match(r"[0-9\. -]+$", t):
-                return 100000
+            if re.match(r"[0-9. -]{2,}$", t):
+                return 5
             if t in self.df:
                 return self.df[t] + 3
-            elif re.match(r"[a-z\. -]+$", t):
-                return 3
+            elif re.match(r"[a-z. -]+$", t):
+                return 300
             elif len(t) >= 4:
                 s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
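
Note: the rebalanced constants invert how purely numeric and purely Latin tokens are treated. Numbers drop from `df` 100000 to 5 (rare, hence heavily weighted in an idf-style score) while generic a-z tokens rise to 300 (common, hence damped), and `ner` now short-circuits on those patterns before the named-entity lookup. A sketch of why a smaller `df` means a heavier term (the log form and corpus size are illustrative, not the repo's exact formula):

    import math

    N = 1_000_000  # illustrative corpus size

    def idf(df):
        return math.log10(N / (df + 1))

    print(round(idf(5), 2))    # numeric token after the commit: ~5.22
    print(round(idf(300), 2))  # plain a-z token: ~3.52, weighted less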
rag/svr/task_broker.py CHANGED
@@ -87,7 +87,9 @@ def dispatch():
         if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
         if r["parser_id"] == "one": page_size = 1000000000
         if not do_layout: page_size = 1000000000
-        for s,e in r["parser_config"].get("pages", [(1, 100000)]):
+        page_ranges = r["parser_config"].get("pages")
+        if not page_ranges: page_ranges = [(1, 100000)]
+        for s,e in page_ranges:
            s -= 1
            s = max(0, s)
            e = min(e-1, pages)
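
Note: the root cause is that `dict.get(key, default)` applies the default only when the key is *missing*. A `parser_config` persisted with `"pages": None` (or `[]`) returns that falsy value, and the tuple-unpacking loop then crashes; hence the explicit falsy check. In miniature (the `or` idiom below is an equivalent one-liner, not the commit's exact form):

    cfg = {"pages": None}                      # key present, value falsy

    broken = cfg.get("pages", [(1, 100000)])   # -> None; `for s, e in ...` raises
    fixed = cfg.get("pages") or [(1, 100000)]  # -> [(1, 100000)]

    for s, e in fixed:
        print(s, e)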