KevinHuSh committed
Commit 64a0633, parent bcb7249

fix table desc bugs, add positions to chunks (#91)
api/apps/chunk_app.py CHANGED
@@ -51,7 +51,7 @@ def list():
         if not e:
             return get_data_error_result(retmsg="Document not found!")
         query = {
-            "doc_ids": [doc_id], "page": page, "size": size, "question": question
+            "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
         }
         if "available_int" in req:
             query["available_int"] = int(req["available_int"])
@@ -66,7 +66,12 @@ def list():
                 "important_kwd": sres.field[id].get("important_kwd", []),
                 "img_id": sres.field[id].get("img_id", ""),
                 "available_int": sres.field[id].get("available_int", 1),
+                "positions": sres.field[id].get("position_int", "").split("\t")
             }
+            poss = []
+            for i in range(0, len(d["positions"]), 5):
+                poss.append([float(d["positions"][i]), float(d["positions"][i+1]), float(d["positions"][i+2]), float(d["positions"][i+3]), float(d["positions"][i+4])])
+            d["positions"] = poss
             res["chunks"].append(d)
         return get_json_result(data=res)
     except Exception as e:
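The `positions` round trip introduced here is worth spelling out: `add_positions()` (rag/nlp/__init__.py, below) stores one `(page, left, right, top, bottom)` quintuple per box, `Dealer.insert()` flattens them into a single tab-joined string, and the loop above regroups that string five tokens at a time. A minimal standalone sketch of the regrouping (`parse_positions` and the sample string are illustrative, not part of the patch):

    def parse_positions(position_int: str):
        flat = position_int.split("\t")
        # Regroup the flat token stream into 5-float boxes:
        # [page, left, right, top, bottom] per box.
        return [[float(x) for x in flat[i:i + 5]] for i in range(0, len(flat), 5)]

    print(parse_positions("1\t36.0\t523.0\t112.4\t208.9"))
    # [[1.0, 36.0, 523.0, 112.4, 208.9]]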
api/settings.py CHANGED
@@ -21,9 +21,14 @@ from api.utils import get_base_config,decrypt_database_config
 from api.utils.file_utils import get_project_base_directory
 from api.utils.log_utils import LoggerFactory, getLogger
 
-from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+# Logger
+LoggerFactory.set_directory(os.path.join(get_project_base_directory(), "logs", "api"))
+# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
+LoggerFactory.LEVEL = 10
+
+stat_logger = getLogger("stat")
+access_logger = getLogger("access")
+database_logger = getLogger("database")
 
 API_VERSION = "v1"
 RAG_FLOW_SERVICE_NAME = "ragflow"
@@ -133,16 +138,10 @@ AUTHENTICATION_DEFAULT_TIMEOUT = 30 * 24 * 60 * 60 # s
 PRIVILEGE_COMMAND_WHITELIST = []
 CHECK_NODES_IDENTITY = False
 
+from rag.nlp import search
+from rag.utils import ELASTICSEARCH
 retrievaler = search.Dealer(ELASTICSEARCH)
 
-# Logger
-LoggerFactory.set_directory(os.path.join(get_project_base_directory(), "logs", "api"))
-# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
-LoggerFactory.LEVEL = 10
-
-stat_logger = getLogger("stat")
-access_logger = getLogger("access")
-database_logger = getLogger("database")
 
 class CustomEnum(Enum):
     @classmethod
deepdoc/parser/pdf_parser.py CHANGED
@@ -545,7 +545,7 @@ class HuParser:
             b_["top"] = b["top"]
             self.boxes.pop(i)
 
-    def _extract_table_figure(self, need_image, ZM, return_html):
+    def _extract_table_figure(self, need_image, ZM, return_html, need_position):
         tables = {}
         figures = {}
         # extract figure and table boxes
@@ -658,8 +658,9 @@
             self.boxes.pop(i)
 
         res = []
+        positions = []
 
-        def cropout(bxs, ltype):
+        def cropout(bxs, ltype, poss):
             nonlocal ZM
             pn = set([b["page_number"] - 1 for b in bxs])
             if len(pn) < 2:
@@ -682,6 +683,7 @@
                                          "layoutno", "")))
 
                 left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
+                poss.append((pn, left, right, top, bott))
                 return self.page_images[pn] \
                     .crop((left * ZM, top * ZM,
                            right * ZM, bott * ZM))
@@ -692,7 +694,7 @@
                 pn[p] = []
                 pn[p].append(b)
             pn = sorted(pn.items(), key=lambda x: x[0])
-            imgs = [cropout(arr, ltype) for p, arr in pn]
+            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
             pic = Image.new("RGB",
                             (int(np.max([i.size[0] for i in imgs])),
                              int(np.sum([m.size[1] for m in imgs]))),
@@ -714,18 +716,26 @@
             if not txt:
                 continue
 
+            poss = []
             res.append(
                 (cropout(
                     bxs,
-                    "figure"),
+                    "figure", poss),
                  [txt] if not return_html else [f"<p>{txt}</p>"]))
+            positions.append(poss)
 
         for k, bxs in tables.items():
             if not bxs:
                 continue
-            res.append((cropout(bxs, "table"),
+            bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"]-b["top"])/2 for b in bxs]))
+            poss = []
+            res.append((cropout(bxs, "table", poss),
                         self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
+            positions.append(poss)
 
+        assert len(positions) == len(res)
+
+        if need_position: return list(zip(res, positions))
         return res
 
     def proj_match(self, line):
@@ -922,13 +932,13 @@
         self._text_merge()
         self._concat_downward()
         self._filter_forpages()
-        tbls = self._extract_table_figure(need_image, zoomin, return_html)
+        tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
         return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
 
     def remove_tag(self, txt):
         return re.sub(r"@@[\t0-9.-]+?##", "", txt)
 
-    def crop(self, text, ZM=3):
+    def crop(self, text, ZM=3, need_position=False):
         imgs = []
         poss = []
         for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
@@ -946,6 +956,7 @@
             pos = poss[-1]
             poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+GAP), min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+120)))
 
+        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
             right = left + max_width
             bottom *= ZM
@@ -958,6 +969,8 @@
                               bottom, self.page_images[pns[0]].size[1])
                           ))
             )
+            positions.append((pns[0], left, right, top, min(
+                bottom, self.page_images[pns[0]].size[1])/ZM))
             bottom -= self.page_images[pns[0]].size[1]
             for pn in pns[1:]:
                 imgs.append(
@@ -967,9 +980,12 @@
                                   self.page_images[pn].size[1])
                               ))
                 )
+                positions.append((pn, left, right, 0, min(
+                    bottom, self.page_images[pn].size[1]) / ZM))
                 bottom -= self.page_images[pn].size[1]
 
         if not imgs:
+            if need_position: return None, None
             return
         height = 0
         for img in imgs:
@@ -988,6 +1004,9 @@
             img = Image.alpha_composite(img, overlay).convert("RGB")
             pic.paste(img, (0, int(height)))
             height += img.size[1] + GAP
+
+        if need_position:
+            return pic, positions
         return pic
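A note on the plumbing above: `crop()` locates a chunk's page boxes via the position tags that `_line_tag()` embeds in each line of text, and `remove_tag()` strips them again before tokenization. Both regexes below are copied verbatim from the hunks; the sample tag itself is fabricated for illustration.

    import re

    text = "Figure 3 shows the pipeline.@@1\t36.0\t523.0\t112.4\t208.9##"
    # The tag shape crop() scans for: page number(s), tab, four coordinates.
    print(re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text))
    # remove_tag() strips the tags so they never reach the tokenizer.
    print(re.sub(r"@@[\t0-9.-]+?##", "", text))  # Figure 3 shows the pipeline.

With `need_position=True`, `crop()` now returns `(pic, positions)` (or `(None, None)` when nothing matches), and `_extract_table_figure(..., need_position=True)` returns `list(zip(res, positions))`, i.e. `((img, rows), poss)` pairs, which is exactly the shape `tokenize_table()` in rag/nlp/__init__.py unpacks below.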
deepdoc/vision/recognizer.py CHANGED
@@ -265,6 +265,7 @@ class Recognizer(object):
             return
         min_dis, min_i = 1000000, None
         for i,b in enumerate(boxes):
+            if box.get("layoutno", "0") != b.get("layoutno", "0"): continue
             dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]), abs(box["x0"]+box["x1"] - b["x1"] - b["x0"])/2)
             if dis < min_dis:
                 min_i = i
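The one-line guard above keeps the nearest-box search from snapping a box onto a neighbor that belongs to a different layout component. A self-contained sketch of the loop with the guard in place (`nearest_box` is a stand-in name; the real logic lives in the `Recognizer` method shown):

    def nearest_box(box, boxes):
        # Index of the horizontally closest box sharing box's layout
        # component; None if no candidate qualifies.
        min_dis, min_i = 1000000, None
        for i, b in enumerate(boxes):
            if box.get("layoutno", "0") != b.get("layoutno", "0"):
                continue  # the new guard: never match across layout blocks
            dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]),
                      abs(box["x0"] + box["x1"] - b["x1"] - b["x0"]) / 2)
            if dis < min_dis:
                min_dis, min_i = dis, i
        return min_i

    print(nearest_box({"x0": 10, "x1": 20, "layoutno": "table-0"},
                      [{"x0": 11, "x1": 21, "layoutno": "table-0"},
                       {"x0": 10, "x1": 20, "layoutno": "figure-1"}]))  # -> 0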
rag/app/book.py CHANGED
@@ -13,7 +13,7 @@
 import copy
 import re
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
 
@@ -21,6 +21,7 @@ from deepdoc.parser import PdfParser, DocxParser
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
+        callback(msg="OCR is running...")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -40,11 +41,11 @@ class Pdf(PdfParser):
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False)
+        tbls = self._extract_table_figure(True, zoomin, False, True)
 
         callback(0.8, "Text extraction finished")
 
-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls, tbl_poss
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -69,7 +70,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
-        sections,tbls = pdf_parser(filename if not binary else binary,
+        sections, tbls = pdf_parser(filename if not binary else binary,
                          from_page=from_page, to_page=to_page, callback=callback)
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -105,7 +106,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         d = copy.deepcopy(doc)
         ck = "\n".join(ck)
         if pdf_parser:
-            d["image"] = pdf_parser.crop(ck)
+            d["image"], poss = pdf_parser.crop(ck, need_position=True)
+            add_positions(d, poss)
             ck = pdf_parser.remove_tag(ck)
         tokenize(d, ck, eng)
         res.append(d)
rag/app/laws.py CHANGED
@@ -15,7 +15,7 @@ import re
 from io import BytesIO
 from docx import Document
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title
+    make_colon_as_title, add_positions
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
 from rag.settings import cron_logger
@@ -49,6 +49,7 @@ class Docx(DocxParser):
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
+        callback(msg="OCR is running...")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -122,7 +123,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         ck = "\n".join(ck)
         d = copy.deepcopy(doc)
         if pdf_parser:
-            d["image"] = pdf_parser.crop(ck)
+            d["image"], poss = pdf_parser.crop(ck, need_position=True)
+            add_positions(d, poss)
             ck = pdf_parser.remove_tag(ck)
         tokenize(d, ck, eng)
         res.append(d)
rag/app/manual.py CHANGED
@@ -2,7 +2,7 @@ import copy
 import re
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions
 from deepdoc.parser import PdfParser
 from rag.utils import num_tokens_from_string
 
@@ -14,6 +14,7 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
+        callback(msg="OCR is running...")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -32,7 +33,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False)
+        tbls = self._extract_table_figure(True, zoomin, False, True)
 
         # clean mess
         for b in self.boxes:
@@ -91,7 +92,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             d = copy.deepcopy(doc)
             ck = "\n".join(chunk)
             tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
-            d["image"] = pdf_parser.crop(ck)
+            d["image"], poss = pdf_parser.crop(ck, need_position=True)
+            add_positions(d, poss)
             res.append(d)
             chunk = []
             tk_cnt = 0
rag/app/naive.py CHANGED
@@ -13,7 +13,7 @@
 import copy
 import re
 from rag.app import laws
-from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table
+from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
 from deepdoc.parser import PdfParser
 from rag.settings import cron_logger
 
@@ -21,6 +21,7 @@ from rag.settings import cron_logger
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
+        callback(msg="OCR is running...")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -39,7 +40,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False)
+        tbls = self._extract_table_figure(True, zoomin, False, True)
 
         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
         #self._naive_vertical_merge()
@@ -95,11 +96,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
 
     # wrap up to es documents
     for ck in cks:
+        if len(ck.strip()) == 0:continue
         print("--", ck)
-        if not ck:continue
         d = copy.deepcopy(doc)
         if pdf_parser:
-            d["image"] = pdf_parser.crop(ck)
+            d["image"], poss = pdf_parser.crop(ck, need_position=True)
+            add_positions(d, poss)
             ck = pdf_parser.remove_tag(ck)
         tokenize(d, ck, eng)
         res.append(d)
rag/app/paper.py CHANGED
@@ -15,7 +15,7 @@ import re
 from collections import Counter
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -28,6 +28,7 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
+        callback(msg="OCR is running...")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -47,7 +48,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False)
+        tbls = self._extract_table_figure(True, zoomin, False, True)
 
         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
@@ -165,7 +166,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         txt = pdf_parser.remove_tag(paper["abstract"])
         d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
         d["important_tks"] = " ".join(d["important_kwd"])
-        d["image"] = pdf_parser.crop(paper["abstract"])
+        d["image"], poss = pdf_parser.crop(paper["abstract"], need_position=True)
+        add_positions(d, poss)
         tokenize(d, txt, eng)
         res.append(d)
 
@@ -198,8 +200,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         for p in proj:
             d = copy.deepcopy(doc)
             txt += "\n" + pdf_parser.remove_tag(p)
-            d["image"] = pdf_parser.crop(p)
-            tokenize(d, txt)
+            d["image"], poss = pdf_parser.crop(p, need_position=True)
+            add_positions(d, poss)
+            tokenize(d, txt, eng)
             res.append(d)
 
     i = 0
@@ -210,7 +213,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             d = copy.deepcopy(doc)
             ck = "\n".join(chunk)
             tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
-            d["image"] = pdf_parser.crop(ck)
+            d["image"], poss = pdf_parser.crop(ck, need_position=True)
+            add_positions(d, poss)
             res.append(d)
             chunk = []
             tk_cnt = 0
rag/app/presentation.py CHANGED
@@ -48,6 +48,7 @@ class Pdf(PdfParser):
         return False
 
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
+        callback(msg="OCR is running...")
         self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
         callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
@@ -94,9 +95,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
-        for txt,img in pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
+        for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
             d["image"] = img
+            d["page_num_obj"] = [pn+1]
             tokenize(d, txt, pdf_parser.is_english)
             res.append(d)
         return res
rag/nlp/__init__.py CHANGED
@@ -83,17 +83,39 @@ def tokenize(d, t, eng):
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
-    for img, rows in tbls:
+    for (img, rows), poss in tbls:
+        if not rows:continue
+        if isinstance(rows, str):
+            d = copy.deepcopy(doc)
+            r = re.sub(r"<[^<>]{,12}>", "", rows)
+            tokenize(d, r, eng)
+            d["content_with_weight"] = rows
+            d["image"] = img
+            add_positions(d, poss)
+            res.append(d)
+            continue
         de = "; " if eng else "； "
         for i in range(0, len(rows), batch_size):
            d = copy.deepcopy(doc)
             r = de.join(rows[i:i + batch_size])
             tokenize(d, r, eng)
             d["image"] = img
+            add_positions(d, poss)
             res.append(d)
     return res
 
 
+def add_positions(d, poss):
+    if not poss:return
+    d["page_num_int"] = []
+    d["position_int"] = []
+    d["top_int"] = []
+    for pn, left, right, top, bottom in poss:
+        d["page_num_int"].append(pn+1)
+        d["top_int"].append(top)
+        d["position_int"].append((pn+1, left, right, top, bottom))
+
+
 def remove_contents_table(sections, eng=False):
     i = 0
     while i < len(sections):
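`add_positions()` is the glue between parser output and the new Elasticsearch fields: `poss` carries the 0-based page indices produced by `crop()`/`cropout()`, while the stored `page_num_int`, `top_int` and `position_int` fields are 1-based. A runnable demo of the function exactly as added above:

    def add_positions(d, poss):
        if not poss: return
        d["page_num_int"] = []
        d["position_int"] = []
        d["top_int"] = []
        for pn, left, right, top, bottom in poss:
            d["page_num_int"].append(pn + 1)
            d["top_int"].append(top)
            d["position_int"].append((pn + 1, left, right, top, bottom))

    d = {}
    add_positions(d, [(0, 36.0, 523.0, 112.4, 208.9)])
    print(d["page_num_int"], d["top_int"])  # [1] [112.4]
    print(d["position_int"])                # [(1, 36.0, 523.0, 112.4, 208.9)]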
rag/nlp/search.py CHANGED
@@ -68,17 +68,25 @@ class Dealer:
         pg = int(req.get("page", 1)) - 1
         ps = int(req.get("size", 1000))
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
-                                 "image_id", "doc_id", "q_512_vec", "q_768_vec",
+                                 "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
 
         s = s.query(bqry)[pg * ps:(pg + 1) * ps]
         s = s.highlight("content_ltks")
         s = s.highlight("title_ltks")
         if not qst:
-            s = s.sort(
-                {"create_time": {"order": "desc", "unmapped_type": "date"}},
-                {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
-            )
+            if not req.get("sort"):
+                s = s.sort(
+                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
+                )
+            else:
+                s = s.sort(
+                    {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
+                    {"top_int": {"order": "asc", "unmapped_type": "float"}},
+                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
+                )
 
         if qst:
             s = s.highlight_options(
@@ -169,7 +177,7 @@ class Dealer:
             m = {n: d.get(n) for n in flds if d.get(n) is not None}
             for n, v in m.items():
                 if isinstance(v, type([])):
-                    m[n] = "\t".join([str(vv) for vv in v])
+                    m[n] = "\t".join([str(vv) if not isinstance(vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
                     continue
                 if not isinstance(v, type("")):
                     m[n] = str(m[n])
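The widened list serializer in `insert()` is what turns `position_int` into the flat string the chunk endpoint parses. A standalone sketch; note that the check is `isinstance(vv, list)`, so this assumes the quintuples arrive here as lists (a tuple would fall through to `str(vv)`):

    v = [[1, 36.0, 523.0, 112.4, 208.9], [2, 40.0, 510.0, 0.0, 88.0]]
    m = "\t".join([str(vv) if not isinstance(vv, list)
                   else "\t".join([str(vvv) for vvv in vv]) for vv in v])
    print(m)  # ten tab-separated tokens: two 5-value boxes back to back

Together with the new `sort` branch (order by `page_num_int`, then `top_int`), chunks listed with `"sort": True` come back in reading order rather than newest-first.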
rag/svr/task_executor.py CHANGED
@@ -48,6 +48,7 @@ from api.utils.file_utils import get_project_base_directory
 BATCH_SIZE = 64
 
 FACTORY = {
+    "general": naive,
     ParserType.NAIVE.value: naive,
     ParserType.PAPER.value: paper,
     ParserType.BOOK.value: book,
@@ -228,6 +229,8 @@ def main(comm, mod):
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
         if es_r:
             callback(-1, "Index failure!")
+            ELASTICSEARCH.deleteByQuery(
+                Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
             cron_logger.error(str(es_r))
         else:
             if TaskService.do_cancel(r["id"]):