KevinHuSh committed on
Commit
7d85666
·
1 Parent(s): 825281b

refine manual parser (#131)

Browse files
README.md CHANGED
@@ -50,7 +50,7 @@ platform to empower your business with AI.
50
 
51
  # Release Notification
52
  **Star us on GitHub, and be notified for a new releases instantly!**
53
- ![star-us](https://github.com/langgenius/dify/assets/100913391/95f37259-7370-4456-a9f0-0bc01ef8642f)
54
 
55
  # Installation
56
  ## System Requirements
 
50
 
51
  # Release Notification
52
  **Star us on GitHub, and be notified for a new releases instantly!**
53
+ ![star-us](https://github.com/infiniflow/ragflow/assets/12318111/2c2fbb5e-c403-496f-a1fd-64ba0fdbf74f)
54
 
55
  # Installation
56
  ## System Requirements
api/apps/conversation_app.py CHANGED
@@ -274,6 +274,8 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
274
  return retrievaler.sql_retrieval(sql, format="json"), sql
275
 
276
  tbl, sql = get_table()
 
 
277
  if tbl.get("error") and tried_times <= 2:
278
  user_promt = """
279
  表名:{};
 
274
  return retrievaler.sql_retrieval(sql, format="json"), sql
275
 
276
  tbl, sql = get_table()
277
+ if tbl is None:
278
+ return None, None
279
  if tbl.get("error") and tried_times <= 2:
280
  user_promt = """
281
  表名:{};
api/apps/llm_app.py CHANGED
@@ -107,7 +107,7 @@ def list():
107
  llms = LLMService.get_all()
108
  llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value]
109
  for m in llms:
110
- m["available"] = m["fid"] in facts
111
 
112
  res = {}
113
  for m in llms:
 
107
  llms = LLMService.get_all()
108
  llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value]
109
  for m in llms:
110
+ m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding"
111
 
112
  res = {}
113
  for m in llms:
api/db/init_data.py CHANGED
@@ -227,7 +227,7 @@ def init_llm_factory():
227
  "model_type": LLMType.CHAT.value
228
  }, {
229
  "fid": factory_infos[3]["name"],
230
- "llm_name": "flag-enbedding",
231
  "tags": "TEXT EMBEDDING,",
232
  "max_tokens": 128 * 1000,
233
  "model_type": LLMType.EMBEDDING.value
@@ -241,7 +241,7 @@ def init_llm_factory():
241
  "model_type": LLMType.CHAT.value
242
  }, {
243
  "fid": factory_infos[4]["name"],
244
- "llm_name": "flag-enbedding",
245
  "tags": "TEXT EMBEDDING,",
246
  "max_tokens": 128 * 1000,
247
  "model_type": LLMType.EMBEDDING.value
 
227
  "model_type": LLMType.CHAT.value
228
  }, {
229
  "fid": factory_infos[3]["name"],
230
+ "llm_name": "flag-embedding",
231
  "tags": "TEXT EMBEDDING,",
232
  "max_tokens": 128 * 1000,
233
  "model_type": LLMType.EMBEDDING.value
 
241
  "model_type": LLMType.CHAT.value
242
  }, {
243
  "fid": factory_infos[4]["name"],
244
+ "llm_name": "flag-embedding",
245
  "tags": "TEXT EMBEDDING,",
246
  "max_tokens": 128 * 1000,
247
  "model_type": LLMType.EMBEDDING.value
api/settings.py CHANGED
@@ -72,13 +72,13 @@ default_llm = {
72
  },
73
  "Local": {
74
  "chat_model": "qwen-14B-chat",
75
- "embedding_model": "flag-enbedding",
76
  "image2text_model": "",
77
  "asr_model": "",
78
  },
79
  "Moonshot": {
80
  "chat_model": "moonshot-v1-8k",
81
- "embedding_model": "flag-enbedding",
82
  "image2text_model": "",
83
  "asr_model": "",
84
  }
 
72
  },
73
  "Local": {
74
  "chat_model": "qwen-14B-chat",
75
+ "embedding_model": "flag-embedding",
76
  "image2text_model": "",
77
  "asr_model": "",
78
  },
79
  "Moonshot": {
80
  "chat_model": "moonshot-v1-8k",
81
+ "embedding_model": "",
82
  "image2text_model": "",
83
  "asr_model": "",
84
  }
deepdoc/parser/pdf_parser.py CHANGED
@@ -247,7 +247,7 @@ class HuParser:
247
  b["SP"] = ii
248
 
249
  def __ocr(self, pagenum, img, chars, ZM=3):
250
- bxs = self.ocr(np.array(img))
251
  if not bxs:
252
  self.boxes.append([])
253
  return
@@ -278,8 +278,10 @@ class HuParser:
278
 
279
  for b in bxs:
280
  if not b["text"]:
281
- b["text"] = b["txt"]
 
282
  del b["txt"]
 
283
  if self.mean_height[-1] == 0:
284
  self.mean_height[-1] = np.median([b["bottom"] - b["top"]
285
  for b in bxs])
 
247
  b["SP"] = ii
248
 
249
  def __ocr(self, pagenum, img, chars, ZM=3):
250
+ bxs = self.ocr.detect(np.array(img))
251
  if not bxs:
252
  self.boxes.append([])
253
  return
 
278
 
279
  for b in bxs:
280
  if not b["text"]:
281
+ left, right, top, bott = b["x0"]*ZM, b["x1"]*ZM, b["top"]*ZM, b["bottom"]*ZM
282
+ b["text"] = self.ocr.recognize(np.array(img), np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
283
  del b["txt"]
284
+ bxs = [b for b in bxs if b["text"]]
285
  if self.mean_height[-1] == 0:
286
  self.mean_height[-1] = np.median([b["bottom"] - b["top"]
287
  for b in bxs])
deepdoc/vision/ocr.py CHANGED
@@ -69,7 +69,7 @@ def load_model(model_dir, nm):
69
  options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
70
  options.intra_op_num_threads = 2
71
  options.inter_op_num_threads = 2
72
- if ort.get_device() == "GPU":
73
  sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
74
  else:
75
  sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
@@ -366,7 +366,7 @@ class TextDetector(object):
366
  'keep_keys': ['image', 'shape']
367
  }
368
  }]
369
- postprocess_params = {"name": "DBPostProcess", "thresh": 0.3, "box_thresh": 0.6, "max_candidates": 1000,
370
  "unclip_ratio": 1.5, "use_dilation": False, "score_mode": "fast", "box_type": "quad"}
371
 
372
  self.postprocess_op = build_post_process(postprocess_params)
@@ -534,6 +534,34 @@ class OCR(object):
534
  break
535
  return _boxes
536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
  def __call__(self, img, cls=True):
538
  time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
539
 
@@ -562,6 +590,7 @@ class OCR(object):
562
  img_crop_list.append(img_crop)
563
 
564
  rec_res, elapse = self.text_recognizer(img_crop_list)
 
565
  time_dict['rec'] = elapse
566
  cron_logger.debug("rec_res num : {}, elapsed : {}".format(
567
  len(rec_res), elapse))
@@ -575,6 +604,7 @@ class OCR(object):
575
  end = time.time()
576
  time_dict['all'] = end - start
577
 
 
578
  #for bno in range(len(img_crop_list)):
579
  # print(f"{bno}, {rec_res[bno]}")
580
 
 
69
  options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
70
  options.intra_op_num_threads = 2
71
  options.inter_op_num_threads = 2
72
+ if False and ort.get_device() == "GPU":
73
  sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
74
  else:
75
  sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
 
366
  'keep_keys': ['image', 'shape']
367
  }
368
  }]
369
+ postprocess_params = {"name": "DBPostProcess", "thresh": 0.3, "box_thresh": 0.5, "max_candidates": 1000,
370
  "unclip_ratio": 1.5, "use_dilation": False, "score_mode": "fast", "box_type": "quad"}
371
 
372
  self.postprocess_op = build_post_process(postprocess_params)
 
534
  break
535
  return _boxes
536
 
537
+ def detect(self, img):
538
+ time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
539
+
540
+ if img is None:
541
+ return None, None, time_dict
542
+
543
+ start = time.time()
544
+ dt_boxes, elapse = self.text_detector(img)
545
+ time_dict['det'] = elapse
546
+
547
+ if dt_boxes is None:
548
+ end = time.time()
549
+ time_dict['all'] = end - start
550
+ return None, None, time_dict
551
+ else:
552
+ cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
553
+ len(dt_boxes), elapse))
554
+
555
+ return zip(self.sorted_boxes(dt_boxes), [("",0) for _ in range(len(dt_boxes))])
556
+
557
+ def recognize(self, ori_im, box):
558
+ img_crop = self.get_rotate_crop_image(ori_im, box)
559
+
560
+ rec_res, elapse = self.text_recognizer([img_crop])
561
+ text, score = rec_res[0]
562
+ if score < self.drop_score:return ""
563
+ return text
564
+
565
  def __call__(self, img, cls=True):
566
  time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
567
 
 
590
  img_crop_list.append(img_crop)
591
 
592
  rec_res, elapse = self.text_recognizer(img_crop_list)
593
+
594
  time_dict['rec'] = elapse
595
  cron_logger.debug("rec_res num : {}, elapsed : {}".format(
596
  len(rec_res), elapse))
 
604
  end = time.time()
605
  time_dict['all'] = end - start
606
 
607
+
608
  #for bno in range(len(img_crop_list)):
609
  # print(f"{bno}, {rec_res[bno]}")
610
 
deepdoc/vision/recognizer.py CHANGED
@@ -41,7 +41,7 @@ class Recognizer(object):
41
  if not os.path.exists(model_file_path):
42
  raise ValueError("not find model file path {}".format(
43
  model_file_path))
44
- if ort.get_device() == "GPU":
45
  options = ort.SessionOptions()
46
  options.enable_cpu_mem_arena = False
47
  self.ort_sess = ort.InferenceSession(model_file_path, options=options, providers=[('CUDAExecutionProvider')])
 
41
  if not os.path.exists(model_file_path):
42
  raise ValueError("not find model file path {}".format(
43
  model_file_path))
44
+ if False and ort.get_device() == "GPU":
45
  options = ort.SessionOptions()
46
  options.enable_cpu_mem_arena = False
47
  self.ort_sess = ort.InferenceSession(model_file_path, options=options, providers=[('CUDAExecutionProvider')])
rag/app/manual.py CHANGED
@@ -2,7 +2,7 @@ import copy
2
  import re
3
 
4
  from api.db import ParserType
5
- from rag.nlp import huqie, tokenize, tokenize_table, add_positions
6
  from deepdoc.parser import PdfParser
7
  from rag.utils import num_tokens_from_string
8
 
@@ -14,6 +14,8 @@ class Pdf(PdfParser):
14
 
15
  def __call__(self, filename, binary=None, from_page=0,
16
  to_page=100000, zoomin=3, callback=None):
 
 
17
  callback(msg="OCR is running...")
18
  self.__images__(
19
  filename if not binary else binary,
@@ -23,19 +25,38 @@ class Pdf(PdfParser):
23
  callback
24
  )
25
  callback(msg="OCR finished.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- from timeit import default_timer as timer
28
- start = timer()
29
  self._layouts_rec(zoomin)
30
  callback(0.65, "Layout analysis finished.")
31
  print("paddle layouts:", timer() - start)
32
  self._table_transformer_job(zoomin)
33
  callback(0.67, "Table analysis finished.")
34
  self._text_merge()
35
- self._concat_downward(concat_between_pages=False)
 
36
  self._filter_forpages()
37
  callback(0.68, "Text merging finished")
38
- tbls = self._extract_table_figure(True, zoomin, True, True)
39
 
40
  # clean mess
41
  for b in self.boxes:
@@ -44,25 +65,33 @@ class Pdf(PdfParser):
44
  # merge chunks with the same bullets
45
  self._merge_with_same_bullet()
46
 
47
- # merge title with decent chunk
48
- i = 0
49
- while i + 1 < len(self.boxes):
50
- b = self.boxes[i]
51
- if b.get("layoutno","").find("title") < 0:
52
- i += 1
53
- continue
54
- b_ = self.boxes[i + 1]
55
- b_["text"] = b["text"] + "\n" + b_["text"]
56
- b_["x0"] = min(b["x0"], b_["x0"])
57
- b_["x1"] = max(b["x1"], b_["x1"])
58
- b_["top"] = b["top"]
59
- self.boxes.pop(i)
60
-
61
- callback(0.8, "Parsing finished")
62
- for b in self.boxes: print(b["text"], b.get("layoutno"))
63
-
64
- print(tbls)
65
- return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
 
 
 
 
 
 
 
 
66
 
67
 
68
  def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -73,7 +102,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
73
 
74
  if re.search(r"\.pdf$", filename, re.IGNORECASE):
75
  pdf_parser = Pdf()
76
- cks, tbls = pdf_parser(filename if not binary else binary,
77
  from_page=from_page, to_page=to_page, callback=callback)
78
  else: raise NotImplementedError("file type not supported yet(pdf supported)")
79
  doc = {
@@ -84,16 +113,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
84
  # is it English
85
  eng = lang.lower() == "english"#pdf_parser.is_english
86
 
87
- res = tokenize_table(tbls, doc, eng)
88
-
89
  i = 0
90
  chunk = []
91
  tk_cnt = 0
 
92
  def add_chunk():
93
  nonlocal chunk, res, doc, pdf_parser, tk_cnt
94
  d = copy.deepcopy(doc)
95
  ck = "\n".join(chunk)
96
- tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
97
  d["image"], poss = pdf_parser.crop(ck, need_position=True)
98
  add_positions(d, poss)
99
  res.append(d)
@@ -101,7 +129,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
101
  tk_cnt = 0
102
 
103
  while i < len(cks):
104
- if tk_cnt > 128: add_chunk()
105
  txt = cks[i]
106
  txt_ = pdf_parser.remove_tag(txt)
107
  i += 1
@@ -109,6 +137,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
109
  chunk.append(txt)
110
  tk_cnt += cnt
111
  if chunk: add_chunk()
 
112
  for i, d in enumerate(res):
113
  print(d)
114
  # d["image"].save(f"./logs/{i}.jpg")
@@ -117,6 +146,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
117
 
118
  if __name__ == "__main__":
119
  import sys
120
- def dummy(a, b):
121
  pass
122
  chunk(sys.argv[1], callback=dummy)
 
2
  import re
3
 
4
  from api.db import ParserType
5
+ from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
6
  from deepdoc.parser import PdfParser
7
  from rag.utils import num_tokens_from_string
8
 
 
14
 
15
  def __call__(self, filename, binary=None, from_page=0,
16
  to_page=100000, zoomin=3, callback=None):
17
+ from timeit import default_timer as timer
18
+ start = timer()
19
  callback(msg="OCR is running...")
20
  self.__images__(
21
  filename if not binary else binary,
 
25
  callback
26
  )
27
  callback(msg="OCR finished.")
28
+ #for bb in self.boxes:
29
+ # for b in bb:
30
+ # print(b)
31
+ print("OCR:", timer()-start)
32
+
33
+ def get_position(bx):
34
+ poss = []
35
+ pn = bx["page_number"]
36
+ top = bx["top"] - self.page_cum_height[pn - 1]
37
+ bott = bx["bottom"] - self.page_cum_height[pn - 1]
38
+ poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn-1].size[1]/zoomin)))
39
+ while bott * zoomin > self.page_images[pn - 1].size[1]:
40
+ bott -= self.page_images[pn- 1].size[1] / zoomin
41
+ top = 0
42
+ pn += 1
43
+ poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / zoomin)))
44
+ return poss
45
+
46
+ def tag(pn, left, right, top, bottom):
47
+ return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
48
+ .format(pn, left, right, top, bottom)
49
 
 
 
50
  self._layouts_rec(zoomin)
51
  callback(0.65, "Layout analysis finished.")
52
  print("paddle layouts:", timer() - start)
53
  self._table_transformer_job(zoomin)
54
  callback(0.67, "Table analysis finished.")
55
  self._text_merge()
56
+ tbls = self._extract_table_figure(True, zoomin, True, True)
57
+ self._naive_vertical_merge()
58
  self._filter_forpages()
59
  callback(0.68, "Text merging finished")
 
60
 
61
  # clean mess
62
  for b in self.boxes:
 
65
  # merge chunks with the same bullets
66
  self._merge_with_same_bullet()
67
 
68
+ # set pivot using the most frequent type of title,
69
+ # then merge between 2 pivot
70
+ bull = bullets_category([b["text"] for b in self.boxes])
71
+ most_level, levels = title_frequency(bull, [(b["text"], b.get("layout_no","")) for b in self.boxes])
72
+ assert len(self.boxes) == len(levels)
73
+ sec_ids = []
74
+ sid = 0
75
+ for i, lvl in enumerate(levels):
76
+ if lvl <= most_level: sid += 1
77
+ sec_ids.append(sid)
78
+ #print(lvl, self.boxes[i]["text"], most_level)
79
+
80
+ sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
81
+ for (img, rows), poss in tbls:
82
+ sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
83
+
84
+ chunks = []
85
+ last_sid = -2
86
+ for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
87
+ poss = "\t".join([tag(*pos) for pos in poss])
88
+ if sec_id == last_sid or sec_id == -1:
89
+ if chunks:
90
+ chunks[-1] += "\n" + txt + poss
91
+ continue
92
+ chunks.append(txt + poss)
93
+ if sec_id >-1: last_sid = sec_id
94
+ return chunks
95
 
96
 
97
  def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
 
102
 
103
  if re.search(r"\.pdf$", filename, re.IGNORECASE):
104
  pdf_parser = Pdf()
105
+ cks = pdf_parser(filename if not binary else binary,
106
  from_page=from_page, to_page=to_page, callback=callback)
107
  else: raise NotImplementedError("file type not supported yet(pdf supported)")
108
  doc = {
 
113
  # is it English
114
  eng = lang.lower() == "english"#pdf_parser.is_english
115
 
 
 
116
  i = 0
117
  chunk = []
118
  tk_cnt = 0
119
+ res = []
120
  def add_chunk():
121
  nonlocal chunk, res, doc, pdf_parser, tk_cnt
122
  d = copy.deepcopy(doc)
123
  ck = "\n".join(chunk)
124
+ tokenize(d, pdf_parser.remove_tag(ck), eng)
125
  d["image"], poss = pdf_parser.crop(ck, need_position=True)
126
  add_positions(d, poss)
127
  res.append(d)
 
129
  tk_cnt = 0
130
 
131
  while i < len(cks):
132
+ if tk_cnt > 256: add_chunk()
133
  txt = cks[i]
134
  txt_ = pdf_parser.remove_tag(txt)
135
  i += 1
 
137
  chunk.append(txt)
138
  tk_cnt += cnt
139
  if chunk: add_chunk()
140
+
141
  for i, d in enumerate(res):
142
  print(d)
143
  # d["image"].save(f"./logs/{i}.jpg")
 
146
 
147
  if __name__ == "__main__":
148
  import sys
149
+ def dummy(prog=None, msg=""):
150
  pass
151
  chunk(sys.argv[1], callback=dummy)
rag/app/naive.py CHANGED
@@ -100,7 +100,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
100
  print("--", ck)
101
  d = copy.deepcopy(doc)
102
  if pdf_parser:
103
- d["image"], poss = pdf_parser.crop(ck, need_position=True)
 
 
 
104
  add_positions(d, poss)
105
  ck = pdf_parser.remove_tag(ck)
106
  tokenize(d, ck, eng)
 
100
  print("--", ck)
101
  d = copy.deepcopy(doc)
102
  if pdf_parser:
103
+ try:
104
+ d["image"], poss = pdf_parser.crop(ck, need_position=True)
105
+ except Exception as e:
106
+ continue
107
  add_positions(d, poss)
108
  ck = pdf_parser.remove_tag(ck)
109
  tokenize(d, ck, eng)
rag/nlp/__init__.py CHANGED
@@ -1,4 +1,6 @@
1
  import random
 
 
2
  from rag.utils import num_tokens_from_string
3
  from . import huqie
4
  from nltk import word_tokenize
@@ -175,6 +177,36 @@ def make_colon_as_title(sections):
175
  i += 1
176
 
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def hierarchical_merge(bull, sections, depth):
179
  if not sections or bull < 0:
180
  return []
@@ -185,12 +217,6 @@ def hierarchical_merge(bull, sections, depth):
185
  bullets_size = len(BULLET_PATTERN[bull])
186
  levels = [[] for _ in range(bullets_size + 2)]
187
 
188
- def not_title(txt):
189
- if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
190
- return False
191
- if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
192
- return True
193
- return re.search(r"[,;,。;!!]", txt)
194
 
195
  for i, (txt, layout) in enumerate(sections):
196
  for j, p in enumerate(BULLET_PATTERN[bull]):
 
1
  import random
2
+ from collections import Counter
3
+
4
  from rag.utils import num_tokens_from_string
5
  from . import huqie
6
  from nltk import word_tokenize
 
177
  i += 1
178
 
179
 
180
+ def title_frequency(bull, sections):
181
+ bullets_size = len(BULLET_PATTERN[bull])
182
+ levels = [bullets_size+1 for _ in range(len(sections))]
183
+ if not sections or bull < 0:
184
+ return bullets_size+1, levels
185
+
186
+ for i, (txt, layout) in enumerate(sections):
187
+ for j, p in enumerate(BULLET_PATTERN[bull]):
188
+ if re.match(p, txt.strip()):
189
+ levels[i] = j
190
+ break
191
+ else:
192
+ if re.search(r"(title|head)", layout) and not not_title(txt.split("@")[0]):
193
+ levels[i] = bullets_size
194
+ most_level = bullets_size+1
195
+ for l, c in sorted(Counter(levels).items(), key=lambda x:x[1]*-1):
196
+ if l <= bullets_size:
197
+ most_level = l
198
+ break
199
+ return most_level, levels
200
+
201
+
202
+ def not_title(txt):
203
+ if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
204
+ return False
205
+ if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
206
+ return True
207
+ return re.search(r"[,;,。;!!]", txt)
208
+
209
+
210
  def hierarchical_merge(bull, sections, depth):
211
  if not sections or bull < 0:
212
  return []
 
217
  bullets_size = len(BULLET_PATTERN[bull])
218
  levels = [[] for _ in range(bullets_size + 2)]
219
 
 
 
 
 
 
 
220
 
221
  for i, (txt, layout) in enumerate(sections):
222
  for j, p in enumerate(BULLET_PATTERN[bull]):
rag/nlp/query.py CHANGED
@@ -38,7 +38,7 @@ class EsQueryer:
38
  "",
39
  txt)
40
  return re.sub(
41
- r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was)*", "", txt, re.IGNORECASE)
42
 
43
  def question(self, txt, tbl="qa", min_match="60%"):
44
  txt = re.sub(
@@ -50,16 +50,16 @@ class EsQueryer:
50
  txt = EsQueryer.rmWWW(txt)
51
 
52
  if not self.isChinese(txt):
53
- tks = txt.split(" ")
54
- q = []
55
  for i in range(1, len(tks)):
56
- q.append("\"%s %s\"~2" % (tks[i - 1], tks[i]))
57
  if not q:
58
  q.append(txt)
59
  return Q("bool",
60
  must=Q("query_string", fields=self.flds,
61
  type="best_fields", query=" OR ".join(q),
62
- boost=1, minimum_should_match="60%")
63
  ), txt.split(" ")
64
 
65
  def needQieqie(tk):
@@ -147,7 +147,7 @@ class EsQueryer:
147
  atks = toDict(atks)
148
  btkss = [toDict(tks) for tks in btkss]
149
  tksim = [self.similarity(atks, btks) for btks in btkss]
150
- return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, sims[0], tksim
151
 
152
  def similarity(self, qtwt, dtwt):
153
  if isinstance(dtwt, type("")):
 
38
  "",
39
  txt)
40
  return re.sub(
41
+ r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE)
42
 
43
  def question(self, txt, tbl="qa", min_match="60%"):
44
  txt = re.sub(
 
50
  txt = EsQueryer.rmWWW(txt)
51
 
52
  if not self.isChinese(txt):
53
+ tks = [t for t in txt.split(" ") if t.strip()]
54
+ q = tks
55
  for i in range(1, len(tks)):
56
+ q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
57
  if not q:
58
  q.append(txt)
59
  return Q("bool",
60
  must=Q("query_string", fields=self.flds,
61
  type="best_fields", query=" OR ".join(q),
62
+ boost=1, minimum_should_match=min_match)
63
  ), txt.split(" ")
64
 
65
  def needQieqie(tk):
 
147
  atks = toDict(atks)
148
  btkss = [toDict(tks) for tks in btkss]
149
  tksim = [self.similarity(atks, btks) for btks in btkss]
150
+ return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
151
 
152
  def similarity(self, qtwt, dtwt):
153
  if isinstance(dtwt, type("")):
rag/nlp/search.py CHANGED
@@ -119,6 +119,7 @@ class Dealer:
119
  s["knn"]["filter"] = bqry.to_dict()
120
  s["knn"]["similarity"] = 0.17
121
  res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
 
122
 
123
  kwds = set([])
124
  for k in keywords:
 
119
  s["knn"]["filter"] = bqry.to_dict()
120
  s["knn"]["similarity"] = 0.17
121
  res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
122
+ es_logger.info("【Q】: {}".format(json.dumps(s)))
123
 
124
  kwds = set([])
125
  for k in keywords: