Kevin Hu committed on
Commit
77dc93a
·
1 Parent(s): d2db126

fix mind map bug (#1934)

Browse files

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

api/apps/document_app.py CHANGED
@@ -452,7 +452,7 @@ def get_image(image_id):
452
  @login_required
453
  @validate_request("conversation_id")
454
  def upload_and_parse():
455
- req = request.json
456
  if 'file' not in request.files:
457
  return get_json_result(
458
  data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
@@ -463,7 +463,7 @@ def upload_and_parse():
463
  return get_json_result(
464
  data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
465
 
466
- e, conv = ConversationService.get_by_id(req["conversation_id"])
467
  if not e:
468
  return get_data_error_result(retmsg="Conversation not found!")
469
  e, dia = DialogService.get_by_id(conv.dialog_id)
@@ -487,6 +487,12 @@ def upload_and_parse():
487
  def dummy(prog=None, msg=""):
488
  pass
489
 
 
 
 
 
 
 
490
  parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
491
  exe = ThreadPoolExecutor(max_workers=12)
492
  threads = []
@@ -497,7 +503,7 @@ def upload_and_parse():
497
  "from_page": 0,
498
  "to_page": 100000
499
  }
500
- threads.append(exe.submit(naive.chunk, d["name"], blob, **kwargs))
501
 
502
  for (docinfo,_), th in zip(files, threads):
503
  docs = []
@@ -550,7 +556,7 @@ def upload_and_parse():
550
  for doc_id in docids:
551
  cks = [c for c in docs if c["doc_id"] == doc_id]
552
 
553
- if parser_ids[doc_id] != ParserType.PICTURE.value:
554
  mindmap = MindMapExtractor(llm_bdl)
555
  try:
556
  mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output, ensure_ascii=False, indent=2)
@@ -564,7 +570,7 @@ def upload_and_parse():
564
  except Exception as e:
565
  stat_logger.error("Mind map generation error:", traceback.format_exc())
566
 
567
- vects = embedding(doc_id, cks)
568
  assert len(cks) == len(vects)
569
  for i, d in enumerate(cks):
570
  v = vects[i]
@@ -575,4 +581,4 @@ def upload_and_parse():
575
  DocumentService.increment_chunk_num(
576
  doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
577
 
578
- return get_json_result(data=[d["id"] for d in files])
 
452
  @login_required
453
  @validate_request("conversation_id")
454
  def upload_and_parse():
455
+ from rag.app import presentation, picture, naive, audio, email
456
  if 'file' not in request.files:
457
  return get_json_result(
458
  data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
 
463
  return get_json_result(
464
  data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
465
 
466
+ e, conv = ConversationService.get_by_id(request.form.get("conversation_id"))
467
  if not e:
468
  return get_data_error_result(retmsg="Conversation not found!")
469
  e, dia = DialogService.get_by_id(conv.dialog_id)
 
487
  def dummy(prog=None, msg=""):
488
  pass
489
 
490
+ FACTORY = {
491
+ ParserType.PRESENTATION.value: presentation,
492
+ ParserType.PICTURE.value: picture,
493
+ ParserType.AUDIO.value: audio,
494
+ ParserType.EMAIL.value: email
495
+ }
496
  parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
497
  exe = ThreadPoolExecutor(max_workers=12)
498
  threads = []
 
503
  "from_page": 0,
504
  "to_page": 100000
505
  }
506
+ threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))
507
 
508
  for (docinfo,_), th in zip(files, threads):
509
  docs = []
 
556
  for doc_id in docids:
557
  cks = [c for c in docs if c["doc_id"] == doc_id]
558
 
559
+ if False and parser_ids[doc_id] != ParserType.PICTURE.value:
560
  mindmap = MindMapExtractor(llm_bdl)
561
  try:
562
  mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output, ensure_ascii=False, indent=2)
 
570
  except Exception as e:
571
  stat_logger.error("Mind map generation error:", traceback.format_exc())
572
 
573
+ vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
574
  assert len(cks) == len(vects)
575
  for i, d in enumerate(cks):
576
  v = vects[i]
 
581
  DocumentService.increment_chunk_num(
582
  doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
583
 
584
+ return get_json_result(data=[d["id"] for d,_ in files])
api/db/services/api_service.py CHANGED
@@ -46,6 +46,7 @@ class API4ConversationService(CommonService):
46
  @classmethod
47
  @DB.connection_context()
48
  def stats(cls, tenant_id, from_date, to_date, source=None):
 
49
  return cls.model.select(
50
  cls.model.create_date.truncate("day").alias("dt"),
51
  peewee.fn.COUNT(
 
46
  @classmethod
47
  @DB.connection_context()
48
  def stats(cls, tenant_id, from_date, to_date, source=None):
49
+ if len(to_date) == 10: to_date += " 23:59:59"
50
  return cls.model.select(
51
  cls.model.create_date.truncate("day").alias("dt"),
52
  peewee.fn.COUNT(
graphrag/mind_map_extractor.py CHANGED
@@ -113,7 +113,7 @@ class MindMapExtractor:
113
  "children": [{"id": self._key(k), "children": self._be_children(v, keyset)} for k, v in
114
  merge_json.items() if isinstance(v, dict) and self._key(k)]}
115
  else:
116
- k = self._key(list(self._be_children.keys())[0])
117
  merge_json = {"id": k, "children": self._be_children(list(merge_json.items())[0][1], set([k]))}
118
 
119
  except Exception as e:
 
113
  "children": [{"id": self._key(k), "children": self._be_children(v, keyset)} for k, v in
114
  merge_json.items() if isinstance(v, dict) and self._key(k)]}
115
  else:
116
+ k = self._key(list(merge_json.keys())[0])
117
  merge_json = {"id": k, "children": self._be_children(list(merge_json.items())[0][1], set([k]))}
118
 
119
  except Exception as e:
rag/app/naive.py CHANGED
@@ -61,9 +61,8 @@ class Docx(DocxParser):
61
  if pn > to_page:
62
  break
63
  if from_page <= pn < to_page:
64
- current_image = None
65
  if p.text.strip():
66
- if p.style.name == 'Caption':
67
  former_image = None
68
  if lines and lines[-1][1] and lines[-1][2] != 'Caption':
69
  former_image = lines[-1][1].pop()
 
61
  if pn > to_page:
62
  break
63
  if from_page <= pn < to_page:
 
64
  if p.text.strip():
65
+ if p.style and p.style.name == 'Caption':
66
  former_image = None
67
  if lines and lines[-1][1] and lines[-1][2] != 'Caption':
68
  former_image = lines[-1][1].pop()