Kevin Hu committed on
Commit
642b6f3
·
1 Parent(s): d3e6ea3

Add doc metadata. (#4442)

Browse files

### What problem does this PR solve?

#3690

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/apps/chunk_app.py CHANGED
@@ -116,8 +116,7 @@ def get():
116
 
117
  @manager.route('/set', methods=['POST']) # noqa: F821
118
  @login_required
119
- @validate_request("doc_id", "chunk_id", "content_with_weight",
120
- "important_kwd", "question_kwd")
121
  def set():
122
  req = request.json
123
  d = {
@@ -125,14 +124,16 @@ def set():
125
  "content_with_weight": req["content_with_weight"]}
126
  d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
127
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
128
- if req.get("important_kwd"):
129
  d["important_kwd"] = req["important_kwd"]
130
  d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
131
- if req.get("question_kwd"):
132
  d["question_kwd"] = req["question_kwd"]
133
  d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
134
- if req.get("tag_kwd"):
135
  d["tag_kwd"] = req["tag_kwd"]
 
 
136
  if "available_int" in req:
137
  d["available_int"] = req["available_int"]
138
 
@@ -157,7 +158,7 @@ def set():
157
  d = beAdoc(d, arr[0], arr[1], not any(
158
  [rag_tokenizer.is_chinese(t) for t in q + a]))
159
 
160
- v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
161
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
162
  d["q_%d_vec" % len(v)] = v.tolist()
163
  settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
 
116
 
117
  @manager.route('/set', methods=['POST']) # noqa: F821
118
  @login_required
119
+ @validate_request("doc_id", "chunk_id", "content_with_weight")
 
120
  def set():
121
  req = request.json
122
  d = {
 
124
  "content_with_weight": req["content_with_weight"]}
125
  d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
126
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
127
+ if "important_kwd" in req:
128
  d["important_kwd"] = req["important_kwd"]
129
  d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
130
+ if "question_kwd" in req:
131
  d["question_kwd"] = req["question_kwd"]
132
  d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
133
+ if "tag_kwd" in req:
134
  d["tag_kwd"] = req["tag_kwd"]
135
+ if "tag_feas" in req:
136
+ d["tag_feas"] = req["tag_feas"]
137
  if "available_int" in req:
138
  d["available_int"] = req["available_int"]
139
 
 
158
  d = beAdoc(d, arr[0], arr[1], not any(
159
  [rag_tokenizer.is_chinese(t) for t in q + a]))
160
 
161
+ v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
162
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
163
  d["q_%d_vec" % len(v)] = v.tolist()
164
  settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
api/apps/conversation_app.py CHANGED
@@ -27,12 +27,13 @@ from flask_login import login_required, current_user
27
  from api.db import LLMType
28
  from api.db.services.dialog_service import DialogService, chat, ask, label_question
29
  from api.db.services.knowledgebase_service import KnowledgebaseService
30
- from api.db.services.llm_service import LLMBundle, TenantService, TenantLLMService
31
  from api import settings
32
  from api.utils.api_utils import get_json_result
33
  from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
34
  from graphrag.mind_map_extractor import MindMapExtractor
35
 
 
36
  @manager.route('/set', methods=['POST']) # noqa: F821
37
  @login_required
38
  def set_conversation():
@@ -376,8 +377,7 @@ def mindmap():
376
  if not e:
377
  return get_data_error_result(message="Knowledgebase not found!")
378
 
379
- embd_mdl = TenantLLMService.model_instance(
380
- kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
381
  chat_mdl = LLMBundle(current_user.id, LLMType.CHAT)
382
  question = req["question"]
383
  ranks = settings.retrievaler.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, 1, 12,
 
27
  from api.db import LLMType
28
  from api.db.services.dialog_service import DialogService, chat, ask, label_question
29
  from api.db.services.knowledgebase_service import KnowledgebaseService
30
+ from api.db.services.llm_service import LLMBundle, TenantService
31
  from api import settings
32
  from api.utils.api_utils import get_json_result
33
  from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
34
  from graphrag.mind_map_extractor import MindMapExtractor
35
 
36
+
37
  @manager.route('/set', methods=['POST']) # noqa: F821
38
  @login_required
39
  def set_conversation():
 
377
  if not e:
378
  return get_data_error_result(message="Knowledgebase not found!")
379
 
380
+ embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id)
 
381
  chat_mdl = LLMBundle(current_user.id, LLMType.CHAT)
382
  question = req["question"]
383
  ranks = settings.retrievaler.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, 1, 12,
api/apps/document_app.py CHANGED
@@ -13,6 +13,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License
15
  #
 
16
  import os.path
17
  import pathlib
18
  import re
@@ -593,3 +594,34 @@ def parse():
593
  txt = FileService.parse_docs(file_objs, current_user.id)
594
 
595
  return get_json_result(data=txt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License
15
  #
16
+ import json
17
  import os.path
18
  import pathlib
19
  import re
 
594
  txt = FileService.parse_docs(file_objs, current_user.id)
595
 
596
  return get_json_result(data=txt)
597
+
598
+
599
+ @manager.route('/set_meta', methods=['POST']) # noqa: F821
600
+ @login_required
601
+ @validate_request("doc_id", "meta")
602
+ def set_meta():
603
+ req = request.json
604
+ if not DocumentService.accessible(req["doc_id"], current_user.id):
605
+ return get_json_result(
606
+ data=False,
607
+ message='No authorization.',
608
+ code=settings.RetCode.AUTHENTICATION_ERROR
609
+ )
610
+ try:
611
+ meta = json.loads(req["meta"])
612
+ except Exception as e:
613
+ return get_json_result(
614
+ data=False, message=f'Json syntax error: {e}', code=settings.RetCode.ARGUMENT_ERROR)
615
+ try:
616
+ e, doc = DocumentService.get_by_id(req["doc_id"])
617
+ if not e:
618
+ return get_data_error_result(message="Document not found!")
619
+
620
+ if not DocumentService.update_by_id(
621
+ req["doc_id"], {"meta_fields": meta}):
622
+ return get_data_error_result(
623
+ message="Database error (meta updates)!")
624
+
625
+ return get_json_result(data=True)
626
+ except Exception as e:
627
+ return server_error_response(e)
api/db/db_models.py CHANGED
@@ -760,6 +760,7 @@ class Document(DataBaseModel):
760
  default="")
761
  process_begin_at = DateTimeField(null=True, index=True)
762
  process_duation = FloatField(default=0)
 
763
 
764
  run = CharField(
765
  max_length=1,
@@ -1112,3 +1113,10 @@ def migrate_db():
1112
  )
1113
  except Exception:
1114
  pass
 
 
 
 
 
 
 
 
760
  default="")
761
  process_begin_at = DateTimeField(null=True, index=True)
762
  process_duation = FloatField(default=0)
763
+ meta_fields = JSONField(null=True, default={})
764
 
765
  run = CharField(
766
  max_length=1,
 
1113
  )
1114
  except Exception:
1115
  pass
1116
+ try:
1117
+ migrate(
1118
+ migrator.add_column("document", "meta_fields",
1119
+ JSONField(null=True, default={}))
1120
+ )
1121
+ except Exception:
1122
+ pass
api/db/services/dialog_service.py CHANGED
@@ -122,15 +122,17 @@ def kb_prompt(kbinfos, max_tokens):
122
  knowledges = knowledges[:i]
123
  break
124
 
 
 
 
125
  doc2chunks = defaultdict(list)
126
- for i, ck in enumerate(kbinfos["chunks"]):
127
- if i >= chunks_num:
128
- break
129
  doc2chunks[ck["docnm_kwd"]].append(ck["content_with_weight"])
130
 
131
  knowledges = []
132
  for nm, chunks in doc2chunks.items():
133
- txt = f"Document: {nm} \nContains the following relevant fragments:\n"
 
134
  for i, chunk in enumerate(chunks, 1):
135
  txt += f"{i}. {chunk}\n"
136
  knowledges.append(txt)
 
122
  knowledges = knowledges[:i]
123
  break
124
 
125
+ #docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]])
126
+ #docs = {d.id: d.meta_fields for d in docs}
127
+
128
  doc2chunks = defaultdict(list)
129
+ for ck in kbinfos["chunks"][:chunks_num]:
 
 
130
  doc2chunks[ck["docnm_kwd"]].append(ck["content_with_weight"])
131
 
132
  knowledges = []
133
  for nm, chunks in doc2chunks.items():
134
+ txt = f"Document: {nm} \n"
135
+ txt += "Contains the following relevant fragments:\n"
136
  for i, chunk in enumerate(chunks, 1):
137
  txt += f"{i}. {chunk}\n"
138
  knowledges.append(txt)