Kevin Hu
committed on
Commit
·
642b6f3
1
Parent(s):
d3e6ea3
Add doc meta data. (#4442)
Browse files
### What problem does this PR solve?
#3690
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/chunk_app.py +7 -6
- api/apps/conversation_app.py +3 -3
- api/apps/document_app.py +32 -0
- api/db/db_models.py +8 -0
- api/db/services/dialog_service.py +6 -4
api/apps/chunk_app.py
CHANGED
@@ -116,8 +116,7 @@ def get():
|
|
116 |
|
117 |
@manager.route('/set', methods=['POST']) # noqa: F821
|
118 |
@login_required
|
119 |
-
@validate_request("doc_id", "chunk_id", "content_with_weight"
|
120 |
-
"important_kwd", "question_kwd")
|
121 |
def set():
|
122 |
req = request.json
|
123 |
d = {
|
@@ -125,14 +124,16 @@ def set():
|
|
125 |
"content_with_weight": req["content_with_weight"]}
|
126 |
d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
|
127 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
128 |
-
if
|
129 |
d["important_kwd"] = req["important_kwd"]
|
130 |
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
|
131 |
-
if
|
132 |
d["question_kwd"] = req["question_kwd"]
|
133 |
d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
|
134 |
-
if
|
135 |
d["tag_kwd"] = req["tag_kwd"]
|
|
|
|
|
136 |
if "available_int" in req:
|
137 |
d["available_int"] = req["available_int"]
|
138 |
|
@@ -157,7 +158,7 @@ def set():
|
|
157 |
d = beAdoc(d, arr[0], arr[1], not any(
|
158 |
[rag_tokenizer.is_chinese(t) for t in q + a]))
|
159 |
|
160 |
-
v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d
|
161 |
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
162 |
d["q_%d_vec" % len(v)] = v.tolist()
|
163 |
settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
|
|
|
116 |
|
117 |
@manager.route('/set', methods=['POST']) # noqa: F821
|
118 |
@login_required
|
119 |
+
@validate_request("doc_id", "chunk_id", "content_with_weight")
|
|
|
120 |
def set():
|
121 |
req = request.json
|
122 |
d = {
|
|
|
124 |
"content_with_weight": req["content_with_weight"]}
|
125 |
d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
|
126 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
127 |
+
if "important_kwd" in req:
|
128 |
d["important_kwd"] = req["important_kwd"]
|
129 |
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
|
130 |
+
if "question_kwd" in req:
|
131 |
d["question_kwd"] = req["question_kwd"]
|
132 |
d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
|
133 |
+
if "tag_kwd" in req:
|
134 |
d["tag_kwd"] = req["tag_kwd"]
|
135 |
+
if "tag_feas" in req:
|
136 |
+
d["tag_feas"] = req["tag_feas"]
|
137 |
if "available_int" in req:
|
138 |
d["available_int"] = req["available_int"]
|
139 |
|
|
|
158 |
d = beAdoc(d, arr[0], arr[1], not any(
|
159 |
[rag_tokenizer.is_chinese(t) for t in q + a]))
|
160 |
|
161 |
+
v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
|
162 |
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
163 |
d["q_%d_vec" % len(v)] = v.tolist()
|
164 |
settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
|
api/apps/conversation_app.py
CHANGED
@@ -27,12 +27,13 @@ from flask_login import login_required, current_user
|
|
27 |
from api.db import LLMType
|
28 |
from api.db.services.dialog_service import DialogService, chat, ask, label_question
|
29 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
30 |
-
from api.db.services.llm_service import LLMBundle, TenantService
|
31 |
from api import settings
|
32 |
from api.utils.api_utils import get_json_result
|
33 |
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
34 |
from graphrag.mind_map_extractor import MindMapExtractor
|
35 |
|
|
|
36 |
@manager.route('/set', methods=['POST']) # noqa: F821
|
37 |
@login_required
|
38 |
def set_conversation():
|
@@ -376,8 +377,7 @@ def mindmap():
|
|
376 |
if not e:
|
377 |
return get_data_error_result(message="Knowledgebase not found!")
|
378 |
|
379 |
-
embd_mdl =
|
380 |
-
kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
|
381 |
chat_mdl = LLMBundle(current_user.id, LLMType.CHAT)
|
382 |
question = req["question"]
|
383 |
ranks = settings.retrievaler.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, 1, 12,
|
|
|
27 |
from api.db import LLMType
|
28 |
from api.db.services.dialog_service import DialogService, chat, ask, label_question
|
29 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
30 |
+
from api.db.services.llm_service import LLMBundle, TenantService
|
31 |
from api import settings
|
32 |
from api.utils.api_utils import get_json_result
|
33 |
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
34 |
from graphrag.mind_map_extractor import MindMapExtractor
|
35 |
|
36 |
+
|
37 |
@manager.route('/set', methods=['POST']) # noqa: F821
|
38 |
@login_required
|
39 |
def set_conversation():
|
|
|
377 |
if not e:
|
378 |
return get_data_error_result(message="Knowledgebase not found!")
|
379 |
|
380 |
+
embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id)
|
|
|
381 |
chat_mdl = LLMBundle(current_user.id, LLMType.CHAT)
|
382 |
question = req["question"]
|
383 |
ranks = settings.retrievaler.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, 1, 12,
|
api/apps/document_app.py
CHANGED
@@ -13,6 +13,7 @@
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License
|
15 |
#
|
|
|
16 |
import os.path
|
17 |
import pathlib
|
18 |
import re
|
@@ -593,3 +594,34 @@ def parse():
|
|
593 |
txt = FileService.parse_docs(file_objs, current_user.id)
|
594 |
|
595 |
return get_json_result(data=txt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License
|
15 |
#
|
16 |
+
import json
|
17 |
import os.path
|
18 |
import pathlib
|
19 |
import re
|
|
|
594 |
txt = FileService.parse_docs(file_objs, current_user.id)
|
595 |
|
596 |
return get_json_result(data=txt)
|
597 |
+
|
598 |
+
|
599 |
+
@manager.route('/set_meta', methods=['POST']) # noqa: F821
|
600 |
+
@login_required
|
601 |
+
@validate_request("doc_id", "meta")
|
602 |
+
def set_meta():
|
603 |
+
req = request.json
|
604 |
+
if not DocumentService.accessible(req["doc_id"], current_user.id):
|
605 |
+
return get_json_result(
|
606 |
+
data=False,
|
607 |
+
message='No authorization.',
|
608 |
+
code=settings.RetCode.AUTHENTICATION_ERROR
|
609 |
+
)
|
610 |
+
try:
|
611 |
+
meta = json.loads(req["meta"])
|
612 |
+
except Exception as e:
|
613 |
+
return get_json_result(
|
614 |
+
data=False, message=f'Json syntax error: {e}', code=settings.RetCode.ARGUMENT_ERROR)
|
615 |
+
try:
|
616 |
+
e, doc = DocumentService.get_by_id(req["doc_id"])
|
617 |
+
if not e:
|
618 |
+
return get_data_error_result(message="Document not found!")
|
619 |
+
|
620 |
+
if not DocumentService.update_by_id(
|
621 |
+
req["doc_id"], {"meta_fields": meta}):
|
622 |
+
return get_data_error_result(
|
623 |
+
message="Database error (meta updates)!")
|
624 |
+
|
625 |
+
return get_json_result(data=True)
|
626 |
+
except Exception as e:
|
627 |
+
return server_error_response(e)
|
api/db/db_models.py
CHANGED
@@ -760,6 +760,7 @@ class Document(DataBaseModel):
|
|
760 |
default="")
|
761 |
process_begin_at = DateTimeField(null=True, index=True)
|
762 |
process_duation = FloatField(default=0)
|
|
|
763 |
|
764 |
run = CharField(
|
765 |
max_length=1,
|
@@ -1112,3 +1113,10 @@ def migrate_db():
|
|
1112 |
)
|
1113 |
except Exception:
|
1114 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
760 |
default="")
|
761 |
process_begin_at = DateTimeField(null=True, index=True)
|
762 |
process_duation = FloatField(default=0)
|
763 |
+
meta_fields = JSONField(null=True, default={})
|
764 |
|
765 |
run = CharField(
|
766 |
max_length=1,
|
|
|
1113 |
)
|
1114 |
except Exception:
|
1115 |
pass
|
1116 |
+
try:
|
1117 |
+
migrate(
|
1118 |
+
migrator.add_column("document", "meta_fields",
|
1119 |
+
JSONField(null=True, default={}))
|
1120 |
+
)
|
1121 |
+
except Exception:
|
1122 |
+
pass
|
api/db/services/dialog_service.py
CHANGED
@@ -122,15 +122,17 @@ def kb_prompt(kbinfos, max_tokens):
|
|
122 |
knowledges = knowledges[:i]
|
123 |
break
|
124 |
|
|
|
|
|
|
|
125 |
doc2chunks = defaultdict(list)
|
126 |
-
for
|
127 |
-
if i >= chunks_num:
|
128 |
-
break
|
129 |
doc2chunks[ck["docnm_kwd"]].append(ck["content_with_weight"])
|
130 |
|
131 |
knowledges = []
|
132 |
for nm, chunks in doc2chunks.items():
|
133 |
-
txt = f"Document: {nm} \
|
|
|
134 |
for i, chunk in enumerate(chunks, 1):
|
135 |
txt += f"{i}. {chunk}\n"
|
136 |
knowledges.append(txt)
|
|
|
122 |
knowledges = knowledges[:i]
|
123 |
break
|
124 |
|
125 |
+
#docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]])
|
126 |
+
#docs = {d.id: d.meta_fields for d in docs}
|
127 |
+
|
128 |
doc2chunks = defaultdict(list)
|
129 |
+
for ck in kbinfos["chunks"][:chunks_num]:
|
|
|
|
|
130 |
doc2chunks[ck["docnm_kwd"]].append(ck["content_with_weight"])
|
131 |
|
132 |
knowledges = []
|
133 |
for nm, chunks in doc2chunks.items():
|
134 |
+
txt = f"Document: {nm} \n"
|
135 |
+
txt += "Contains the following relevant fragments:\n"
|
136 |
for i, chunk in enumerate(chunks, 1):
|
137 |
txt += f"{i}. {chunk}\n"
|
138 |
knowledges.append(txt)
|