Kevin Hu committed on
Commit
44731b3
·
1 Parent(s): 8efa7c5

add auto keywords and auto-question (#2965)

Browse files

### What problem does this PR solve?

#2687

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/apps/api_app.py CHANGED
@@ -25,7 +25,7 @@ from api.db import FileType, LLMType, ParserType, FileSource
25
  from api.db.db_models import APIToken, Task, File
26
  from api.db.services import duplicate_name
27
  from api.db.services.api_service import APITokenService, API4ConversationService
28
- from api.db.services.dialog_service import DialogService, chat
29
  from api.db.services.document_service import DocumentService, doc_upload_and_parse
30
  from api.db.services.file2document_service import File2DocumentService
31
  from api.db.services.file_service import FileService
@@ -38,7 +38,6 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
38
  generate_confirmation_token
39
 
40
  from api.utils.file_utils import filename_type, thumbnail
41
- from rag.nlp import keyword_extraction
42
  from rag.utils.storage_factory import STORAGE_IMPL
43
 
44
  from api.db.services.canvas_service import UserCanvasService
 
25
  from api.db.db_models import APIToken, Task, File
26
  from api.db.services import duplicate_name
27
  from api.db.services.api_service import APITokenService, API4ConversationService
28
+ from api.db.services.dialog_service import DialogService, chat, keyword_extraction
29
  from api.db.services.document_service import DocumentService, doc_upload_and_parse
30
  from api.db.services.file2document_service import File2DocumentService
31
  from api.db.services.file_service import FileService
 
38
  generate_confirmation_token
39
 
40
  from api.utils.file_utils import filename_type, thumbnail
 
41
  from rag.utils.storage_factory import STORAGE_IMPL
42
 
43
  from api.db.services.canvas_service import UserCanvasService
api/apps/chunk_app.py CHANGED
@@ -21,8 +21,9 @@ from flask import request
21
  from flask_login import login_required, current_user
22
  from elasticsearch_dsl import Q
23
 
 
24
  from rag.app.qa import rmPrefix, beAdoc
25
- from rag.nlp import search, rag_tokenizer, keyword_extraction
26
  from rag.utils.es_conn import ELASTICSEARCH
27
  from rag.utils import rmSpace
28
  from api.db import LLMType, ParserType
 
21
  from flask_login import login_required, current_user
22
  from elasticsearch_dsl import Q
23
 
24
+ from api.db.services.dialog_service import keyword_extraction
25
  from rag.app.qa import rmPrefix, beAdoc
26
+ from rag.nlp import search, rag_tokenizer
27
  from rag.utils.es_conn import ELASTICSEARCH
28
  from rag.utils import rmSpace
29
  from api.db import LLMType, ParserType
api/apps/sdk/chat.py CHANGED
@@ -16,16 +16,15 @@
16
  from flask import request
17
 
18
  from api.db import StatusEnum
19
- from api.db.db_models import TenantLLM
20
  from api.db.services.dialog_service import DialogService
21
  from api.db.services.knowledgebase_service import KnowledgebaseService
22
- from api.db.services.llm_service import LLMService, TenantLLMService
23
  from api.db.services.user_service import TenantService
24
- from api.settings import RetCode
25
  from api.utils import get_uuid
26
  from api.utils.api_utils import get_error_data_result, token_required
27
  from api.utils.api_utils import get_result
28
 
 
29
  @manager.route('/chat', methods=['POST'])
30
  @token_required
31
  def create(tenant_id):
 
16
  from flask import request
17
 
18
  from api.db import StatusEnum
 
19
  from api.db.services.dialog_service import DialogService
20
  from api.db.services.knowledgebase_service import KnowledgebaseService
21
+ from api.db.services.llm_service import TenantLLMService
22
  from api.db.services.user_service import TenantService
 
23
  from api.utils import get_uuid
24
  from api.utils.api_utils import get_error_data_result, token_required
25
  from api.utils.api_utils import get_result
26
 
27
+
28
  @manager.route('/chat', methods=['POST'])
29
  @token_required
30
  def create(tenant_id):
api/apps/sdk/dify_retrieval.py CHANGED
@@ -1,10 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from flask import request, jsonify
2
 
3
- from db import LLMType, ParserType
4
- from db.services.knowledgebase_service import KnowledgebaseService
5
- from db.services.llm_service import LLMBundle
6
- from settings import retrievaler, kg_retrievaler, RetCode
7
- from utils.api_utils import validate_request, build_error_result, apikey_required
8
 
9
 
10
  @manager.route('/dify/retrieval', methods=['POST'])
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  from flask import request, jsonify
17
 
18
+ from api.db import LLMType, ParserType
19
+ from api.db.services.knowledgebase_service import KnowledgebaseService
20
+ from api.db.services.llm_service import LLMBundle
21
+ from api.settings import retrievaler, kg_retrievaler, RetCode
22
+ from api.utils.api_utils import validate_request, build_error_result, apikey_required
23
 
24
 
25
  @manager.route('/dify/retrieval', methods=['POST'])
api/apps/sdk/doc.py CHANGED
@@ -1,48 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pathlib
2
- import re
3
  import datetime
4
- import json
5
- import traceback
6
-
7
- from botocore.docs.method import document_model_driven_method
8
- from flask import request
9
- from flask_login import login_required, current_user
10
- from elasticsearch_dsl import Q
11
- from pygments import highlight
12
- from sphinx.addnodes import document
13
 
 
14
  from rag.app.qa import rmPrefix, beAdoc
15
- from rag.nlp import search, rag_tokenizer, keyword_extraction
16
- from rag.utils.es_conn import ELASTICSEARCH
17
- from rag.utils import rmSpace
18
  from api.db import LLMType, ParserType
19
- from api.db.services.knowledgebase_service import KnowledgebaseService
20
  from api.db.services.llm_service import TenantLLMService
21
- from api.db.services.user_service import UserTenantService
22
- from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
23
- from api.db.services.document_service import DocumentService
24
- from api.settings import RetCode, retrievaler, kg_retrievaler
25
- from api.utils.api_utils import get_result
26
  import hashlib
27
  import re
28
- from api.utils.api_utils import get_result, token_required, get_error_data_result
29
-
30
- from api.db.db_models import Task, File
31
-
32
  from api.db.services.task_service import TaskService, queue_tasks
33
- from api.db.services.user_service import TenantService, UserTenantService
34
-
35
- from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
36
-
37
- from api.utils.api_utils import get_result, get_result, get_error_data_result
38
-
39
- from functools import partial
40
  from io import BytesIO
41
-
42
  from elasticsearch_dsl import Q
43
  from flask import request, send_file
44
- from flask_login import login_required
45
-
46
  from api.db import FileSource, TaskStatus, FileType
47
  from api.db.db_models import File
48
  from api.db.services.document_service import DocumentService
@@ -50,8 +39,7 @@ from api.db.services.file2document_service import File2DocumentService
50
  from api.db.services.file_service import FileService
51
  from api.db.services.knowledgebase_service import KnowledgebaseService
52
  from api.settings import RetCode, retrievaler
53
- from api.utils.api_utils import construct_json_result, construct_error_response
54
- from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
55
  from rag.nlp import search
56
  from rag.utils import rmSpace
57
  from rag.utils.es_conn import ELASTICSEARCH
@@ -365,7 +353,6 @@ def list_chunks(tenant_id,dataset_id,document_id):
365
  return get_result(data=res)
366
 
367
 
368
-
369
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
370
  @token_required
371
  def create(tenant_id,dataset_id,document_id):
@@ -454,7 +441,6 @@ def rm_chunk(tenant_id,dataset_id,document_id):
454
  return get_result()
455
 
456
 
457
-
458
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
459
  @token_required
460
  def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
@@ -512,7 +498,6 @@ def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
512
  return get_result()
513
 
514
 
515
-
516
  @manager.route('/retrieval', methods=['POST'])
517
  @token_required
518
  def retrieval_test(tenant_id):
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  import pathlib
 
17
  import datetime
 
 
 
 
 
 
 
 
 
18
 
19
+ from api.db.services.dialog_service import keyword_extraction
20
  from rag.app.qa import rmPrefix, beAdoc
21
+ from rag.nlp import rag_tokenizer
 
 
22
  from api.db import LLMType, ParserType
 
23
  from api.db.services.llm_service import TenantLLMService
24
+ from api.settings import kg_retrievaler
 
 
 
 
25
  import hashlib
26
  import re
27
+ from api.utils.api_utils import token_required
28
+ from api.db.db_models import Task
 
 
29
  from api.db.services.task_service import TaskService, queue_tasks
30
+ from api.utils.api_utils import server_error_response
31
+ from api.utils.api_utils import get_result, get_error_data_result
 
 
 
 
 
32
  from io import BytesIO
 
33
  from elasticsearch_dsl import Q
34
  from flask import request, send_file
 
 
35
  from api.db import FileSource, TaskStatus, FileType
36
  from api.db.db_models import File
37
  from api.db.services.document_service import DocumentService
 
39
  from api.db.services.file_service import FileService
40
  from api.db.services.knowledgebase_service import KnowledgebaseService
41
  from api.settings import RetCode, retrievaler
42
+ from api.utils.api_utils import construct_json_result
 
43
  from rag.nlp import search
44
  from rag.utils import rmSpace
45
  from rag.utils.es_conn import ELASTICSEARCH
 
353
  return get_result(data=res)
354
 
355
 
 
356
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
357
  @token_required
358
  def create(tenant_id,dataset_id,document_id):
 
441
  return get_result()
442
 
443
 
 
444
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
445
  @token_required
446
  def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
 
498
  return get_result()
499
 
500
 
 
501
  @manager.route('/retrieval', methods=['POST'])
502
  @token_required
503
  def retrieval_test(tenant_id):
api/db/services/dialog_service.py CHANGED
@@ -28,7 +28,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
28
  from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
29
  from api.settings import chat_logger, retrievaler, kg_retrievaler
30
  from rag.app.resume import forbidden_select_fields4resume
31
- from rag.nlp import keyword_extraction
32
  from rag.nlp.search import index_name
33
  from rag.utils import rmSpace, num_tokens_from_string, encoder
34
  from api.utils.file_utils import get_project_base_directory
@@ -80,6 +79,7 @@ class ConversationService(CommonService):
80
 
81
  return list(sessions.dicts())
82
 
 
83
  def message_fit_in(msg, max_length=4000):
84
  def count():
85
  nonlocal msg
@@ -456,6 +456,58 @@ def rewrite(tenant_id, llm_id, question):
456
  return ans
457
 
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  def full_question(tenant_id, llm_id, messages):
460
  if llm_id2llm_type(llm_id) == "image2text":
461
  chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
 
28
  from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
29
  from api.settings import chat_logger, retrievaler, kg_retrievaler
30
  from rag.app.resume import forbidden_select_fields4resume
 
31
  from rag.nlp.search import index_name
32
  from rag.utils import rmSpace, num_tokens_from_string, encoder
33
  from api.utils.file_utils import get_project_base_directory
 
79
 
80
  return list(sessions.dicts())
81
 
82
+
83
  def message_fit_in(msg, max_length=4000):
84
  def count():
85
  nonlocal msg
 
456
  return ans
457
 
458
 
459
def _chat_with_prompt(chat_mdl, prompt):
    """Run a single system-prompt/user-turn exchange against *chat_mdl*.

    Shared driver for keyword_extraction and question_proposal: builds the
    two-message conversation, trims it to the model's context window, and
    normalizes the reply.

    Returns the model's answer text, or "" when the LLM layer reports an
    in-band "**ERROR**" marker.
    """
    msg = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": "Output: "}
    ]
    # Trim the conversation so it fits within the model's max context length.
    _, msg = message_fit_in(msg, chat_mdl.max_length)
    ans = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
    # Some model wrappers return (text, token_count); keep only the text.
    if isinstance(ans, tuple):
        ans = ans[0]
    # The LLM service signals failures in-band with an "**ERROR**" marker.
    if ans.find("**ERROR**") >= 0:
        return ""
    return ans


def keyword_extraction(chat_mdl, content, topn=3):
    """Extract the *topn* most important keywords/phrases from *content*.

    Returns a comma-delimited keyword string produced by the chat model,
    or "" on model error.
    """
    prompt = f"""
Role: You're a text analyzer.
Task: extract the most important keywords/phrases of a given piece of text content.
Requirements:
- Summarize the text content, and give top {topn} important keywords/phrases.
- The keywords MUST be in language of the given piece of text content.
- The keywords are delimited by ENGLISH COMMA.
- Keywords ONLY in output.

### Text Content
{content}

"""
    return _chat_with_prompt(chat_mdl, prompt)


def question_proposal(chat_mdl, content, topn=3):
    """Propose *topn* questions covering the main points of *content*.

    Returns one question per line as produced by the chat model,
    or "" on model error.
    """
    prompt = f"""
Role: You're a text analyzer.
Task: propose {topn} questions about a given piece of text content.
Requirements:
- Understand and summarize the text content, and propose top {topn} important questions.
- The questions SHOULD NOT have overlapping meanings.
- The questions SHOULD cover the main content of the text as much as possible.
- The questions MUST be in language of the given piece of text content.
- One question per line.
- Question ONLY in output.

### Text Content
{content}

"""
    return _chat_with_prompt(chat_mdl, prompt)
509
+
510
+
511
  def full_question(tenant_id, llm_id, messages):
512
  if llm_id2llm_type(llm_id) == "image2text":
513
  chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
rag/nlp/__init__.py CHANGED
@@ -570,14 +570,3 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
570
 
571
  return cks, images
572
 
573
-
574
- def keyword_extraction(chat_mdl, content):
575
- prompt = """
576
- You're a question analyzer.
577
- 1. Please give me the most important keyword/phrase of this question.
578
- Answer format: (in language of user's question)
579
- - keyword:
580
- """
581
- kwd = chat_mdl.chat(prompt, [{"role": "user", "content": content}], {"temperature": 0.2})
582
- if isinstance(kwd, tuple): return kwd[0]
583
- return kwd
 
570
 
571
  return cks, images
572
 
 
 
 
 
 
 
 
 
 
 
 
rag/svr/task_executor.py CHANGED
@@ -34,6 +34,7 @@ import pandas as pd
34
  from elasticsearch_dsl import Q
35
 
36
  from api.db import LLMType, ParserType
 
37
  from api.db.services.document_service import DocumentService
38
  from api.db.services.llm_service import LLMBundle
39
  from api.db.services.task_service import TaskService
@@ -198,6 +199,23 @@ def build(row):
198
  d["_id"] = md5.hexdigest()
199
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
200
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  if not d.get("image"):
202
  docs.append(d)
203
  continue
 
34
  from elasticsearch_dsl import Q
35
 
36
  from api.db import LLMType, ParserType
37
+ from api.db.services.dialog_service import keyword_extraction, question_proposal
38
  from api.db.services.document_service import DocumentService
39
  from api.db.services.llm_service import LLMBundle
40
  from api.db.services.task_service import TaskService
 
199
  d["_id"] = md5.hexdigest()
200
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
201
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
202
+
203
+ if row["parser_config"].get("auto_keywords", 0):
204
+ chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
205
+ d["important_kwd"] = keyword_extraction(chat_mdl, ck["content_with_weight"],
206
+ row["parser_config"]["auto_keywords"]).split(",")
207
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
208
+
209
+ if row["parser_config"].get("auto_questions", 0):
210
+ chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
211
+ qst = question_proposal(chat_mdl, ck["content_with_weight"], row["parser_config"]["auto_keywords"])
212
+ ck["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + ck["content_with_weight"]
213
+ qst = rag_tokenizer.tokenize(qst)
214
+ if "content_ltks" in ck:
215
+ ck["content_ltks"] += " " + qst
216
+ if "content_sm_ltks" in ck:
217
+ ck["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
218
+
219
  if not d.get("image"):
220
  docs.append(d)
221
  continue