Kevin Hu committed on
Commit
6054f54
·
1 Parent(s): 8d4e686

Add graphrag (#1793)

Browse files

### What problem does this PR solve?

#1594

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. {graph → agent}/README.md +0 -0
  2. {graph → agent}/README_zh.md +0 -0
  3. {graph → agent}/__init__.py +0 -0
  4. {graph → agent}/canvas.py +3 -3
  5. {graph → agent}/component/__init__.py +0 -0
  6. {graph → agent}/component/answer.py +1 -1
  7. {graph → agent}/component/arxiv.py +2 -4
  8. {graph → agent}/component/baidu.py +2 -2
  9. {graph → agent}/component/base.py +2 -2
  10. {graph → agent}/component/begin.py +2 -3
  11. {graph → agent}/component/bing.py +2 -2
  12. {graph → agent}/component/categorize.py +2 -5
  13. {graph → agent}/component/cite.py +1 -1
  14. {graph → agent}/component/duckduckgo.py +2 -4
  15. {graph → agent}/component/generate.py +1 -3
  16. {graph → agent}/component/google.py +2 -2
  17. {graph → agent}/component/googlescholar.py +2 -2
  18. {graph → agent}/component/keyword.py +2 -2
  19. {graph → agent}/component/message.py +1 -4
  20. {graph → agent}/component/pubmed.py +2 -4
  21. {graph → agent}/component/relevant.py +1 -1
  22. {graph → agent}/component/retrieval.py +1 -1
  23. {graph → agent}/component/rewrite.py +1 -1
  24. {graph → agent}/component/switch.py +1 -6
  25. {graph → agent}/component/wikipedia.py +2 -2
  26. {graph → agent}/settings.py +0 -0
  27. {graph → agent}/templates/HR_callout_zh.json +0 -0
  28. {graph → agent}/templates/customer_service.json +0 -0
  29. {graph → agent}/templates/general_chat_bot.json +0 -0
  30. {graph → agent}/templates/interpreter.json +0 -0
  31. {graph → agent}/templates/websearch_assistant.json +0 -0
  32. {graph → agent}/test/client.py +2 -3
  33. {graph → agent}/test/dsl_examples/categorize.json +0 -0
  34. {graph → agent}/test/dsl_examples/customer_service.json +0 -0
  35. {graph → agent}/test/dsl_examples/headhunter_zh.json +0 -0
  36. {graph → agent}/test/dsl_examples/intergreper.json +0 -0
  37. {graph → agent}/test/dsl_examples/interpreter.json +0 -0
  38. {graph → agent}/test/dsl_examples/keyword_wikipedia_and_generate.json +0 -0
  39. {graph → agent}/test/dsl_examples/retrieval_and_generate.json +0 -0
  40. {graph → agent}/test/dsl_examples/retrieval_categorize_and_generate.json +0 -0
  41. {graph → agent}/test/dsl_examples/retrieval_relevant_and_generate.json +0 -0
  42. {graph → agent}/test/dsl_examples/retrieval_relevant_keyword_baidu_and_generate.json +0 -0
  43. {graph → agent}/test/dsl_examples/retrieval_relevant_rewrite_and_generate.json +0 -0
  44. api/apps/api_app.py +1 -3
  45. api/apps/canvas_app.py +1 -4
  46. api/apps/chunk_app.py +36 -10
  47. api/apps/dataset_api.py +2 -1
  48. api/db/__init__.py +1 -0
  49. api/db/init_data.py +3 -3
  50. api/db/services/dialog_service.py +8 -5
{graph → agent}/README.md RENAMED
File without changes
{graph → agent}/README_zh.md RENAMED
File without changes
{graph → agent}/__init__.py RENAMED
File without changes
{graph → agent}/canvas.py RENAMED
@@ -22,9 +22,9 @@ from functools import partial
22
 
23
  import pandas as pd
24
 
25
- from graph.component import component_class
26
- from graph.component.base import ComponentBase
27
- from graph.settings import flow_logger, DEBUG
28
 
29
 
30
  class Canvas(ABC):
 
22
 
23
  import pandas as pd
24
 
25
+ from agent.component import component_class
26
+ from agent.component.base import ComponentBase
27
+ from agent.settings import flow_logger, DEBUG
28
 
29
 
30
  class Canvas(ABC):
{graph → agent}/component/__init__.py RENAMED
File without changes
{graph → agent}/component/answer.py RENAMED
@@ -19,7 +19,7 @@ from functools import partial
19
 
20
  import pandas as pd
21
 
22
- from graph.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class AnswerParam(ComponentParamBase):
 
19
 
20
  import pandas as pd
21
 
22
+ from agent.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class AnswerParam(ComponentParamBase):
{graph → agent}/component/arxiv.py RENAMED
@@ -13,13 +13,11 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
16
- import random
17
  from abc import ABC
18
- from functools import partial
19
  import arxiv
20
  import pandas as pd
21
- from graph.settings import DEBUG
22
- from graph.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class ArXivParam(ComponentParamBase):
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
 
16
  from abc import ABC
 
17
  import arxiv
18
  import pandas as pd
19
+ from agent.settings import DEBUG
20
+ from agent.component.base import ComponentBase, ComponentParamBase
21
 
22
 
23
  class ArXivParam(ComponentParamBase):
{graph → agent}/component/baidu.py RENAMED
@@ -19,8 +19,8 @@ from functools import partial
19
  import pandas as pd
20
  import requests
21
  import re
22
- from graph.settings import DEBUG
23
- from graph.component.base import ComponentBase, ComponentParamBase
24
 
25
 
26
  class BaiduParam(ComponentParamBase):
 
19
  import pandas as pd
20
  import requests
21
  import re
22
+ from agent.settings import DEBUG
23
+ from agent.component.base import ComponentBase, ComponentParamBase
24
 
25
 
26
  class BaiduParam(ComponentParamBase):
{graph → agent}/component/base.py RENAMED
@@ -23,8 +23,8 @@ from typing import List, Dict, Tuple, Union
23
 
24
  import pandas as pd
25
 
26
- from graph import settings
27
- from graph.settings import flow_logger, DEBUG
28
 
29
  _FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
30
  _DEPRECATED_PARAMS = "_deprecated_params"
 
23
 
24
  import pandas as pd
25
 
26
+ from agent import settings
27
+ from agent.settings import flow_logger, DEBUG
28
 
29
  _FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
30
  _DEPRECATED_PARAMS = "_deprecated_params"
{graph → agent}/component/begin.py RENAMED
@@ -13,11 +13,10 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
16
- import json
17
  from functools import partial
18
-
19
  import pandas as pd
20
- from graph.component.base import ComponentBase, ComponentParamBase
 
21
 
22
  class BeginParam(ComponentParamBase):
23
 
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
 
16
  from functools import partial
 
17
  import pandas as pd
18
+ from agent.component.base import ComponentBase, ComponentParamBase
19
+
20
 
21
  class BeginParam(ComponentParamBase):
22
 
{graph → agent}/component/bing.py RENAMED
@@ -16,8 +16,8 @@
16
  from abc import ABC
17
  import requests
18
  import pandas as pd
19
- from graph.settings import DEBUG
20
- from graph.component.base import ComponentBase, ComponentParamBase
21
 
22
 
23
  class BingParam(ComponentParamBase):
 
16
  from abc import ABC
17
  import requests
18
  import pandas as pd
19
+ from agent.settings import DEBUG
20
+ from agent.component.base import ComponentBase, ComponentParamBase
21
 
22
 
23
  class BingParam(ComponentParamBase):
{graph → agent}/component/categorize.py RENAMED
@@ -14,13 +14,10 @@
14
  # limitations under the License.
15
  #
16
  from abc import ABC
17
-
18
- import pandas as pd
19
-
20
  from api.db import LLMType
21
  from api.db.services.llm_service import LLMBundle
22
- from graph.component import GenerateParam, Generate
23
- from graph.settings import DEBUG
24
 
25
 
26
  class CategorizeParam(GenerateParam):
 
14
  # limitations under the License.
15
  #
16
  from abc import ABC
 
 
 
17
  from api.db import LLMType
18
  from api.db.services.llm_service import LLMBundle
19
+ from agent.component import GenerateParam, Generate
20
+ from agent.settings import DEBUG
21
 
22
 
23
  class CategorizeParam(GenerateParam):
{graph → agent}/component/cite.py RENAMED
@@ -21,7 +21,7 @@ from api.db import LLMType
21
  from api.db.services.knowledgebase_service import KnowledgebaseService
22
  from api.db.services.llm_service import LLMBundle
23
  from api.settings import retrievaler
24
- from graph.component.base import ComponentBase, ComponentParamBase
25
 
26
 
27
  class CiteParam(ComponentParamBase):
 
21
  from api.db.services.knowledgebase_service import KnowledgebaseService
22
  from api.db.services.llm_service import LLMBundle
23
  from api.settings import retrievaler
24
+ from agent.component.base import ComponentBase, ComponentParamBase
25
 
26
 
27
  class CiteParam(ComponentParamBase):
{graph → agent}/component/duckduckgo.py RENAMED
@@ -13,13 +13,11 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
16
- import random
17
  from abc import ABC
18
- from functools import partial
19
  from duckduckgo_search import DDGS
20
  import pandas as pd
21
- from graph.settings import DEBUG
22
- from graph.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class DuckDuckGoParam(ComponentParamBase):
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
 
16
  from abc import ABC
 
17
  from duckduckgo_search import DDGS
18
  import pandas as pd
19
+ from agent.settings import DEBUG
20
+ from agent.component.base import ComponentBase, ComponentParamBase
21
 
22
 
23
  class DuckDuckGoParam(ComponentParamBase):
{graph → agent}/component/generate.py RENAMED
@@ -15,13 +15,11 @@
15
  #
16
  import re
17
  from functools import partial
18
-
19
  import pandas as pd
20
-
21
  from api.db import LLMType
22
  from api.db.services.llm_service import LLMBundle
23
  from api.settings import retrievaler
24
- from graph.component.base import ComponentBase, ComponentParamBase
25
 
26
 
27
  class GenerateParam(ComponentParamBase):
 
15
  #
16
  import re
17
  from functools import partial
 
18
  import pandas as pd
 
19
  from api.db import LLMType
20
  from api.db.services.llm_service import LLMBundle
21
  from api.settings import retrievaler
22
+ from agent.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class GenerateParam(ComponentParamBase):
{graph → agent}/component/google.py RENAMED
@@ -16,8 +16,8 @@
16
  from abc import ABC
17
  from serpapi import GoogleSearch
18
  import pandas as pd
19
- from graph.settings import DEBUG
20
- from graph.component.base import ComponentBase, ComponentParamBase
21
 
22
 
23
  class GoogleParam(ComponentParamBase):
 
16
  from abc import ABC
17
  from serpapi import GoogleSearch
18
  import pandas as pd
19
+ from agent.settings import DEBUG
20
+ from agent.component.base import ComponentBase, ComponentParamBase
21
 
22
 
23
  class GoogleParam(ComponentParamBase):
{graph → agent}/component/googlescholar.py RENAMED
@@ -15,8 +15,8 @@
15
  #
16
  from abc import ABC
17
  import pandas as pd
18
- from graph.settings import DEBUG
19
- from graph.component.base import ComponentBase, ComponentParamBase
20
  from scholarly import scholarly
21
 
22
 
 
15
  #
16
  from abc import ABC
17
  import pandas as pd
18
+ from agent.settings import DEBUG
19
+ from agent.component.base import ComponentBase, ComponentParamBase
20
  from scholarly import scholarly
21
 
22
 
{graph → agent}/component/keyword.py RENAMED
@@ -17,8 +17,8 @@ import re
17
  from abc import ABC
18
  from api.db import LLMType
19
  from api.db.services.llm_service import LLMBundle
20
- from graph.component import GenerateParam, Generate
21
- from graph.settings import DEBUG
22
 
23
 
24
  class KeywordExtractParam(GenerateParam):
 
17
  from abc import ABC
18
  from api.db import LLMType
19
  from api.db.services.llm_service import LLMBundle
20
+ from agent.component import GenerateParam, Generate
21
+ from agent.settings import DEBUG
22
 
23
 
24
  class KeywordExtractParam(GenerateParam):
{graph → agent}/component/message.py RENAMED
@@ -16,10 +16,7 @@
16
  import random
17
  from abc import ABC
18
  from functools import partial
19
-
20
- import pandas as pd
21
-
22
- from graph.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class MessageParam(ComponentParamBase):
 
16
  import random
17
  from abc import ABC
18
  from functools import partial
19
+ from agent.component.base import ComponentBase, ComponentParamBase
 
 
 
20
 
21
 
22
  class MessageParam(ComponentParamBase):
{graph → agent}/component/pubmed.py RENAMED
@@ -13,14 +13,12 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
16
- import random
17
  from abc import ABC
18
- from functools import partial
19
  from Bio import Entrez
20
  import pandas as pd
21
  import xml.etree.ElementTree as ET
22
- from graph.settings import DEBUG
23
- from graph.component.base import ComponentBase, ComponentParamBase
24
 
25
 
26
  class PubMedParam(ComponentParamBase):
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
 
16
  from abc import ABC
 
17
  from Bio import Entrez
18
  import pandas as pd
19
  import xml.etree.ElementTree as ET
20
+ from agent.settings import DEBUG
21
+ from agent.component.base import ComponentBase, ComponentParamBase
22
 
23
 
24
  class PubMedParam(ComponentParamBase):
{graph → agent}/component/relevant.py RENAMED
@@ -16,7 +16,7 @@
16
  from abc import ABC
17
  from api.db import LLMType
18
  from api.db.services.llm_service import LLMBundle
19
- from graph.component import GenerateParam, Generate
20
  from rag.utils import num_tokens_from_string, encoder
21
 
22
 
 
16
  from abc import ABC
17
  from api.db import LLMType
18
  from api.db.services.llm_service import LLMBundle
19
+ from agent.component import GenerateParam, Generate
20
  from rag.utils import num_tokens_from_string, encoder
21
 
22
 
{graph → agent}/component/retrieval.py RENAMED
@@ -21,7 +21,7 @@ from api.db import LLMType
21
  from api.db.services.knowledgebase_service import KnowledgebaseService
22
  from api.db.services.llm_service import LLMBundle
23
  from api.settings import retrievaler
24
- from graph.component.base import ComponentBase, ComponentParamBase
25
 
26
 
27
  class RetrievalParam(ComponentParamBase):
 
21
  from api.db.services.knowledgebase_service import KnowledgebaseService
22
  from api.db.services.llm_service import LLMBundle
23
  from api.settings import retrievaler
24
+ from agent.component.base import ComponentBase, ComponentParamBase
25
 
26
 
27
  class RetrievalParam(ComponentParamBase):
{graph → agent}/component/rewrite.py RENAMED
@@ -16,7 +16,7 @@
16
  from abc import ABC
17
  from api.db import LLMType
18
  from api.db.services.llm_service import LLMBundle
19
- from graph.component import GenerateParam, Generate
20
 
21
 
22
  class RewriteQuestionParam(GenerateParam):
 
16
  from abc import ABC
17
  from api.db import LLMType
18
  from api.db.services.llm_service import LLMBundle
19
+ from agent.component import GenerateParam, Generate
20
 
21
 
22
  class RewriteQuestionParam(GenerateParam):
{graph → agent}/component/switch.py RENAMED
@@ -16,12 +16,7 @@
16
  from abc import ABC
17
 
18
  import pandas as pd
19
-
20
- from api.db import LLMType
21
- from api.db.services.knowledgebase_service import KnowledgebaseService
22
- from api.db.services.llm_service import LLMBundle
23
- from api.settings import retrievaler
24
- from graph.component.base import ComponentBase, ComponentParamBase
25
 
26
 
27
  class SwitchParam(ComponentParamBase):
 
16
  from abc import ABC
17
 
18
  import pandas as pd
19
+ from agent.component.base import ComponentBase, ComponentParamBase
 
 
 
 
 
20
 
21
 
22
  class SwitchParam(ComponentParamBase):
{graph → agent}/component/wikipedia.py RENAMED
@@ -18,8 +18,8 @@ from abc import ABC
18
  from functools import partial
19
  import wikipedia
20
  import pandas as pd
21
- from graph.settings import DEBUG
22
- from graph.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class WikipediaParam(ComponentParamBase):
 
18
  from functools import partial
19
  import wikipedia
20
  import pandas as pd
21
+ from agent.settings import DEBUG
22
+ from agent.component.base import ComponentBase, ComponentParamBase
23
 
24
 
25
  class WikipediaParam(ComponentParamBase):
{graph → agent}/settings.py RENAMED
File without changes
{graph → agent}/templates/HR_callout_zh.json RENAMED
File without changes
{graph → agent}/templates/customer_service.json RENAMED
File without changes
{graph → agent}/templates/general_chat_bot.json RENAMED
File without changes
{graph → agent}/templates/interpreter.json RENAMED
File without changes
{graph → agent}/templates/websearch_assistant.json RENAMED
File without changes
{graph → agent}/test/client.py RENAMED
@@ -16,9 +16,8 @@
16
  import argparse
17
  import os
18
  from functools import partial
19
- import readline
20
- from graph.canvas import Canvas
21
- from graph.settings import DEBUG
22
 
23
  if __name__ == '__main__':
24
  parser = argparse.ArgumentParser()
 
16
  import argparse
17
  import os
18
  from functools import partial
19
+ from agent.canvas import Canvas
20
+ from agent.settings import DEBUG
 
21
 
22
  if __name__ == '__main__':
23
  parser = argparse.ArgumentParser()
{graph → agent}/test/dsl_examples/categorize.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/customer_service.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/headhunter_zh.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/intergreper.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/interpreter.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/keyword_wikipedia_and_generate.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/retrieval_and_generate.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/retrieval_categorize_and_generate.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/retrieval_relevant_and_generate.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/retrieval_relevant_keyword_baidu_and_generate.json RENAMED
File without changes
{graph → agent}/test/dsl_examples/retrieval_relevant_rewrite_and_generate.json RENAMED
File without changes
api/apps/api_app.py CHANGED
@@ -20,7 +20,7 @@ from datetime import datetime, timedelta
20
  from flask import request, Response
21
  from flask_login import login_required, current_user
22
 
23
- from api.db import FileType, ParserType, FileSource, LLMType
24
  from api.db.db_models import APIToken, API4Conversation, Task, File
25
  from api.db.services import duplicate_name
26
  from api.db.services.api_service import APITokenService, API4ConversationService
@@ -29,7 +29,6 @@ from api.db.services.document_service import DocumentService
29
  from api.db.services.file2document_service import File2DocumentService
30
  from api.db.services.file_service import FileService
31
  from api.db.services.knowledgebase_service import KnowledgebaseService
32
- from api.db.services.llm_service import TenantLLMService
33
  from api.db.services.task_service import queue_tasks, TaskService
34
  from api.db.services.user_service import UserTenantService
35
  from api.settings import RetCode, retrievaler
@@ -38,7 +37,6 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
38
  from itsdangerous import URLSafeTimedSerializer
39
 
40
  from api.utils.file_utils import filename_type, thumbnail
41
- from rag.nlp import keyword_extraction
42
  from rag.utils.minio_conn import MINIO
43
 
44
 
 
20
  from flask import request, Response
21
  from flask_login import login_required, current_user
22
 
23
+ from api.db import FileType, ParserType, FileSource
24
  from api.db.db_models import APIToken, API4Conversation, Task, File
25
  from api.db.services import duplicate_name
26
  from api.db.services.api_service import APITokenService, API4ConversationService
 
29
  from api.db.services.file2document_service import File2DocumentService
30
  from api.db.services.file_service import FileService
31
  from api.db.services.knowledgebase_service import KnowledgebaseService
 
32
  from api.db.services.task_service import queue_tasks, TaskService
33
  from api.db.services.user_service import UserTenantService
34
  from api.settings import RetCode, retrievaler
 
37
  from itsdangerous import URLSafeTimedSerializer
38
 
39
  from api.utils.file_utils import filename_type, thumbnail
 
40
  from rag.utils.minio_conn import MINIO
41
 
42
 
api/apps/canvas_app.py CHANGED
@@ -15,15 +15,12 @@
15
  #
16
  import json
17
  from functools import partial
18
-
19
  from flask import request, Response
20
  from flask_login import login_required, current_user
21
-
22
- from api.db.db_models import UserCanvas
23
  from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
24
  from api.utils import get_uuid
25
  from api.utils.api_utils import get_json_result, server_error_response, validate_request
26
- from graph.canvas import Canvas
27
 
28
 
29
  @manager.route('/templates', methods=['GET'])
 
15
  #
16
  import json
17
  from functools import partial
 
18
  from flask import request, Response
19
  from flask_login import login_required, current_user
 
 
20
  from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
21
  from api.utils import get_uuid
22
  from api.utils.api_utils import get_json_result, server_error_response, validate_request
23
+ from agent.canvas import Canvas
24
 
25
 
26
  @manager.route('/templates', methods=['GET'])
api/apps/chunk_app.py CHANGED
@@ -14,6 +14,8 @@
14
  # limitations under the License.
15
  #
16
  import datetime
 
 
17
 
18
  from flask import request
19
  from flask_login import login_required, current_user
@@ -29,7 +31,7 @@ from api.db.services.llm_service import TenantLLMService
29
  from api.db.services.user_service import UserTenantService
30
  from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
31
  from api.db.services.document_service import DocumentService
32
- from api.settings import RetCode, retrievaler
33
  from api.utils.api_utils import get_json_result
34
  import hashlib
35
  import re
@@ -61,7 +63,8 @@ def list_chunk():
61
  for id in sres.ids:
62
  d = {
63
  "chunk_id": id,
64
- "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[id].get(
 
65
  "content_with_weight", ""),
66
  "doc_id": sres.field[id]["doc_id"],
67
  "docnm_kwd": sres.field[id]["docnm_kwd"],
@@ -136,11 +139,11 @@ def set():
136
  tenant_id = DocumentService.get_tenant_id(req["doc_id"])
137
  if not tenant_id:
138
  return get_data_error_result(retmsg="Tenant not found!")
139
-
140
  embd_id = DocumentService.get_embd_id(req["doc_id"])
141
  embd_mdl = TenantLLMService.model_instance(
142
  tenant_id, LLMType.EMBEDDING.value, embd_id)
143
-
144
  e, doc = DocumentService.get_by_id(req["doc_id"])
145
  if not e:
146
  return get_data_error_result(retmsg="Document not found!")
@@ -185,7 +188,7 @@ def switch():
185
 
186
  @manager.route('/rm', methods=['POST'])
187
  @login_required
188
- @validate_request("chunk_ids","doc_id")
189
  def rm():
190
  req = request.json
191
  try:
@@ -230,11 +233,11 @@ def create():
230
  tenant_id = DocumentService.get_tenant_id(req["doc_id"])
231
  if not tenant_id:
232
  return get_data_error_result(retmsg="Tenant not found!")
233
-
234
  embd_id = DocumentService.get_embd_id(req["doc_id"])
235
  embd_mdl = TenantLLMService.model_instance(
236
  tenant_id, LLMType.EMBEDDING.value, embd_id)
237
-
238
  v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
239
  v = 0.1 * v[0] + 0.9 * v[1]
240
  d["q_%d_vec" % len(v)] = v.tolist()
@@ -277,9 +280,10 @@ def retrieval_test():
277
  chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
278
  question += keyword_extraction(chat_mdl, question)
279
 
280
- ranks = retrievaler.retrieval(question, embd_mdl, kb.tenant_id, [kb_id], page, size,
281
- similarity_threshold, vector_similarity_weight, top,
282
- doc_ids, rerank_mdl=rerank_mdl)
 
283
  for c in ranks["chunks"]:
284
  if "vector" in c:
285
  del c["vector"]
@@ -290,3 +294,25 @@ def retrieval_test():
290
  return get_json_result(data=False, retmsg=f'No chunk found! Check the chunk status please!',
291
  retcode=RetCode.DATA_ERROR)
292
  return server_error_response(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # limitations under the License.
15
  #
16
  import datetime
17
+ import json
18
+ import traceback
19
 
20
  from flask import request
21
  from flask_login import login_required, current_user
 
31
  from api.db.services.user_service import UserTenantService
32
  from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
33
  from api.db.services.document_service import DocumentService
34
+ from api.settings import RetCode, retrievaler, kg_retrievaler
35
  from api.utils.api_utils import get_json_result
36
  import hashlib
37
  import re
 
63
  for id in sres.ids:
64
  d = {
65
  "chunk_id": id,
66
+ "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
67
+ id].get(
68
  "content_with_weight", ""),
69
  "doc_id": sres.field[id]["doc_id"],
70
  "docnm_kwd": sres.field[id]["docnm_kwd"],
 
139
  tenant_id = DocumentService.get_tenant_id(req["doc_id"])
140
  if not tenant_id:
141
  return get_data_error_result(retmsg="Tenant not found!")
142
+
143
  embd_id = DocumentService.get_embd_id(req["doc_id"])
144
  embd_mdl = TenantLLMService.model_instance(
145
  tenant_id, LLMType.EMBEDDING.value, embd_id)
146
+
147
  e, doc = DocumentService.get_by_id(req["doc_id"])
148
  if not e:
149
  return get_data_error_result(retmsg="Document not found!")
 
188
 
189
  @manager.route('/rm', methods=['POST'])
190
  @login_required
191
+ @validate_request("chunk_ids", "doc_id")
192
  def rm():
193
  req = request.json
194
  try:
 
233
  tenant_id = DocumentService.get_tenant_id(req["doc_id"])
234
  if not tenant_id:
235
  return get_data_error_result(retmsg="Tenant not found!")
236
+
237
  embd_id = DocumentService.get_embd_id(req["doc_id"])
238
  embd_mdl = TenantLLMService.model_instance(
239
  tenant_id, LLMType.EMBEDDING.value, embd_id)
240
+
241
  v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
242
  v = 0.1 * v[0] + 0.9 * v[1]
243
  d["q_%d_vec" % len(v)] = v.tolist()
 
280
  chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
281
  question += keyword_extraction(chat_mdl, question)
282
 
283
+ retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
284
+ ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, [kb_id], page, size,
285
+ similarity_threshold, vector_similarity_weight, top,
286
+ doc_ids, rerank_mdl=rerank_mdl)
287
  for c in ranks["chunks"]:
288
  if "vector" in c:
289
  del c["vector"]
 
294
  return get_json_result(data=False, retmsg=f'No chunk found! Check the chunk status please!',
295
  retcode=RetCode.DATA_ERROR)
296
  return server_error_response(e)
297
+
298
+
299
+ @manager.route('/knowledge_graph', methods=['GET'])
300
+ @login_required
301
+ def knowledge_graph():
302
+ doc_id = request.args["doc_id"]
303
+ req = {
304
+ "doc_ids":[doc_id],
305
+ "knowledge_graph_kwd": ["graph", "mind_map"]
306
+ }
307
+ tenant_id = DocumentService.get_tenant_id(doc_id)
308
+ sres = retrievaler.search(req, search.index_name(tenant_id))
309
+ obj = {"graph": {}, "mind_map": {}}
310
+ for id in sres.ids[:2]:
311
+ ty = sres.field[id]["knowledge_graph_kwd"]
312
+ try:
313
+ obj[ty] = json.loads(sres.field[id]["content_with_weight"])
314
+ except Exception as e:
315
+ print(traceback.format_exc(), flush=True)
316
+
317
+ return get_json_result(data=obj)
318
+
api/apps/dataset_api.py CHANGED
@@ -623,7 +623,7 @@ def doc_parse_callback(doc_id, prog=None, msg=""):
623
  if cancel:
624
  raise Exception("The parsing process has been cancelled!")
625
 
626
-
627
  def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
628
  match parser_name:
629
  case "book":
@@ -656,6 +656,7 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
656
  return False
657
 
658
  return True
 
659
 
660
 
661
  @manager.route("/<dataset_id>/documents/<document_id>/status", methods=["POST"])
 
623
  if cancel:
624
  raise Exception("The parsing process has been cancelled!")
625
 
626
+ """
627
  def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
628
  match parser_name:
629
  case "book":
 
656
  return False
657
 
658
  return True
659
+ """
660
 
661
 
662
  @manager.route("/<dataset_id>/documents/<document_id>/status", methods=["POST"])
api/db/__init__.py CHANGED
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
85
  PICTURE = "picture"
86
  ONE = "one"
87
  AUDIO = "audio"
 
88
 
89
 
90
  class FileSource(StrEnum):
 
85
  PICTURE = "picture"
86
  ONE = "one"
87
  AUDIO = "audio"
88
+ KG = "knowledge_graph"
89
 
90
 
91
  class FileSource(StrEnum):
api/db/init_data.py CHANGED
@@ -122,7 +122,7 @@ def init_llm_factory():
122
  LLMService.filter_delete([LLMService.model.fid == "QAnything"])
123
  TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
124
  TenantService.filter_update([1 == 1], {
125
- "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
126
  ## insert openai two embedding models to the current openai user.
127
  print("Start to insert 2 OpenAI embedding models...")
128
  tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@@ -145,7 +145,7 @@ def init_llm_factory():
145
  """
146
  drop table llm;
147
  drop table llm_factories;
148
- update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
149
  alter table knowledgebase modify avatar longtext;
150
  alter table user modify avatar longtext;
151
  alter table dialog modify icon longtext;
@@ -153,7 +153,7 @@ def init_llm_factory():
153
 
154
 
155
  def add_graph_templates():
156
- dir = os.path.join(get_project_base_directory(), "graph", "templates")
157
  for fnm in os.listdir(dir):
158
  try:
159
  cnvs = json.load(open(os.path.join(dir, fnm), "r"))
 
122
  LLMService.filter_delete([LLMService.model.fid == "QAnything"])
123
  TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
124
  TenantService.filter_update([1 == 1], {
125
+ "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
126
  ## insert openai two embedding models to the current openai user.
127
  print("Start to insert 2 OpenAI embedding models...")
128
  tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
 
145
  """
146
  drop table llm;
147
  drop table llm_factories;
148
+ update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph';
149
  alter table knowledgebase modify avatar longtext;
150
  alter table user modify avatar longtext;
151
  alter table dialog modify icon longtext;
 
153
 
154
 
155
  def add_graph_templates():
156
+ dir = os.path.join(get_project_base_directory(), "agent", "templates")
157
  for fnm in os.listdir(dir):
158
  try:
159
  cnvs = json.load(open(os.path.join(dir, fnm), "r"))
api/db/services/dialog_service.py CHANGED
@@ -18,12 +18,12 @@ import json
18
  import re
19
  from copy import deepcopy
20
 
21
- from api.db import LLMType
22
  from api.db.db_models import Dialog, Conversation
23
  from api.db.services.common_service import CommonService
24
  from api.db.services.knowledgebase_service import KnowledgebaseService
25
  from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
26
- from api.settings import chat_logger, retrievaler
27
  from rag.app.resume import forbidden_select_fields4resume
28
  from rag.nlp import keyword_extraction
29
  from rag.nlp.search import index_name
@@ -101,6 +101,9 @@ def chat(dialog, messages, stream=True, **kwargs):
101
  yield {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}
102
  return {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}
103
 
 
 
 
104
  questions = [m["content"] for m in messages if m["role"] == "user"]
105
  embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0])
106
  if llm_id2llm_type(dialog.llm_id) == "image2text":
@@ -138,7 +141,7 @@ def chat(dialog, messages, stream=True, **kwargs):
138
  else:
139
  if prompt_config.get("keyword", False):
140
  questions[-1] += keyword_extraction(chat_mdl, questions[-1])
141
- kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
142
  dialog.similarity_threshold,
143
  dialog.vector_similarity_weight,
144
  doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
@@ -147,7 +150,7 @@ def chat(dialog, messages, stream=True, **kwargs):
147
  #self-rag
148
  if dialog.prompt_config.get("self_rag") and not relevant(dialog.tenant_id, dialog.llm_id, questions[-1], knowledges):
149
  questions[-1] = rewrite(dialog.tenant_id, dialog.llm_id, questions[-1])
150
- kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
151
  dialog.similarity_threshold,
152
  dialog.vector_similarity_weight,
153
  doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
@@ -179,7 +182,7 @@ def chat(dialog, messages, stream=True, **kwargs):
179
  nonlocal prompt_config, knowledges, kwargs, kbinfos
180
  refs = []
181
  if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
182
- answer, idx = retrievaler.insert_citations(answer,
183
  [ck["content_ltks"]
184
  for ck in kbinfos["chunks"]],
185
  [ck["vector"]
 
18
  import re
19
  from copy import deepcopy
20
 
21
+ from api.db import LLMType, ParserType
22
  from api.db.db_models import Dialog, Conversation
23
  from api.db.services.common_service import CommonService
24
  from api.db.services.knowledgebase_service import KnowledgebaseService
25
  from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
26
+ from api.settings import chat_logger, retrievaler, kg_retrievaler
27
  from rag.app.resume import forbidden_select_fields4resume
28
  from rag.nlp import keyword_extraction
29
  from rag.nlp.search import index_name
 
101
  yield {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}
102
  return {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}
103
 
104
+ is_kg = all([kb.parser_id == ParserType.KG for kb in kbs])
105
+ retr = retrievaler if not is_kg else kg_retrievaler
106
+
107
  questions = [m["content"] for m in messages if m["role"] == "user"]
108
  embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0])
109
  if llm_id2llm_type(dialog.llm_id) == "image2text":
 
141
  else:
142
  if prompt_config.get("keyword", False):
143
  questions[-1] += keyword_extraction(chat_mdl, questions[-1])
144
+ kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
145
  dialog.similarity_threshold,
146
  dialog.vector_similarity_weight,
147
  doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
 
150
  #self-rag
151
  if dialog.prompt_config.get("self_rag") and not relevant(dialog.tenant_id, dialog.llm_id, questions[-1], knowledges):
152
  questions[-1] = rewrite(dialog.tenant_id, dialog.llm_id, questions[-1])
153
+ kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
154
  dialog.similarity_threshold,
155
  dialog.vector_similarity_weight,
156
  doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
 
182
  nonlocal prompt_config, knowledges, kwargs, kbinfos
183
  refs = []
184
  if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
185
+ answer, idx = retr.insert_citations(answer,
186
  [ck["content_ltks"]
187
  for ck in kbinfos["chunks"]],
188
  [ck["vector"]