GYH committed on
Commit
d9499f9
·
1 Parent(s): 8fb1944

Add Graph Baidusearch and dsl_example (#1378)

Browse files

### What problem does this PR solve?

#918

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

graph/component/__init__.py CHANGED
@@ -8,6 +8,8 @@ from .switch import Switch, SwitchParam
8
  from .relevant import Relevant, RelevantParam
9
  from .message import Message, MessageParam
10
  from .rewrite import RewriteQuestion, RewriteQuestionParam
 
 
11
 
12
 
13
  def component_class(class_name):
 
8
  from .relevant import Relevant, RelevantParam
9
  from .message import Message, MessageParam
10
  from .rewrite import RewriteQuestion, RewriteQuestionParam
11
+ from .keyword import KeywordExtract, KeywordExtractParam
12
+ from .baidu import Baidu, BaiduParam
13
 
14
 
15
  def component_class(class_name):
graph/component/baidu.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import random
17
+ from abc import ABC
18
+ from functools import partial
19
+ import pandas as pd
20
+ import requests
21
+ import re
22
+
23
+ from graph.component.base import ComponentBase, ComponentParamBase
24
+
25
+
26
class BaiduParam(ComponentParamBase):
    """
    Parameters accepted by the Baidu search component.

    Attributes:
        top_n: how many search results to ask for; defaults to 10.
    """

    def __init__(self):
        super().__init__()
        # Default number of results to request.
        self.top_n = 10

    def check(self):
        """Validate the parameters by passing `top_n` to check_positive_integer."""
        self.check_positive_integer(self.top_n, "Top N")
37
+
38
+
39
class Baidu(ComponentBase, ABC):
    """
    Component that runs a Baidu web search on the text produced upstream
    and returns the extracted result snippets.
    """
    component_name = "Baidu"

    def _run(self, history, **kwargs):
        """
        Search Baidu for the upstream content and collect result snippets.

        Args:
            history: conversation history; not used by this component.
            **kwargs: accepted for interface compatibility; not used.

        Returns:
            The result of ``Baidu.be_output(...)`` when there is no query
            text; otherwise a pandas DataFrame with a single ``content``
            column, one row per snippet found in the response page. Each
            snippet that has a matching URL gets ``<a>URL</a>`` appended.

        Raises:
            requests.RequestException: if the HTTP request fails or does not
                complete within the timeout.
        """
        # NOTE(review): get_input() presumably returns a DataFrame (or mapping)
        # with a "content" column of strings - confirm against ComponentBase.
        upstream = self.get_input()
        query = " - ".join(upstream["content"]) if "content" in upstream else ""
        if not query:
            # BaiduParam does not define `no`; fall back to an empty output
            # instead of raising AttributeError when there is nothing to search.
            return Baidu.be_output(getattr(self._param, "no", ""))

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
        # Let requests build the query string so spaces, '&', '#', and non-ASCII
        # characters in the query are URL-encoded rather than corrupting the URL.
        response = requests.get(
            url='https://www.baidu.com/s',
            params={'wd': query, 'rn': str(self._param.top_n)},
            headers=headers,
            timeout=10,  # do not block the whole graph run on a stalled request
        )

        snippets = re.findall(r'"contentText":"(.*?)"', response.text)
        links = re.findall(r"'url': \\\"(.*?)\\\"}", response.text)

        # Pair snippets with links positionally; snippets without a matching
        # link are kept unchanged, surplus links are ignored.
        linked = [text + '<a>' + link + '</a>' for text, link in zip(snippets, links)]
        rows = linked + snippets[len(linked):]

        return pd.DataFrame(rows, columns=['content'])
graph/component/keyword.py CHANGED
@@ -22,18 +22,17 @@ from graph.settings import DEBUG
22
 
23
 
24
  class KeywordExtractParam(GenerateParam):
25
-
26
  """
27
  Define the KeywordExtract component parameters.
28
  """
 
29
  def __init__(self):
30
  super().__init__()
31
- self.temperature = 0.5
32
- self.prompt = ""
33
- self.topn = 1
34
 
35
  def check(self):
36
  super().check()
 
37
 
38
  def get_prompt(self):
39
  self.prompt = """
@@ -43,12 +42,12 @@ class KeywordExtractParam(GenerateParam):
43
  - Use comma as a delimiter to separate keywords/phrases.
44
  - Answer format: (in language of user's question)
45
  - keyword:
46
- """%self.topn
47
  return self.prompt
48
 
49
 
50
  class KeywordExtract(Generate, ABC):
51
- component_name = "RewriteQuestion"
52
 
53
  def _run(self, history, **kwargs):
54
  q = ""
@@ -64,5 +63,3 @@ class KeywordExtract(Generate, ABC):
64
  ans = re.sub(r".*keyword:", "", ans).strip()
65
  if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::")
66
  return KeywordExtract.be_output(ans)
67
-
68
-
 
22
 
23
 
24
  class KeywordExtractParam(GenerateParam):
 
25
  """
26
  Define the KeywordExtract component parameters.
27
  """
28
+
29
  def __init__(self):
30
  super().__init__()
31
+ self.top_n = 1
 
 
32
 
33
  def check(self):
34
  super().check()
35
+ self.check_positive_integer(self.top_n, "Top N")
36
 
37
  def get_prompt(self):
38
  self.prompt = """
 
42
  - Use comma as a delimiter to separate keywords/phrases.
43
  - Answer format: (in language of user's question)
44
  - keyword:
45
+ """ % self.top_n
46
  return self.prompt
47
 
48
 
49
  class KeywordExtract(Generate, ABC):
50
+ component_name = "KeywordExtract"
51
 
52
  def _run(self, history, **kwargs):
53
  q = ""
 
63
  ans = re.sub(r".*keyword:", "", ans).strip()
64
  if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::")
65
  return KeywordExtract.be_output(ans)
 
 
graph/test/dsl_examples/retrieval_relevant_keyword_baidu_and_generate.json ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "components": {
3
+ "begin": {
4
+ "obj":{
5
+ "component_name": "Begin",
6
+ "params": {
7
+ "prologue": "Hi there!"
8
+ }
9
+ },
10
+ "downstream": ["answer:0"],
11
+ "upstream": []
12
+ },
13
+ "answer:0": {
14
+ "obj": {
15
+ "component_name": "Answer",
16
+ "params": {}
17
+ },
18
+ "downstream": ["retrieval:0"],
19
+ "upstream": ["begin"]
20
+ },
21
+ "retrieval:0": {
22
+ "obj": {
23
+ "component_name": "Retrieval",
24
+ "params": {
25
+ "similarity_threshold": 0.2,
26
+ "keywords_similarity_weight": 0.3,
27
+ "top_n": 6,
28
+ "top_k": 1024,
29
+ "rerank_id": "BAAI/bge-reranker-v2-m3",
30
+ "kb_ids": ["21ca4e6a2c8911ef8b1e0242ac120006"],
31
+ "empty_response": "Sorry, knowledge base has no related information."
32
+ }
33
+ },
34
+ "downstream": ["relevant:0"],
35
+ "upstream": ["answer:0"]
36
+ },
37
+ "relevant:0": {
38
+ "obj": {
39
+ "component_name": "Relevant",
40
+ "params": {
41
+ "llm_id": "deepseek-chat",
42
+ "temperature": 0.02,
43
+ "yes": "generate:0",
44
+ "no": "keyword:0"
45
+ }
46
+ },
47
+ "downstream": ["keyword:0", "generate:0"],
48
+ "upstream": ["retrieval:0"]
49
+ },
50
+ "generate:0": {
51
+ "obj": {
52
+ "component_name": "Generate",
53
+ "params": {
54
+ "llm_id": "deepseek-chat",
55
+ "prompt": "You are an intelligent assistant. Please answer the question based on content of knowledge base. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\". Answers need to consider chat history.\n Knowledge base content is as following:\n {input}\n The above is the content of knowledge base.",
56
+ "temperature": 0.2
57
+ }
58
+ },
59
+ "downstream": ["answer:0"],
60
+ "upstream": ["relevant:0"]
61
+ },
62
+ "keyword:0": {
63
+ "obj": {
64
+ "component_name": "KeywordExtract",
65
+ "params": {
66
+ "llm_id": "deepseek-chat",
67
+ "prompt": "- Role: You're a question analyzer.\n - Requirements:\n - Summarize user's question, and give top %s important keyword/phrase.\n - Use comma as a delimiter to separate keywords/phrases.\n - Answer format: (in language of user's question)\n - keyword: ",
68
+ "temperature": 0.2,
69
+ "top_n": 1
70
+ }
71
+ },
72
+ "downstream": ["baidu:0"],
73
+ "upstream": ["relevant:0"]
74
+ },
75
+ "baidu:0": {
76
+ "obj":{
77
+ "component_name": "Baidu",
78
+ "params": {
79
+ "top_n": 10
80
+ }
81
+ },
82
+ "downstream": ["generate:1"],
83
+ "upstream": ["keyword:0"]
84
+ },
85
+ "generate:1": {
86
+ "obj": {
87
+ "component_name": "Generate",
88
+ "params": {
89
+ "llm_id": "deepseek-chat",
90
+ "prompt": "You are an intelligent assistant. Please answer the question based on content searched from Baidu. When the answer from a Baidu search is incomplete, you need to output the URL link of the corresponding content as well. When all the content searched from Baidu is irrelevant to the question, your answer must include the sentence, \"The answer you are looking for is not found in the Baidu search!\". Answers need to consider chat history.\n The content of Baidu search is as follows:\n {input}\n The above is the content of Baidu search.",
91
+ "temperature": 0.2
92
+ }
93
+ },
94
+ "downstream": ["answer:0"],
95
+ "upstream": ["baidu:0"]
96
+ }
97
+ },
98
+ "history": [],
99
+ "path": [],
100
+ "messages": [],
101
+ "reference": {},
102
+ "answer": []
103
+ }