GYH
committed on
Commit
·
d9499f9
1
Parent(s):
8fb1944
Add Graph Baidusearch and dsl_example (#1378)
Browse files
### What problem does this PR solve?
#918
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
graph/component/__init__.py
CHANGED
@@ -8,6 +8,8 @@ from .switch import Switch, SwitchParam
|
|
8 |
from .relevant import Relevant, RelevantParam
|
9 |
from .message import Message, MessageParam
|
10 |
from .rewrite import RewriteQuestion, RewriteQuestionParam
|
|
|
|
|
11 |
|
12 |
|
13 |
def component_class(class_name):
|
|
|
8 |
from .relevant import Relevant, RelevantParam
|
9 |
from .message import Message, MessageParam
|
10 |
from .rewrite import RewriteQuestion, RewriteQuestionParam
|
11 |
+
from .keyword import KeywordExtract, KeywordExtractParam
|
12 |
+
from .baidu import Baidu, BaiduParam
|
13 |
|
14 |
|
15 |
def component_class(class_name):
|
graph/component/baidu.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
#
|
16 |
+
import random
|
17 |
+
from abc import ABC
|
18 |
+
from functools import partial
|
19 |
+
import pandas as pd
|
20 |
+
import requests
|
21 |
+
import re
|
22 |
+
|
23 |
+
from graph.component.base import ComponentBase, ComponentParamBase
|
24 |
+
|
25 |
+
|
26 |
+
class BaiduParam(ComponentParamBase):
    """
    Parameters accepted by the Baidu search component.
    """

    def __init__(self):
        super().__init__()
        # Number of results to request; the Baidu component sends it as `rn`.
        self.top_n = 10

    def check(self):
        """Validate that ``top_n`` is a positive integer."""
        requested = self.top_n
        self.check_positive_integer(requested, "Top N")
|
37 |
+
|
38 |
+
|
39 |
+
class Baidu(ComponentBase, ABC):
    """
    Component that searches Baidu for the upstream text and returns the
    scraped result snippets.
    """
    component_name = "Baidu"

    def _run(self, history, **kwargs):
        """
        Query Baidu with the joined upstream ``content`` values.

        Returns the result of ``Baidu.be_output`` when there is nothing to
        search for. Otherwise returns a DataFrame with a single ``content``
        column, one row per scraped snippet; a snippet is followed by
        ``<a>url</a>`` when a URL was scraped at the same position.
        """
        upstream = self.get_input()
        query = " - ".join(upstream["content"]) if "content" in upstream else ""
        if not query:
            # BaiduParam only defines `top_n`; fall back to an empty answer
            # when the parameter object carries no `no` attribute.
            return Baidu.be_output(getattr(self._param, "no", ""))

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
        # Let requests build the query string so spaces, '&', '#', etc. in the
        # search text are percent-encoded instead of corrupting the URL.
        # The timeout keeps a stalled connection from blocking the run forever.
        response = requests.get(
            url='https://www.baidu.com/s',
            params={'wd': query, 'rn': self._param.top_n},
            headers=headers,
            timeout=10,
        )

        snippets = re.findall(r'"contentText":"(.*?)"', response.text)
        links = re.findall(r"'url': \\\"(.*?)\\\"}", response.text)
        # Pair snippets and links by position; snippets beyond the number of
        # scraped links are kept unchanged.
        for idx, (text, link) in enumerate(zip(snippets, links)):
            snippets[idx] = text + '<a>' + link + '</a>'

        br = pd.DataFrame(snippets, columns=['content'])
        # NOTE(review): unconditional debug dump to stdout, kept from the original.
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", br)
        return br
|
graph/component/keyword.py
CHANGED
@@ -22,18 +22,17 @@ from graph.settings import DEBUG
|
|
22 |
|
23 |
|
24 |
class KeywordExtractParam(GenerateParam):
|
25 |
-
|
26 |
"""
|
27 |
Define the KeywordExtract component parameters.
|
28 |
"""
|
|
|
29 |
def __init__(self):
|
30 |
super().__init__()
|
31 |
-
self.
|
32 |
-
self.prompt = ""
|
33 |
-
self.topn = 1
|
34 |
|
35 |
def check(self):
|
36 |
super().check()
|
|
|
37 |
|
38 |
def get_prompt(self):
|
39 |
self.prompt = """
|
@@ -43,12 +42,12 @@ class KeywordExtractParam(GenerateParam):
|
|
43 |
- Use comma as a delimiter to separate keywords/phrases.
|
44 |
- Answer format: (in language of user's question)
|
45 |
- keyword:
|
46 |
-
"""%self.
|
47 |
return self.prompt
|
48 |
|
49 |
|
50 |
class KeywordExtract(Generate, ABC):
|
51 |
-
component_name = "
|
52 |
|
53 |
def _run(self, history, **kwargs):
|
54 |
q = ""
|
@@ -64,5 +63,3 @@ class KeywordExtract(Generate, ABC):
|
|
64 |
ans = re.sub(r".*keyword:", "", ans).strip()
|
65 |
if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::")
|
66 |
return KeywordExtract.be_output(ans)
|
67 |
-
|
68 |
-
|
|
|
22 |
|
23 |
|
24 |
class KeywordExtractParam(GenerateParam):
|
|
|
25 |
"""
|
26 |
Define the KeywordExtract component parameters.
|
27 |
"""
|
28 |
+
|
29 |
def __init__(self):
|
30 |
super().__init__()
|
31 |
+
self.top_n = 1
|
|
|
|
|
32 |
|
33 |
def check(self):
|
34 |
super().check()
|
35 |
+
self.check_positive_integer(self.top_n, "Top N")
|
36 |
|
37 |
def get_prompt(self):
|
38 |
self.prompt = """
|
|
|
42 |
- Use comma as a delimiter to separate keywords/phrases.
|
43 |
- Answer format: (in language of user's question)
|
44 |
- keyword:
|
45 |
+
""" % self.top_n
|
46 |
return self.prompt
|
47 |
|
48 |
|
49 |
class KeywordExtract(Generate, ABC):
|
50 |
+
component_name = "KeywordExtract"
|
51 |
|
52 |
def _run(self, history, **kwargs):
|
53 |
q = ""
|
|
|
63 |
ans = re.sub(r".*keyword:", "", ans).strip()
|
64 |
if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::")
|
65 |
return KeywordExtract.be_output(ans)
|
|
|
|
graph/test/dsl_examples/retrieval_relevant_keyword_baidu_and_generate.json
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"components": {
|
3 |
+
"begin": {
|
4 |
+
"obj":{
|
5 |
+
"component_name": "Begin",
|
6 |
+
"params": {
|
7 |
+
"prologue": "Hi there!"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"downstream": ["answer:0"],
|
11 |
+
"upstream": []
|
12 |
+
},
|
13 |
+
"answer:0": {
|
14 |
+
"obj": {
|
15 |
+
"component_name": "Answer",
|
16 |
+
"params": {}
|
17 |
+
},
|
18 |
+
"downstream": ["retrieval:0"],
|
19 |
+
"upstream": ["begin"]
|
20 |
+
},
|
21 |
+
"retrieval:0": {
|
22 |
+
"obj": {
|
23 |
+
"component_name": "Retrieval",
|
24 |
+
"params": {
|
25 |
+
"similarity_threshold": 0.2,
|
26 |
+
"keywords_similarity_weight": 0.3,
|
27 |
+
"top_n": 6,
|
28 |
+
"top_k": 1024,
|
29 |
+
"rerank_id": "BAAI/bge-reranker-v2-m3",
|
30 |
+
"kb_ids": ["21ca4e6a2c8911ef8b1e0242ac120006"],
|
31 |
+
"empty_response": "Sorry, knowledge base has noting related information."
|
32 |
+
}
|
33 |
+
},
|
34 |
+
"downstream": ["relevant:0"],
|
35 |
+
"upstream": ["answer:0"]
|
36 |
+
},
|
37 |
+
"relevant:0": {
|
38 |
+
"obj": {
|
39 |
+
"component_name": "Relevant",
|
40 |
+
"params": {
|
41 |
+
"llm_id": "deepseek-chat",
|
42 |
+
"temperature": 0.02,
|
43 |
+
"yes": "generate:0",
|
44 |
+
"no": "keyword:0"
|
45 |
+
}
|
46 |
+
},
|
47 |
+
"downstream": ["keyword:0", "generate:0"],
|
48 |
+
"upstream": ["retrieval:0"]
|
49 |
+
},
|
50 |
+
"generate:0": {
|
51 |
+
"obj": {
|
52 |
+
"component_name": "Generate",
|
53 |
+
"params": {
|
54 |
+
"llm_id": "deepseek-chat",
|
55 |
+
"prompt": "You are an intelligent assistant. Please answer the question based on content of knowledge base. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\". Answers need to consider chat history.\n Knowledge base content is as following:\n {input}\n The above is the content of knowledge base.",
|
56 |
+
"temperature": 0.2
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"downstream": ["answer:0"],
|
60 |
+
"upstream": ["relevant:0"]
|
61 |
+
},
|
62 |
+
"keyword:0": {
|
63 |
+
"obj": {
|
64 |
+
"component_name": "KeywordExtract",
|
65 |
+
"params": {
|
66 |
+
"llm_id": "deepseek-chat",
|
67 |
+
"prompt": "- Role: You're a question analyzer.\n - Requirements:\n - Summarize user's question, and give top %s important keyword/phrase.\n - Use comma as a delimiter to separate keywords/phrases.\n - Answer format: (in language of user's question)\n - keyword: ",
|
68 |
+
"temperature": 0.2,
|
69 |
+
"top_n": 1
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"downstream": ["baidu:0"],
|
73 |
+
"upstream": ["relevant:0"]
|
74 |
+
},
|
75 |
+
"baidu:0": {
|
76 |
+
"obj":{
|
77 |
+
"component_name": "Baidu",
|
78 |
+
"params": {
|
79 |
+
"top_n": 10
|
80 |
+
}
|
81 |
+
},
|
82 |
+
"downstream": ["generate:1"],
|
83 |
+
"upstream": ["keyword:0"]
|
84 |
+
},
|
85 |
+
"generate:1": {
|
86 |
+
"obj": {
|
87 |
+
"component_name": "Generate",
|
88 |
+
"params": {
|
89 |
+
"llm_id": "deepseek-chat",
|
90 |
+
"prompt": "You are an intelligent assistant. Please answer the question based on content searched from Baidu. When the answer from a Baidu search is incomplete, you need to output the URL link of the corresponding content as well. When all the content searched from Baidu is irrelevant to the question, your answer must include the sentence, \"The answer you are looking for is not found in the Baidu search!\". Answers need to consider chat history.\n The content of Baidu search is as follows:\n {input}\n The above is the content of Baidu search.",
|
91 |
+
"temperature": 0.2
|
92 |
+
}
|
93 |
+
},
|
94 |
+
"downstream": ["answer:0"],
|
95 |
+
"upstream": ["baidu:0"]
|
96 |
+
}
|
97 |
+
},
|
98 |
+
"history": [],
|
99 |
+
"path": [],
|
100 |
+
"messages": [],
|
101 |
+
"reference": {},
|
102 |
+
"answer": []
|
103 |
+
}
|