wwwlll committed
Commit 13bc594 · 1 parent: f63d19e

Add agent component for web crawler (#2878)
### What problem does this PR solve?

Add agent component for web crawler.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed:
- agent/component/__init__.py +1 -0
- agent/component/crawler.py +71 -0
- web/src/assets/svg/crawler.svg +1 -0
- web/src/locales/en.ts +10 -0
- web/src/locales/zh-traditional.ts +9 -0
- web/src/locales/zh.ts +9 -0
- web/src/pages/flow/constant.tsx +12 -0
- web/src/pages/flow/flow-drawer/index.tsx +2 -0
- web/src/pages/flow/form/crawler-form/index.tsx +37 -0
agent/component/__init__.py  CHANGED

@@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam
 from .jin10 import Jin10, Jin10Param
 from .tushare import TuShare, TuShareParam
 from .akshare import AkShare, AkShareParam
+from .crawler import Crawler, CrawlerParam
 
 
 def component_class(class_name):
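For context, a minimal smoke-test sketch of how the new export is expected to be wired up. `component_class` appears in this diff only by its signature, so the name-based lookup below is an assumption, and `crawl4ai` must be installed for the import to succeed.

```python
# Hypothetical check that the package now exposes the crawler component.
# Assumes a RAGFlow checkout with crawl4ai installed; component_class's
# lookup behavior is not shown in this diff, so resolving by name is assumed.
from agent.component import Crawler, CrawlerParam, component_class

print(component_class("Crawler") is Crawler)  # expected: True (assumed lookup by name)
print(CrawlerParam().check())                 # True — the new param class accepts any config
```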
agent/component/crawler.py  ADDED

@@ -0,0 +1,71 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from abc import ABC
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from agent.component.base import ComponentBase, ComponentParamBase
+
+class CrawlerParam(ComponentParamBase):
+    """
+    Define the Crawler component parameters.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def check(self):
+        return True
+
+
+class Crawler(ComponentBase, ABC):
+    component_name = "Crawler"
+
+    def _run(self, history, **kwargs):
+        ans = self.get_input()
+        ans = " - ".join(ans["content"]) if "content" in ans else ""
+        if not ans:
+            return Crawler.be_output("")
+        try:
+            result = asyncio.run(self.get_web(ans))
+
+            return Crawler.be_output(result)
+
+        except Exception as e:
+            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")
+
+
+    async def get_web(self, url):
+        proxy = self._param.proxy if self._param.proxy else None
+        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
+            result = await crawler.arun(
+                url=url,
+                bypass_cache=True
+            )
+
+            match self._param.extract_type:
+                case 'html':
+                    return result.cleaned_html
+                case 'markdown':
+                    return result.markdown
+                case 'content':
+                    return result.extracted_content
+                case _:
+                    return result.markdown
+            # print(result.markdown)
+
+
+
+
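For reference, the crawling logic above can be exercised outside the component framework. The sketch below mirrors `get_web`: it uses the same `crawl4ai` calls that appear in the diff (`AsyncWebCrawler(verbose=True, proxy=...)`, `arun(url=..., bypass_cache=True)`), and treats `proxy` and `extract_type` as plain arguments standing in for the fields the component reads from its `CrawlerParam` (the web form submits them as `proxy` and `extract_type`). The example URL is illustrative only.

```python
import asyncio

from crawl4ai import AsyncWebCrawler  # same dependency the component imports


async def fetch(url: str, proxy: str | None = None, extract_type: str = "markdown") -> str:
    # Mirror of Crawler.get_web: crawl once, then pick one representation of the page.
    async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
        result = await crawler.arun(url=url, bypass_cache=True)
    if extract_type == "html":
        return result.cleaned_html
    if extract_type == "content":
        return result.extracted_content
    return result.markdown  # 'markdown' and the default case


if __name__ == "__main__":
    # Illustrative invocation; any reachable URL and optional proxy will do.
    print(asyncio.run(fetch("https://example.com", extract_type="markdown")))
```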
web/src/assets/svg/crawler.svg  ADDED
web/src/locales/en.ts  CHANGED

@@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
     yahooFinance: 'YahooFinance',
     yahooFinanceDescription:
       'The component queries information about the company based on the provided ticker symbol.',
+    crawler: 'Web Crawler',
+    crawlerDescription:
+      'This component can be used to crawl HTML source code from a specified URL.',
+    proxy: 'Proxy',
+    crawlerResultOptions: {
+      html: 'Html',
+      markdown: 'Markdown',
+      content: 'Content',
+    },
+    extractType: 'extractType',
     info: 'Info',
     history: 'History',
     financials: 'Financials',
web/src/locales/zh-traditional.ts  CHANGED

@@ -877,6 +877,15 @@ export default {
     akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
     yahooFinance: '雅虎財經',
     yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
+    crawler: '網頁爬蟲',
+    crawlerDescription: '該組件可用於從指定url爬取HTML源碼。',
+    proxy: '代理',
+    crawlerResultOptions: {
+      html: 'Html',
+      markdown: 'Markdown',
+      content: '文本',
+    },
+    extractType: '提取類型',
     info: '訊息',
     history: '歷史',
     financials: '財務',
web/src/locales/zh.ts  CHANGED

@@ -897,6 +897,15 @@ export default {
     akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
     yahooFinance: '雅虎财经',
     yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
+    crawler: '网页爬虫',
+    crawlerDescription: '该组件可用于从指定url爬取html源码。',
+    proxy: '代理',
+    crawlerResultOptions: {
+      html: 'Html',
+      markdown: 'Markdown',
+      content: '文本',
+    },
+    extractType: '提取类型',
     info: '信息',
     history: '历史',
     financials: '财务',
web/src/pages/flow/constant.tsx  CHANGED

@@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
 import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
 import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
 import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
+import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
 import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
 import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
 import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
@@ -73,6 +74,7 @@ export enum Operator {
   Concentrator = 'Concentrator',
   TuShare = 'TuShare',
   Note = 'Note',
+  Crawler = 'Crawler',
 }
 
 export const CommonOperatorList = Object.values(Operator).filter(
@@ -110,6 +112,7 @@ export const operatorIconMap = {
   [Operator.Concentrator]: ConcentratorIcon,
   [Operator.TuShare]: TuShareIcon,
   [Operator.Note]: NoteIcon,
+  [Operator.Crawler]: CrawlerIcon,
 };
 
 export const operatorMap: Record<
@@ -233,6 +236,9 @@ export const operatorMap: Record<
   },
   [Operator.TuShare]: { backgroundColor: '#f8cfa0' },
   [Operator.Note]: { backgroundColor: '#f8cfa0' },
+  [Operator.Crawler]: {
+    backgroundColor: '#dee0e2',
+  },
 };
 
 export const componentMenuList = [
@@ -323,6 +329,9 @@ export const componentMenuList = [
   {
     name: Operator.TuShare,
   },
+  {
+    name: Operator.Crawler,
+  },
 ];
 
 export const initialRetrievalValues = {
@@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
   [Operator.Jin10]: [Operator.Begin],
   [Operator.Concentrator]: [Operator.Begin],
   [Operator.TuShare]: [Operator.Begin],
+  [Operator.Crawler]: [Operator.Begin],
 };
 
 export const NodeMap = {
@@ -605,6 +615,7 @@ export const NodeMap = {
   [Operator.Jin10]: 'ragNode',
   [Operator.TuShare]: 'ragNode',
   [Operator.Note]: 'noteNode',
+  [Operator.Crawler]: 'ragNode',
 };
 
 export const LanguageOptions = [
@@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
   'fenghuang',
   'jinrongjie',
 ];
+export const CrawlerResultOptions = ['markdown', 'html', 'content'];
web/src/pages/flow/flow-drawer/index.tsx  CHANGED

@@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
 import BeginForm from '../form/begin-form';
 import BingForm from '../form/bing-form';
 import CategorizeForm from '../form/categorize-form';
+import CrawlerForm from '../form/crawler-form';
 import DeepLForm from '../form/deepl-form';
 import DuckDuckGoForm from '../form/duckduckgo-form';
 import ExeSQLForm from '../form/exesql-form';
@@ -70,6 +71,7 @@ const FormMap = {
   [Operator.YahooFinance]: YahooFinanceForm,
   [Operator.Jin10]: Jin10Form,
   [Operator.TuShare]: TuShareForm,
+  [Operator.Crawler]: CrawlerForm,
 };
 
 const EmptyContent = () => <div>empty</div>;
web/src/pages/flow/form/crawler-form/index.tsx  ADDED

@@ -0,0 +1,37 @@
+import { useTranslate } from '@/hooks/common-hooks';
+import { Form, Input, Select } from 'antd';
+import { useMemo } from 'react';
+import { CrawlerResultOptions } from '../../constant';
+import { IOperatorForm } from '../../interface';
+const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
+  const { t } = useTranslate('flow');
+  const crawlerResultOptions = useMemo(() => {
+    return CrawlerResultOptions.map((x) => ({
+      value: x,
+      label: t(`crawlerResultOptions.${x}`),
+    }));
+  }, [t]);
+  return (
+    <Form
+      name="basic"
+      labelCol={{ span: 6 }}
+      wrapperCol={{ span: 18 }}
+      autoComplete="off"
+      form={form}
+      onValuesChange={onValuesChange}
+    >
+      <Form.Item label={t('proxy')} name={'proxy'}>
+        <Input placeholder="like: http://127.0.0.1:8888"></Input>
+      </Form.Item>
+      <Form.Item
+        label={t('extractType')}
+        name={'extract_type'}
+        initialValue="markdown"
+      >
+        <Select options={crawlerResultOptions}></Select>
+      </Form.Item>
+    </Form>
+  );
+};
+
+export default CrawlerForm;