wwwlll committed on
Commit
13bc594
·
1 Parent(s): f63d19e

Add agent component for web crawler (#2878)

Browse files

### What problem does this PR solve?

Add agent component for web crawler

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

agent/component/__init__.py CHANGED
@@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam
28
  from .jin10 import Jin10, Jin10Param
29
  from .tushare import TuShare, TuShareParam
30
  from .akshare import AkShare, AkShareParam
 
31
 
32
 
33
  def component_class(class_name):
 
28
  from .jin10 import Jin10, Jin10Param
29
  from .tushare import TuShare, TuShareParam
30
  from .akshare import AkShare, AkShareParam
31
+ from .crawler import Crawler, CrawlerParam
32
 
33
 
34
  def component_class(class_name):
agent/component/crawler.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ from abc import ABC
17
+ import asyncio
18
+ from crawl4ai import AsyncWebCrawler
19
+ from agent.component.base import ComponentBase, ComponentParamBase
20
+
21
class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.

    Both attributes are given defaults so that the Crawler component can
    read them even when the canvas configuration does not set them.
    """

    def __init__(self):
        super().__init__()
        # Optional proxy URL, e.g. "http://127.0.0.1:8888".
        # None (or an empty value) means no proxy is passed to the crawler.
        self.proxy = None
        # Which part of the crawl result to return: 'html', 'markdown' or
        # 'content'. 'markdown' matches the initial value used by the form.
        self.extract_type = "markdown"

    def check(self):
        """Validate the parameters; currently every configuration is accepted."""
        # No validation of extract_type is done here: the Crawler component
        # treats any unrecognised value as 'markdown'.
        return True
31
+
32
+
33
class Crawler(ComponentBase, ABC):
    """Agent component that crawls a URL and outputs the extracted page content."""

    component_name = "Crawler"

    def _run(self, history, **kwargs):
        """
        Crawl the URL supplied by the upstream component.

        The upstream "content" entries are joined with " - " and the result is
        used as the URL (presumably a single entry in practice; verify against
        callers). Empty input yields an empty output; any failure while
        crawling is reported as text in the output instead of being raised.
        """
        ans = self.get_input()
        url = " - ".join(ans["content"]) if "content" in ans else ""
        # Surrounding whitespace/newlines are never part of a valid URL.
        url = url.strip()
        if not url:
            return Crawler.be_output("")

        try:
            # get_web is a coroutine; drive it to completion synchronously.
            result = asyncio.run(self.get_web(url))
            return Crawler.be_output(result)
        except Exception as e:
            # Best effort: surface the error message as the component output.
            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")

    async def get_web(self, url):
        """
        Fetch ``url`` with AsyncWebCrawler and return the requested representation.

        The representation is selected by the ``extract_type`` parameter:
        'html' -> cleaned HTML, 'content' -> extracted content, and
        'markdown' or any other value -> markdown.
        """
        # Tolerate parameter objects that do not define these attributes.
        proxy = getattr(self._param, "proxy", None) or None
        extract_type = getattr(self._param, "extract_type", "markdown")

        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            # bypass_cache=True is passed so the page is requested afresh.
            result = await crawler.arun(url=url, bypass_cache=True)

            if extract_type == "html":
                return result.cleaned_html
            if extract_type == "content":
                return result.extracted_content
            # 'markdown' and every unrecognised value fall back to markdown.
            return result.markdown
68
+
69
+
70
+
71
+
web/src/assets/svg/crawler.svg ADDED
web/src/locales/en.ts CHANGED
@@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
928
  yahooFinance: 'YahooFinance',
929
  yahooFinanceDescription:
930
  'The component queries information about the company based on the provided ticker symbol.',
 
 
 
 
 
 
 
 
 
 
931
  info: 'Info',
932
  history: 'History',
933
  financials: 'Financials',
 
928
  yahooFinance: 'YahooFinance',
929
  yahooFinanceDescription:
930
  'The component queries information about the company based on the provided ticker symbol.',
931
+ crawler: 'Web Crawler',
932
+ crawlerDescription:
933
+ 'This component can be used to crawl HTML source code from a specified URL.',
934
+ proxy: 'Proxy',
935
+ crawlerResultOptions: {
936
+ html: 'Html',
937
+ markdown: 'Markdown',
938
+ content: 'Content',
939
+ },
940
+ extractType: 'extractType',
941
  info: 'Info',
942
  history: 'History',
943
  financials: 'Financials',
web/src/locales/zh-traditional.ts CHANGED
@@ -877,6 +877,15 @@ export default {
877
  akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
878
  yahooFinance: '雅虎財經',
879
  yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
 
 
 
 
 
 
 
 
 
880
  info: '訊息',
881
  history: '歷史',
882
  financials: '財務',
 
877
  akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
878
  yahooFinance: '雅虎財經',
879
  yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
880
+ crawler: '網頁爬蟲',
881
+ crawlerDescription: '該組件可用於從指定url爬取HTML源碼。',
882
+ proxy: '代理',
883
+ crawlerResultOptions: {
884
+ html: 'Html',
885
+ markdown: 'Markdown',
886
+ content: '文本',
887
+ },
888
+ extractType: '提取類型',
889
  info: '訊息',
890
  history: '歷史',
891
  financials: '財務',
web/src/locales/zh.ts CHANGED
@@ -897,6 +897,15 @@ export default {
897
  akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
898
  yahooFinance: '雅虎财经',
899
  yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
 
 
 
 
 
 
 
 
 
900
  info: '信息',
901
  history: '历史',
902
  financials: '财务',
 
897
  akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
898
  yahooFinance: '雅虎财经',
899
  yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
900
+ crawler: '网页爬虫',
901
+ crawlerDescription: '该组件可用于从指定url爬取html源码。',
902
+ proxy: '代理',
903
+ crawlerResultOptions: {
904
+ html: 'Html',
905
+ markdown: 'Markdown',
906
+ content: '文本',
907
+ },
908
+ extractType: '提取类型',
909
  info: '信息',
910
  history: '历史',
911
  financials: '财务',
web/src/pages/flow/constant.tsx CHANGED
@@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
4
  import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
5
  import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
6
  import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
 
7
  import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
8
  import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
9
  import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
@@ -73,6 +74,7 @@ export enum Operator {
73
  Concentrator = 'Concentrator',
74
  TuShare = 'TuShare',
75
  Note = 'Note',
 
76
  }
77
 
78
  export const CommonOperatorList = Object.values(Operator).filter(
@@ -110,6 +112,7 @@ export const operatorIconMap = {
110
  [Operator.Concentrator]: ConcentratorIcon,
111
  [Operator.TuShare]: TuShareIcon,
112
  [Operator.Note]: NoteIcon,
 
113
  };
114
 
115
  export const operatorMap: Record<
@@ -233,6 +236,9 @@ export const operatorMap: Record<
233
  },
234
  [Operator.TuShare]: { backgroundColor: '#f8cfa0' },
235
  [Operator.Note]: { backgroundColor: '#f8cfa0' },
 
 
 
236
  };
237
 
238
  export const componentMenuList = [
@@ -323,6 +329,9 @@ export const componentMenuList = [
323
  {
324
  name: Operator.TuShare,
325
  },
 
 
 
326
  ];
327
 
328
  export const initialRetrievalValues = {
@@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
572
  [Operator.Jin10]: [Operator.Begin],
573
  [Operator.Concentrator]: [Operator.Begin],
574
  [Operator.TuShare]: [Operator.Begin],
 
575
  };
576
 
577
  export const NodeMap = {
@@ -605,6 +615,7 @@ export const NodeMap = {
605
  [Operator.Jin10]: 'ragNode',
606
  [Operator.TuShare]: 'ragNode',
607
  [Operator.Note]: 'noteNode',
 
608
  };
609
 
610
  export const LanguageOptions = [
@@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
2791
  'fenghuang',
2792
  'jinrongjie',
2793
  ];
 
 
4
  import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
5
  import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
6
  import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
7
+ import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
8
  import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
9
  import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
10
  import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
 
74
  Concentrator = 'Concentrator',
75
  TuShare = 'TuShare',
76
  Note = 'Note',
77
+ Crawler = 'Crawler',
78
  }
79
 
80
  export const CommonOperatorList = Object.values(Operator).filter(
 
112
  [Operator.Concentrator]: ConcentratorIcon,
113
  [Operator.TuShare]: TuShareIcon,
114
  [Operator.Note]: NoteIcon,
115
+ [Operator.Crawler]: CrawlerIcon,
116
  };
117
 
118
  export const operatorMap: Record<
 
236
  },
237
  [Operator.TuShare]: { backgroundColor: '#f8cfa0' },
238
  [Operator.Note]: { backgroundColor: '#f8cfa0' },
239
+ [Operator.Crawler]: {
240
+ backgroundColor: '#dee0e2',
241
+ },
242
  };
243
 
244
  export const componentMenuList = [
 
329
  {
330
  name: Operator.TuShare,
331
  },
332
+ {
333
+ name: Operator.Crawler,
334
+ },
335
  ];
336
 
337
  export const initialRetrievalValues = {
 
581
  [Operator.Jin10]: [Operator.Begin],
582
  [Operator.Concentrator]: [Operator.Begin],
583
  [Operator.TuShare]: [Operator.Begin],
584
+ [Operator.Crawler]: [Operator.Begin],
585
  };
586
 
587
  export const NodeMap = {
 
615
  [Operator.Jin10]: 'ragNode',
616
  [Operator.TuShare]: 'ragNode',
617
  [Operator.Note]: 'noteNode',
618
+ [Operator.Crawler]: 'ragNode',
619
  };
620
 
621
  export const LanguageOptions = [
 
2802
  'fenghuang',
2803
  'jinrongjie',
2804
  ];
2805
+ export const CrawlerResultOptions = ['markdown', 'html', 'content'];
web/src/pages/flow/flow-drawer/index.tsx CHANGED
@@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
12
  import BeginForm from '../form/begin-form';
13
  import BingForm from '../form/bing-form';
14
  import CategorizeForm from '../form/categorize-form';
 
15
  import DeepLForm from '../form/deepl-form';
16
  import DuckDuckGoForm from '../form/duckduckgo-form';
17
  import ExeSQLForm from '../form/exesql-form';
@@ -70,6 +71,7 @@ const FormMap = {
70
  [Operator.YahooFinance]: YahooFinanceForm,
71
  [Operator.Jin10]: Jin10Form,
72
  [Operator.TuShare]: TuShareForm,
 
73
  };
74
 
75
  const EmptyContent = () => <div>empty</div>;
 
12
  import BeginForm from '../form/begin-form';
13
  import BingForm from '../form/bing-form';
14
  import CategorizeForm from '../form/categorize-form';
15
+ import CrawlerForm from '../form/crawler-form';
16
  import DeepLForm from '../form/deepl-form';
17
  import DuckDuckGoForm from '../form/duckduckgo-form';
18
  import ExeSQLForm from '../form/exesql-form';
 
71
  [Operator.YahooFinance]: YahooFinanceForm,
72
  [Operator.Jin10]: Jin10Form,
73
  [Operator.TuShare]: TuShareForm,
74
+ [Operator.Crawler]: CrawlerForm,
75
  };
76
 
77
  const EmptyContent = () => <div>empty</div>;
web/src/pages/flow/form/crawler-form/index.tsx ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useTranslate } from '@/hooks/common-hooks';
2
+ import { Form, Input, Select } from 'antd';
3
+ import { useMemo } from 'react';
4
+ import { CrawlerResultOptions } from '../../constant';
5
+ import { IOperatorForm } from '../../interface';
6
// Configuration form for the Crawler operator: an optional proxy address and
// the type of content to extract from the crawled page.
const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
  const { t } = useTranslate('flow');

  // Translated choices for the extract-type selector; recomputed only when
  // the translation function changes.
  const extractTypeOptions = useMemo(
    () =>
      CrawlerResultOptions.map((option) => ({
        value: option,
        label: t(`crawlerResultOptions.${option}`),
      })),
    [t],
  );

  return (
    <Form
      name="basic"
      labelCol={{ span: 6 }}
      wrapperCol={{ span: 18 }}
      autoComplete="off"
      form={form}
      onValuesChange={onValuesChange}
    >
      <Form.Item label={t('proxy')} name="proxy">
        <Input placeholder="like: http://127.0.0.1:8888" />
      </Form.Item>
      {/* Field name matches the backend parameter `extract_type`. */}
      <Form.Item
        label={t('extractType')}
        name="extract_type"
        initialValue="markdown"
      >
        <Select options={extractTypeOptions} />
      </Form.Item>
    </Form>
  );
};
36
+
37
+ export default CrawlerForm;