balibabu committed on
Commit
980e3c4
·
1 Parent(s): 2d8207d

Implements RAPTOR for better chunking #882 (#883)

Browse files

### What problem does this PR solve?

Implements RAPTOR for better chunking #882

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

web/src/components/chunk-method-modal/index.tsx CHANGED
@@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';
23
 
24
  import { useTranslate } from '@/hooks/commonHooks';
25
  import LayoutRecognize from '../layout-recognize';
 
 
 
26
  import styles from './index.less';
27
 
28
  interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
@@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
111
  onCancel={hideModal}
112
  afterClose={afterClose}
113
  confirmLoading={loading}
 
114
  >
115
  <Space size={[0, 8]} wrap>
116
  <Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
@@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
255
  </Form.Item>
256
  )}
257
  {showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
 
 
 
258
  </Form>
259
  </Modal>
260
  );
 
23
 
24
  import { useTranslate } from '@/hooks/commonHooks';
25
  import LayoutRecognize from '../layout-recognize';
26
+ import ParseConfiguration, {
27
+ showRaptorParseConfiguration,
28
+ } from '../parse-configuration';
29
  import styles from './index.less';
30
 
31
  interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
 
114
  onCancel={hideModal}
115
  afterClose={afterClose}
116
  confirmLoading={loading}
117
+ width={700}
118
  >
119
  <Space size={[0, 8]} wrap>
120
  <Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
 
259
  </Form.Item>
260
  )}
261
  {showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
262
+ {showRaptorParseConfiguration(selectedTag) && (
263
+ <ParseConfiguration></ParseConfiguration>
264
+ )}
265
  </Form>
266
  </Modal>
267
  );
web/src/components/parse-configuration/index.tsx ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useTranslate } from '@/hooks/commonHooks';
2
+ import { PlusOutlined } from '@ant-design/icons';
3
+ import {
4
+ Button,
5
+ Divider,
6
+ Flex,
7
+ Form,
8
+ Input,
9
+ InputNumber,
10
+ Slider,
11
+ Switch,
12
+ } from 'antd';
13
+ import random from 'lodash/random';
14
+
15
+ export const excludedParseMethods = ['table', 'resume', 'one'];
16
+
17
+ export const showRaptorParseConfiguration = (parserId: string) => {
18
+ return !excludedParseMethods.includes(parserId);
19
+ };
20
+
21
+ // The three types "table", "resume" and "one" do not display this configuration.
22
+ const ParseConfiguration = () => {
23
+ const form = Form.useFormInstance();
24
+ const { t } = useTranslate('knowledgeConfiguration');
25
+
26
+ const handleGenerate = () => {
27
+ form.setFieldValue(
28
+ ['parser_config', 'raptor', 'random_seed'],
29
+ random(10000),
30
+ );
31
+ };
32
+
33
+ return (
34
+ <>
35
+ <Divider></Divider>
36
+ <Form.Item
37
+ name={['parser_config', 'raptor', 'use_raptor']}
38
+ label={t('useRaptor')}
39
+ initialValue={false}
40
+ valuePropName="checked"
41
+ tooltip={t('useRaptorTip')}
42
+ >
43
+ <Switch />
44
+ </Form.Item>
45
+ <Form.Item
46
+ shouldUpdate={(prevValues, curValues) =>
47
+ prevValues.parser_config.raptor.use_raptor !==
48
+ curValues.parser_config.raptor.use_raptor
49
+ }
50
+ >
51
+ {({ getFieldValue }) => {
52
+ const useRaptor = getFieldValue([
53
+ 'parser_config',
54
+ 'raptor',
55
+ 'use_raptor',
56
+ ]);
57
+
58
+ return (
59
+ useRaptor && (
60
+ <>
61
+ <Form.Item
62
+ name={['parser_config', 'raptor', 'prompt']}
63
+ label={t('prompt')}
64
+ initialValue={t('promptText')}
65
+ tooltip={t('promptTip')}
66
+ rules={[
67
+ {
68
+ required: true,
69
+ message: t('promptMessage'),
70
+ },
71
+ ]}
72
+ >
73
+ <Input.TextArea rows={8} />
74
+ </Form.Item>
75
+ <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
76
+ <Flex gap={20} align="center">
77
+ <Flex flex={1}>
78
+ <Form.Item
79
+ name={['parser_config', 'raptor', 'max_token']}
80
+ noStyle
81
+ initialValue={128}
82
+ rules={[
83
+ {
84
+ required: true,
85
+ message: t('maxTokenMessage'),
86
+ },
87
+ ]}
88
+ >
89
+ <Slider max={2048} style={{ width: '100%' }} />
90
+ </Form.Item>
91
+ </Flex>
92
+ <Form.Item
93
+ name={['parser_config', 'raptor', 'max_token']}
94
+ noStyle
95
+ rules={[
96
+ {
97
+ required: true,
98
+ message: t('maxTokenMessage'),
99
+ },
100
+ ]}
101
+ >
102
+ <InputNumber max={2048} min={0} />
103
+ </Form.Item>
104
+ </Flex>
105
+ </Form.Item>
106
+ <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
107
+ <Flex gap={20} align="center">
108
+ <Flex flex={1}>
109
+ <Form.Item
110
+ name={['parser_config', 'raptor', 'threshold']}
111
+ noStyle
112
+ initialValue={0.1}
113
+ rules={[
114
+ {
115
+ required: true,
116
+ message: t('thresholdMessage'),
117
+ },
118
+ ]}
119
+ >
120
+ <Slider
121
+ min={0}
122
+ max={1}
123
+ style={{ width: '100%' }}
124
+ step={0.01}
125
+ />
126
+ </Form.Item>
127
+ </Flex>
128
+ <Form.Item
129
+ name={['parser_config', 'raptor', 'threshold']}
130
+ noStyle
131
+ rules={[
132
+ {
133
+ required: true,
134
+ message: t('thresholdMessage'),
135
+ },
136
+ ]}
137
+ >
138
+ <InputNumber max={1} min={0} step={0.01} />
139
+ </Form.Item>
140
+ </Flex>
141
+ </Form.Item>
142
+ <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
143
+ <Flex gap={20} align="center">
144
+ <Flex flex={1}>
145
+ <Form.Item
146
+ name={['parser_config', 'raptor', 'max_cluster']}
147
+ noStyle
148
+ initialValue={64}
149
+ rules={[
150
+ {
151
+ required: true,
152
+ message: t('maxClusterMessage'),
153
+ },
154
+ ]}
155
+ >
156
+ <Slider min={1} max={1024} style={{ width: '100%' }} />
157
+ </Form.Item>
158
+ </Flex>
159
+ <Form.Item
160
+ name={['parser_config', 'raptor', 'max_cluster']}
161
+ noStyle
162
+ rules={[
163
+ {
164
+ required: true,
165
+ message: t('maxClusterMessage'),
166
+ },
167
+ ]}
168
+ >
169
+ <InputNumber max={1024} min={1} />
170
+ </Form.Item>
171
+ </Flex>
172
+ </Form.Item>
173
+ <Form.Item label={t('randomSeed')}>
174
+ <Flex gap={20} align="center">
175
+ <Flex flex={1}>
176
+ <Form.Item
177
+ name={['parser_config', 'raptor', 'random_seed']}
178
+ noStyle
179
+ initialValue={0}
180
+ rules={[
181
+ {
182
+ required: true,
183
+ message: t('randomSeedMessage'),
184
+ },
185
+ ]}
186
+ >
187
+ <InputNumber style={{ width: '100%' }} />
188
+ </Form.Item>
189
+ </Flex>
190
+ <Form.Item noStyle>
191
+ <Button type="primary" onClick={handleGenerate}>
192
+ <PlusOutlined />
193
+ </Button>
194
+ </Form.Item>
195
+ </Flex>
196
+ </Form.Item>
197
+ </>
198
+ )
199
+ );
200
+ }}
201
+ </Form.Item>
202
+ </>
203
+ );
204
+ };
205
+
206
+ export default ParseConfiguration;
web/src/locales/en.ts CHANGED
@@ -265,6 +265,26 @@ export default {
265
  </p><p>
266
  If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
267
  </p>`,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  },
269
  chunk: {
270
  chunk: 'Chunk',
 
265
  </p><p>
266
  If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
267
  </p>`,
268
+ useRaptor: 'Use RAPTOR to enhance retrieval',
269
+ useRaptorTip:
270
+ 'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
271
+ prompt: 'Prompt',
272
+ promptTip: 'LLM prompt used for summarization.',
273
+ promptMessage: 'Prompt is required',
274
+ promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
275
+ {cluster_content}
276
+ The above is the content you need to summarize.`,
277
+ maxToken: 'Max token',
278
+ maxTokenTip: 'Maximum token number for summarization.',
279
+ maxTokenMessage: 'Max token is required',
280
+ threshold: 'Threshold',
281
+ thresholdTip: 'The bigger the threshold is the less cluster will be.',
282
+ thresholdMessage: 'Threshold is required',
283
+ maxCluster: 'Max cluster',
284
+ maxClusterTip: 'Maximum cluster number.',
285
+ maxClusterMessage: 'Max cluster is required',
286
+ randomSeed: 'Random seed',
287
+ randomSeedMessage: 'Random seed is required',
288
  },
289
  chunk: {
290
  chunk: 'Chunk',
web/src/locales/zh-traditional.ts CHANGED
@@ -238,6 +238,25 @@ export default {
238
  </p><p>
239
  如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
240
  </p>`,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  },
242
  chunk: {
243
  chunk: '解析塊',
 
238
  </p><p>
239
  如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
240
  </p>`,
241
+ useRaptor: '使用RAPTOR文件增強策略',
242
+ useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
243
+ prompt: '提示詞',
244
+ promptMessage: '提示詞是必填項',
245
+ promptText: `請總結以下段落。 小心數字,不要編造。 段落如下:
246
+ {cluster_content}
247
+ 以上就是你需要總結的內容。`,
248
+ maxToken: '最大token數',
249
+ maxTokenMessage: '最大token數是必填項',
250
+ threshold: '臨界點',
251
+ thresholdMessage: '臨界點是必填項',
252
+ maxCluster: '最大聚類數',
253
+ maxClusterMessage: '最大聚類數是必填項',
254
+ randomSeed: '隨機種子',
255
+ randomSeedMessage: '隨機種子是必填項',
256
+ promptTip: 'LLM提示用於總結。',
257
+ maxTokenTip: '用於匯總的最大token數。',
258
+ thresholdTip: '閾值越大,聚類越少。',
259
+ maxClusterTip: '最大聚類數。',
260
  },
261
  chunk: {
262
  chunk: '解析塊',
web/src/locales/zh.ts CHANGED
@@ -255,6 +255,25 @@ export default {
255
  </p><p>
256
  如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
257
  </p>`,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  },
259
  chunk: {
260
  chunk: '解析块',
 
255
  </p><p>
256
  如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
257
  </p>`,
258
+ useRaptor: '使用召回增强RAPTOR策略',
259
+ useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
260
+ prompt: '提示词',
261
+ promptMessage: '提示词是必填项',
262
+ promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
263
+ {cluster_content}
264
+ 以上就是你需要总结的内容。`,
265
+ maxToken: '最大token数',
266
+ maxTokenMessage: '最大token数是必填项',
267
+ threshold: '临界点',
268
+ thresholdMessage: '临界点是必填项',
269
+ maxCluster: '最大聚类数',
270
+ maxClusterMessage: '最大聚类数是必填项',
271
+ randomSeed: '随机种子',
272
+ randomSeedMessage: '随机种子是必填项',
273
+ promptTip: 'LLM提示用于总结。',
274
+ maxTokenTip: '用于汇总的最大token数。',
275
+ thresholdTip: '阈值越大,聚类越少。',
276
+ maxClusterTip: '最大聚类数。',
277
  },
278
  chunk: {
279
  chunk: '解析块',
web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx CHANGED
@@ -8,6 +8,9 @@ import {
8
 
9
  import LayoutRecognize from '@/components/layout-recognize';
10
  import MaxTokenNumber from '@/components/max-token-number';
 
 
 
11
  import { useTranslate } from '@/hooks/commonHooks';
12
  import { FormInstance } from 'antd/lib';
13
  import styles from './index.less';
@@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
99
  {({ getFieldValue }) => {
100
  const parserId = getFieldValue('parser_id');
101
 
102
- if (parserId === 'naive') {
103
- return (
104
- <>
105
- <MaxTokenNumber></MaxTokenNumber>
106
- <LayoutRecognize></LayoutRecognize>
107
- </>
108
- );
109
- }
110
- return null;
 
 
 
 
111
  }}
112
  </Form.Item>
113
 
 
8
 
9
  import LayoutRecognize from '@/components/layout-recognize';
10
  import MaxTokenNumber from '@/components/max-token-number';
11
+ import ParseConfiguration, {
12
+ showRaptorParseConfiguration,
13
+ } from '@/components/parse-configuration';
14
  import { useTranslate } from '@/hooks/commonHooks';
15
  import { FormInstance } from 'antd/lib';
16
  import styles from './index.less';
 
102
  {({ getFieldValue }) => {
103
  const parserId = getFieldValue('parser_id');
104
 
105
+ return (
106
+ <>
107
+ {parserId === 'naive' && (
108
+ <>
109
+ <MaxTokenNumber></MaxTokenNumber>
110
+ <LayoutRecognize></LayoutRecognize>
111
+ </>
112
+ )}
113
+ {showRaptorParseConfiguration(parserId) && (
114
+ <ParseConfiguration></ParseConfiguration>
115
+ )}
116
+ </>
117
+ );
118
  }}
119
  </Form.Item>
120
 
web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts CHANGED
@@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
62
  'embd_id',
63
  'parser_id',
64
  'language',
65
- 'parser_config.chunk_token_num',
66
  ]),
67
  avatar: fileList,
68
  });
 
62
  'embd_id',
63
  'parser_id',
64
  'language',
65
+ 'parser_config',
66
  ]),
67
  avatar: fileList,
68
  });