balibabu
committed on
Commit
·
980e3c4
1
Parent(s):
2d8207d
Implements RAPTOR for better chunking #882 (#883)
Browse files
### What problem does this PR solve?
Implements RAPTOR for better chunking #882
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- web/src/components/chunk-method-modal/index.tsx +7 -0
- web/src/components/parse-configuration/index.tsx +206 -0
- web/src/locales/en.ts +20 -0
- web/src/locales/zh-traditional.ts +19 -0
- web/src/locales/zh.ts +19 -0
- web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx +16 -9
- web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts +1 -1
web/src/components/chunk-method-modal/index.tsx
CHANGED
@@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';
|
|
23 |
|
24 |
import { useTranslate } from '@/hooks/commonHooks';
|
25 |
import LayoutRecognize from '../layout-recognize';
|
|
|
|
|
|
|
26 |
import styles from './index.less';
|
27 |
|
28 |
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
|
@@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
|
|
111 |
onCancel={hideModal}
|
112 |
afterClose={afterClose}
|
113 |
confirmLoading={loading}
|
|
|
114 |
>
|
115 |
<Space size={[0, 8]} wrap>
|
116 |
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
|
@@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
|
|
255 |
</Form.Item>
|
256 |
)}
|
257 |
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
|
|
|
|
|
|
|
258 |
</Form>
|
259 |
</Modal>
|
260 |
);
|
|
|
23 |
|
24 |
import { useTranslate } from '@/hooks/commonHooks';
|
25 |
import LayoutRecognize from '../layout-recognize';
|
26 |
+
import ParseConfiguration, {
|
27 |
+
showRaptorParseConfiguration,
|
28 |
+
} from '../parse-configuration';
|
29 |
import styles from './index.less';
|
30 |
|
31 |
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
|
|
|
114 |
onCancel={hideModal}
|
115 |
afterClose={afterClose}
|
116 |
confirmLoading={loading}
|
117 |
+
width={700}
|
118 |
>
|
119 |
<Space size={[0, 8]} wrap>
|
120 |
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
|
|
|
259 |
</Form.Item>
|
260 |
)}
|
261 |
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
|
262 |
+
{showRaptorParseConfiguration(selectedTag) && (
|
263 |
+
<ParseConfiguration></ParseConfiguration>
|
264 |
+
)}
|
265 |
</Form>
|
266 |
</Modal>
|
267 |
);
|
web/src/components/parse-configuration/index.tsx
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { useTranslate } from '@/hooks/commonHooks';
|
2 |
+
import { PlusOutlined } from '@ant-design/icons';
|
3 |
+
import {
|
4 |
+
Button,
|
5 |
+
Divider,
|
6 |
+
Flex,
|
7 |
+
Form,
|
8 |
+
Input,
|
9 |
+
InputNumber,
|
10 |
+
Slider,
|
11 |
+
Switch,
|
12 |
+
} from 'antd';
|
13 |
+
import random from 'lodash/random';
|
14 |
+
|
15 |
+
/** Parser ids for which the RAPTOR configuration section is hidden. */
export const excludedParseMethods = [
  'table',
  'resume',
  'one',
];

/**
 * Decides whether the RAPTOR configuration section is shown for a parser.
 *
 * @param parserId - Id of the currently selected chunking/parsing method.
 * @returns `true` unless `parserId` is listed in `excludedParseMethods`.
 */
export const showRaptorParseConfiguration = (parserId: string) =>
  excludedParseMethods.every((excludedId) => excludedId !== parserId);
|
20 |
+
|
21 |
+
// The three types "table", "resume" and "one" do not display this configuration.
|
22 |
+
/**
 * RAPTOR settings section of a parser-configuration form.
 *
 * Must be rendered inside an antd `<Form>`: the enclosing form instance is
 * obtained through `Form.useFormInstance()` and every field is registered
 * under the `parser_config.raptor.*` name path.
 *
 * A switch (`use_raptor`, initially off) controls whether the detailed
 * fields (prompt, max token, threshold, max cluster, random seed) are
 * rendered. Labels, tooltips and validation messages come from the
 * `knowledgeConfiguration` translation namespace.
 */
const ParseConfiguration = () => {
  const form = Form.useFormInstance();
  const { t } = useTranslate('knowledgeConfiguration');

  // Writes a fresh lodash `random(10000)` value into the random-seed field.
  const handleGenerate = () => {
    form.setFieldValue(
      ['parser_config', 'raptor', 'random_seed'],
      random(10000),
    );
  };

  return (
    <>
      <Divider></Divider>
      <Form.Item
        name={['parser_config', 'raptor', 'use_raptor']}
        label={t('useRaptor')}
        initialValue={false}
        valuePropName="checked"
        tooltip={t('useRaptorTip')}
      >
        <Switch />
      </Form.Item>
      <Form.Item
        // Re-render the dependent fields only when the switch value changes.
        // Optional chaining keeps the comparison from throwing when either
        // snapshot of the form values has no `parser_config` or `raptor`
        // entry.
        shouldUpdate={(prevValues, curValues) =>
          prevValues?.parser_config?.raptor?.use_raptor !==
          curValues?.parser_config?.raptor?.use_raptor
        }
      >
        {({ getFieldValue }) => {
          const useRaptor = getFieldValue([
            'parser_config',
            'raptor',
            'use_raptor',
          ]);

          return (
            useRaptor && (
              <>
                <Form.Item
                  name={['parser_config', 'raptor', 'prompt']}
                  label={t('prompt')}
                  initialValue={t('promptText')}
                  tooltip={t('promptTip')}
                  rules={[
                    {
                      required: true,
                      message: t('promptMessage'),
                    },
                  ]}
                >
                  <Input.TextArea rows={8} />
                </Form.Item>
                {/* Slider and number input share one field name, so they stay in sync. */}
                <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_token']}
                        noStyle
                        initialValue={128}
                        rules={[
                          {
                            required: true,
                            message: t('maxTokenMessage'),
                          },
                        ]}
                      >
                        <Slider max={2048} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_token']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxTokenMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={2048} min={0} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'threshold']}
                        noStyle
                        initialValue={0.1}
                        rules={[
                          {
                            required: true,
                            message: t('thresholdMessage'),
                          },
                        ]}
                      >
                        <Slider
                          min={0}
                          max={1}
                          style={{ width: '100%' }}
                          step={0.01}
                        />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'threshold']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('thresholdMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1} min={0} step={0.01} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_cluster']}
                        noStyle
                        initialValue={64}
                        rules={[
                          {
                            required: true,
                            message: t('maxClusterMessage'),
                          },
                        ]}
                      >
                        <Slider min={1} max={1024} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_cluster']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxClusterMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1024} min={1} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('randomSeed')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'random_seed']}
                        noStyle
                        initialValue={0}
                        rules={[
                          {
                            required: true,
                            message: t('randomSeedMessage'),
                          },
                        ]}
                      >
                        <InputNumber style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    {/* Clicking the button replaces the seed with a generated value. */}
                    <Form.Item noStyle>
                      <Button type="primary" onClick={handleGenerate}>
                        <PlusOutlined />
                      </Button>
                    </Form.Item>
                  </Flex>
                </Form.Item>
              </>
            )
          );
        }}
      </Form.Item>
    </>
  );
};
|
205 |
+
|
206 |
+
export default ParseConfiguration;
|
web/src/locales/en.ts
CHANGED
@@ -265,6 +265,26 @@ export default {
|
|
265 |
</p><p>
|
266 |
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
267 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
},
|
269 |
chunk: {
|
270 |
chunk: 'Chunk',
|
|
|
265 |
</p><p>
|
266 |
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
267 |
</p>`,
|
268 |
+
useRaptor: 'Use RAPTOR to enhance retrieval',
|
269 |
+
useRaptorTip:
|
270 |
+
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
|
271 |
+
prompt: 'Prompt',
|
272 |
+
promptTip: 'LLM prompt used for summarization.',
|
273 |
+
promptMessage: 'Prompt is required',
|
274 |
+
promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
|
275 |
+
{cluster_content}
|
276 |
+
The above is the content you need to summarize.`,
|
277 |
+
maxToken: 'Max token',
|
278 |
+
maxTokenTip: 'Maximum token number for summarization.',
|
279 |
+
maxTokenMessage: 'Max token is required',
|
280 |
+
threshold: 'Threshold',
|
281 |
+
thresholdTip: 'The bigger the threshold is the less cluster will be.',
|
282 |
+
thresholdMessage: 'Threshold is required',
|
283 |
+
maxCluster: 'Max cluster',
|
284 |
+
maxClusterTip: 'Maximum cluster number.',
|
285 |
+
maxClusterMessage: 'Max cluster is required',
|
286 |
+
randomSeed: 'Random seed',
|
287 |
+
randomSeedMessage: 'Random seed is required',
|
288 |
},
|
289 |
chunk: {
|
290 |
chunk: 'Chunk',
|
web/src/locales/zh-traditional.ts
CHANGED
@@ -238,6 +238,25 @@ export default {
|
|
238 |
</p><p>
|
239 |
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
240 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
},
|
242 |
chunk: {
|
243 |
chunk: '解析塊',
|
|
|
238 |
</p><p>
|
239 |
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
240 |
</p>`,
|
241 |
+
useRaptor: '使用RAPTOR文件增強策略',
|
242 |
+
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
|
243 |
+
prompt: '提示詞',
|
244 |
+
promptMessage: '提示詞是必填項',
|
245 |
+
promptText: `请請總結以下段落。 小心數字,不要編造。 段落如下:
|
246 |
+
{集群內容}
|
247 |
+
以上就是你需要總結的內容。`,
|
248 |
+
maxToken: '最大token數',
|
249 |
+
maxTokenMessage: '最大token數是必填項',
|
250 |
+
threshold: '臨界點',
|
251 |
+
thresholdMessage: '臨界點是必填項',
|
252 |
+
maxCluster: '最大聚類數',
|
253 |
+
maxClusterMessage: '最大聚類數是必填項',
|
254 |
+
randomSeed: '隨機種子',
|
255 |
+
randomSeedMessage: '隨機種子是必填項',
|
256 |
+
promptTip: 'LLM提示用於總結。',
|
257 |
+
maxTokenTip: '用於匯總的最大token數。',
|
258 |
+
thresholdTip: '閾值越大,聚類越少。',
|
259 |
+
maxClusterTip: '最大聚類數。',
|
260 |
},
|
261 |
chunk: {
|
262 |
chunk: '解析塊',
|
web/src/locales/zh.ts
CHANGED
@@ -255,6 +255,25 @@ export default {
|
|
255 |
</p><p>
|
256 |
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
257 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
},
|
259 |
chunk: {
|
260 |
chunk: '解析块',
|
|
|
255 |
</p><p>
|
256 |
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
257 |
</p>`,
|
258 |
+
useRaptor: '使用召回增强RAPTOR策略',
|
259 |
+
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
|
260 |
+
prompt: '提示词',
|
261 |
+
promptMessage: '提示词是必填项',
|
262 |
+
promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
|
263 |
+
{集群内容}
|
264 |
+
以上就是你需要总结的内容。`,
|
265 |
+
maxToken: '最大token数',
|
266 |
+
maxTokenMessage: '最大token数是必填项',
|
267 |
+
threshold: '临界点',
|
268 |
+
thresholdMessage: '临界点是必填项',
|
269 |
+
maxCluster: '最大聚类数',
|
270 |
+
maxClusterMessage: '最大聚类数是必填项',
|
271 |
+
randomSeed: '随机种子',
|
272 |
+
randomSeedMessage: '随机种子是必填项',
|
273 |
+
promptTip: 'LLM提示用于总结。',
|
274 |
+
maxTokenTip: '用于汇总的最大token数。',
|
275 |
+
thresholdTip: '阈值越大,聚类越少。',
|
276 |
+
maxClusterTip: '最大聚类数。',
|
277 |
},
|
278 |
chunk: {
|
279 |
chunk: '解析块',
|
web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
CHANGED
@@ -8,6 +8,9 @@ import {
|
|
8 |
|
9 |
import LayoutRecognize from '@/components/layout-recognize';
|
10 |
import MaxTokenNumber from '@/components/max-token-number';
|
|
|
|
|
|
|
11 |
import { useTranslate } from '@/hooks/commonHooks';
|
12 |
import { FormInstance } from 'antd/lib';
|
13 |
import styles from './index.less';
|
@@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
|
|
99 |
{({ getFieldValue }) => {
|
100 |
const parserId = getFieldValue('parser_id');
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
111 |
}}
|
112 |
</Form.Item>
|
113 |
|
|
|
8 |
|
9 |
import LayoutRecognize from '@/components/layout-recognize';
|
10 |
import MaxTokenNumber from '@/components/max-token-number';
|
11 |
+
import ParseConfiguration, {
|
12 |
+
showRaptorParseConfiguration,
|
13 |
+
} from '@/components/parse-configuration';
|
14 |
import { useTranslate } from '@/hooks/commonHooks';
|
15 |
import { FormInstance } from 'antd/lib';
|
16 |
import styles from './index.less';
|
|
|
102 |
{({ getFieldValue }) => {
|
103 |
const parserId = getFieldValue('parser_id');
|
104 |
|
105 |
+
return (
|
106 |
+
<>
|
107 |
+
{parserId === 'naive' && (
|
108 |
+
<>
|
109 |
+
<MaxTokenNumber></MaxTokenNumber>
|
110 |
+
<LayoutRecognize></LayoutRecognize>
|
111 |
+
</>
|
112 |
+
)}
|
113 |
+
{showRaptorParseConfiguration(parserId) && (
|
114 |
+
<ParseConfiguration></ParseConfiguration>
|
115 |
+
)}
|
116 |
+
</>
|
117 |
+
);
|
118 |
}}
|
119 |
</Form.Item>
|
120 |
|
web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
CHANGED
@@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
|
62 |
'embd_id',
|
63 |
'parser_id',
|
64 |
'language',
|
65 |
-
'parser_config
|
66 |
]),
|
67 |
avatar: fileList,
|
68 |
});
|
|
|
62 |
'embd_id',
|
63 |
'parser_id',
|
64 |
'language',
|
65 |
+
'parser_config',
|
66 |
]),
|
67 |
avatar: fileList,
|
68 |
});
|