liuhua committed
Commit 811d178 · 1 Parent(s): 7d7dfcb

Refactor Document API (#2833)


### What problem does this PR solve?

Refactor Document API

### Type of change


- [x] Refactoring

Co-authored-by: liuhua <[email protected]>

api/apps/sdk/chat.py CHANGED

@@ -243,7 +243,7 @@ def list(tenant_id):
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
-    if request.args.get("desc") == "False":
+    if request.args.get("desc") == "False" or request.args.get("desc") == "false":
         desc = False
     else:
         desc = True
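
For illustration only (not part of this commit): the case-insensitive flag check that this hunk, and the matching hunks in `dataset.py` and `session.py` below, spell out by hand could be centralized in one small helper. A minimal sketch over a plain dict of query args; the helper name is hypothetical.

```python
def parse_bool_arg(args, key, default=True):
    """Return False when the raw value is 'false' in any casing, else the default."""
    raw = args.get(key)
    if raw is None:
        return default          # parameter absent: keep the handler's default (True)
    return raw.lower() != "false"

# Mirrors the refactored handlers: "False" and "false" both disable descending sort.
assert parse_bool_arg({"desc": "False"}, "desc") is False
assert parse_bool_arg({"desc": "false"}, "desc") is False
assert parse_bool_arg({}, "desc") is True
```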
api/apps/sdk/dataset.py CHANGED

@@ -107,11 +107,6 @@ def update(tenant_id,dataset_id):
     if req["tenant_id"] != tenant_id:
         return get_error_data_result(
             retmsg="Can't change tenant_id.")
-    if "embedding_model" in req:
-        if req["embedding_model"] != t.embd_id:
-            return get_error_data_result(
-                retmsg="Can't change embedding_model.")
-        req.pop("embedding_model")
     e, kb = KnowledgebaseService.get_by_id(dataset_id)
     if "chunk_count" in req:
         if req["chunk_count"] != kb.chunk_num:
@@ -128,6 +123,11 @@ def update(tenant_id,dataset_id):
         return get_error_data_result(
             retmsg="If chunk count is not 0, parse method is not changable.")
     req['parser_id'] = req.pop('parse_method')
+    if "embedding_model" in req:
+        if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
+            return get_error_data_result(
+                retmsg="If chunk count is not 0, parse method is not changable.")
+        req['embd_id'] = req.pop('embedding_model')
     if "name" in req:
         req["name"] = req["name"].strip()
         if req["name"].lower() != kb.name.lower() \
@@ -150,7 +150,7 @@ def list(tenant_id):
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
-    if request.args.get("desc") == "False":
+    if request.args.get("desc") == "False" or request.args.get("desc") == "false":
         desc = False
     else:
         desc = True
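
For context on the `update` hunk above: `embedding_model` is now mapped onto the internal `embd_id` field instead of being rejected. A sketch of exercising that path through the HTTP API documented later in this diff (assumptions: the `requests` library, placeholder address/token/id, a dataset whose chunk count is still 0, and `parse_method` sent alongside because the new guard reads it):

```python
import requests

ADDRESS = "{address}"          # placeholders, as in the curl examples below
TOKEN = "{YOUR_ACCESS_TOKEN}"
DATASET_ID = "{dataset_id}"

# PUT /api/v1/dataset/{dataset_id} with the now-updatable embedding_model field.
resp = requests.put(
    f"http://{ADDRESS}/api/v1/dataset/{DATASET_ID}",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json={"embedding_model": "BAAI/bge-zh-v1.5", "parse_method": "naive"},
)
print(resp.json())  # {"code": 0} on success
```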
api/apps/sdk/doc.py CHANGED

@@ -8,6 +8,7 @@ from botocore.docs.method import document_model_driven_method
 from flask import request
 from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
+from pygments import highlight
 from sphinx.addnodes import document
 
 from rag.app.qa import rmPrefix, beAdoc
@@ -158,7 +159,7 @@ def download(tenant_id, dataset_id, document_id):
         return get_error_data_result(retmsg=f'You do not own the dataset {dataset_id}.')
     doc = DocumentService.query(kb_id=dataset_id, id=document_id)
     if not doc:
-        return get_error_data_result(retmsg=f'The dataset not own the document {doc.id}.')
+        return get_error_data_result(retmsg=f'The dataset not own the document {document_id}.')
     # The process of downloading
     doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id)  # minio address
     file_stream = STORAGE_IMPL.get(doc_id, doc_location)
@@ -294,7 +295,7 @@ def stop_parsing(tenant_id,dataset_id):
     return get_result()
 
 
-@manager.route('/dataset/{dataset_id}/document/{document_id}/chunk', methods=['GET'])
+@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
 @token_required
 def list_chunk(tenant_id,dataset_id,document_id):
     if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
@@ -361,7 +362,7 @@ def list_chunk(tenant_id,dataset_id,document_id):
         return server_error_response(e)
 
 
-@manager.route('/dataset/{dataset_id}/document/{document_id}/chunk', methods=['POST'])
+@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
 @token_required
 def create(tenant_id,dataset_id,document_id):
     if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
@@ -369,6 +370,7 @@ def create(tenant_id,dataset_id,document_id):
     doc = DocumentService.query(id=document_id, kb_id=dataset_id)
     if not doc:
         return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
+    doc = doc[0]
     req = request.json
     if not req.get("content"):
         return get_error_data_result(retmsg="`content` is required")
@@ -418,7 +420,7 @@ def create(tenant_id,dataset_id,document_id):
     # return get_result(data={"chunk_id": chunk_id})
 
 
-@manager.route('dataset/{dataset_id}/document/{document_id}/chunk', methods=['DELETE'])
+@manager.route('dataset/<dataset_id>/document/<document_id>/chunk', methods=['DELETE'])
 @token_required
 def rm_chunk(tenant_id,dataset_id,document_id):
     if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
@@ -426,9 +428,16 @@ def rm_chunk(tenant_id,dataset_id,document_id):
     doc = DocumentService.query(id=document_id, kb_id=dataset_id)
     if not doc:
         return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
+    doc = doc[0]
     req = request.json
     if not req.get("chunk_ids"):
         return get_error_data_result("`chunk_ids` is required")
+    for chunk_id in req.get("chunk_ids"):
+        res = ELASTICSEARCH.get(
+            chunk_id, search.index_name(
+                tenant_id))
+        if not res.get("found"):
+            return server_error_response(f"Chunk {chunk_id} not found")
     if not ELASTICSEARCH.deleteByQuery(
             Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
         return get_error_data_result(retmsg="Index updating failure")
@@ -439,25 +448,26 @@ def rm_chunk(tenant_id,dataset_id,document_id):
 
 
 
-@manager.route('/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}', methods=['PUT'])
+@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
 @token_required
 def set(tenant_id,dataset_id,document_id,chunk_id):
+    res = ELASTICSEARCH.get(
+        chunk_id, search.index_name(
+            tenant_id))
+    if not res.get("found"):
+        return get_error_data_result(f"Chunk {chunk_id} not found")
     if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
         return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
     doc = DocumentService.query(id=document_id, kb_id=dataset_id)
     if not doc:
         return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
     req = request.json
-    if not req.get("content"):
-        return get_error_data_result("`content` is required")
-    if not req.get("important_keywords"):
-        return get_error_data_result("`important_keywords` is required")
     d = {
         "id": chunk_id,
-        "content_with_weight": req["content"]}
+        "content_with_weight": req.get("content",res.get["content_with_weight"])}
     d["content_ltks"] = rag_tokenizer.tokenize(req["content"])
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["important_kwd"] = req["important_keywords"]
+    d["important_kwd"] = req.get("important_keywords",[])
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
     if "available" in req:
         d["available_int"] = req["available"]
@@ -488,23 +498,27 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
 @token_required
 def retrieval_test(tenant_id):
     req = request.args
-    if not req.get("datasets"):
+    req_json = request.json
+    if not req_json.get("datasets"):
         return get_error_data_result("`datasets` is required.")
-    for id in req.get("datasets"):
+    for id in req_json.get("datasets"):
         if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
             return get_error_data_result(f"You don't own the dataset {id}.")
-    if not req.get("question"):
+    if "question" not in req_json:
         return get_error_data_result("`question` is required.")
     page = int(req.get("offset", 1))
     size = int(req.get("limit", 30))
-    question = req["question"]
-    kb_id = req["datasets"]
+    question = req_json["question"]
+    kb_id = req_json["datasets"]
     if isinstance(kb_id, str): kb_id = [kb_id]
-    doc_ids = req.get("documents", [])
-    similarity_threshold = float(req.get("similarity_threshold", 0.2))
+    doc_ids = req_json.get("documents", [])
+    similarity_threshold = float(req.get("similarity_threshold", 0.0))
     vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
     top = int(req.get("top_k", 1024))
-
+    if req.get("highlight")=="False" or req.get("highlight")=="false":
+        highlight = False
+    else:
+        highlight = True
     try:
         e, kb = KnowledgebaseService.get_by_id(kb_id[0])
         if not e:
@@ -524,7 +538,7 @@ def retrieval_test(tenant_id):
         retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
         ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
                                similarity_threshold, vector_similarity_weight, top,
-                               doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
+                               doc_ids, rerank_mdl=rerank_mdl, highlight=highlight)
         for c in ranks["chunks"]:
             if "vector" in c:
                 del c["vector"]
@@ -543,11 +557,11 @@ def retrieval_test(tenant_id):
             for key, value in chunk.items():
                 new_key = key_mapping.get(key, key)
                 rename_chunk[new_key] = value
-                renamed_chunks.append(rename_chunk)
+            renamed_chunks.append(rename_chunk)
         ranks["chunks"] = renamed_chunks
         return get_result(data=ranks)
     except Exception as e:
         if str(e).find("not_found") > 0:
-            return get_result(retmsg=f'No chunk found! Check the chunk status please!',
+            return get_result(retmsg=f'No chunk found! Check the chunk statu s please!',
                               retcode=RetCode.DATA_ERROR)
         return server_error_response(e)
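
The `retrieval_test` hunk above splits its inputs: required fields (`datasets`, `question`, `documents`) now arrive in the JSON body via `req_json`, while numeric knobs and the new case-insensitive `highlight` flag stay in the query string. A self-contained Flask sketch of that pattern (the `/retrieval` route and error codes here are illustrative, not RAGFlow's actual registration):

```python
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/retrieval", methods=["POST"])
def retrieval():
    body = request.get_json(silent=True) or {}   # required fields travel in the body
    if not body.get("datasets"):
        return jsonify(code=102, message="`datasets` is required."), 400
    if "question" not in body:
        return jsonify(code=102, message="`question` is required."), 400
    page = int(request.args.get("offset", 1))    # tuning knobs stay in the query string
    size = int(request.args.get("limit", 30))
    highlight = request.args.get("highlight", "").lower() != "false"
    return jsonify(code=0, data={"question": body["question"], "page": page,
                                 "size": size, "highlight": highlight})

if __name__ == "__main__":
    app.run()
```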
api/apps/sdk/session.py CHANGED

@@ -163,7 +163,7 @@ def list(chat_id,tenant_id):
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
-    if request.args.get("desc") == "False":
+    if request.args.get("desc") == "False" or request.args.get("desc") == "false":
        desc = False
    else:
        desc = True
api/http_api.md CHANGED

@@ -5,7 +5,7 @@
 
 **POST** `/api/v1/dataset`
 
-Creates a knowledge base (dataset).
+Creates a dataset.
 
 ### Request
 
@@ -31,11 +31,11 @@ Creates a knowledge base (dataset).
 #### Request example
 
 ```bash
-# "id": "id" must not be provided.
-# "name": name is required and cannot be duplicated.
+# "id": id must not be provided.
+# "name": name is required and can't be duplicated.
 # "tenant_id": tenant_id must not be provided.
-# "embedding_model": REQUIRED.
-# "naive": general.
+# "embedding_model": embedding_model must not be provided.
+# "navie" means general.
 curl --request POST \
 --url http://{address}/api/v1/dataset \
 --header 'Content-Type: application/json' \
@@ -51,21 +51,21 @@ curl --request POST \
 #### Request parameters
 
 - `"id"`: (*Body parameter*)
-  The unique identifier of each created dataset.
-  - When creating a dataset, `id` must not be provided.
+  The ID of the created dataset used to uniquely identify different datasets.
+  - If creating a dataset, `id` must not be provided.
 
 - `"name"`: (*Body parameter*)
   The name of the dataset, which must adhere to the following requirements:
   - Required when creating a dataset and must be unique.
-  - When updating a dataset, `name` must still be unique.
+  - If updating a dataset, `name` must still be unique.
 
 - `"avatar"`: (*Body parameter*)
   Base64 encoding of the avatar.
 
 - `"tenant_id"`: (*Body parameter*)
   The ID of the tenant associated with the dataset, used to link it with specific users.
-  - When creating a dataset, `tenant_id` must not be provided.
-  - When updating a dataset, `tenant_id` cannot be changed.
+  - If creating a dataset, `tenant_id` must not be provided.
+  - If updating a dataset, `tenant_id` cannot be changed.
 
 - `"description"`: (*Body parameter*)
   The description of the dataset.
@@ -74,31 +74,31 @@ curl --request POST \
   The language setting for the dataset.
 
 - `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset for generating vector embeddings.
-  - When creating a dataset, `embedding_model` must not be provided.
-  - When updating a dataset, `embedding_model` cannot be changed.
+  Embedding model used in the dataset to generate vector embeddings.
+  - If creating a dataset, `embedding_model` must not be provided.
+  - If updating a dataset, `embedding_model` cannot be changed.
 
 - `"permission"`: (*Body parameter*)
   Specifies who can manipulate the dataset.
 
 - `"document_count"`: (*Body parameter*)
   Document count of the dataset.
-  - When updating a dataset, `document_count` cannot be changed.
+  - If updating a dataset, `document_count` cannot be changed.
 
 - `"chunk_count"`: (*Body parameter*)
   Chunk count of the dataset.
-  - When updating a dataset, `chunk_count` cannot be changed.
+  - If updating a dataset, `chunk_count` cannot be changed.
 
 - `"parse_method"`: (*Body parameter*)
   Parsing method of the dataset.
-  - When updating `parse_method`, `chunk_count` must be greater than 0.
+  - If updating `parse_method`, `chunk_count` must be greater than 0.
 
 - `"parser_config"`: (*Body parameter*)
   The configuration settings for the dataset parser.
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
 ```json
 {
@@ -139,7 +139,8 @@ A successful response includes a JSON object like the following:
 - `"error_code"`: `integer`
   `0`: The operation succeeds.
 
-An error response includes a JSON object like the following:
+
+The error response includes a JSON object like the following:
 
 ```json
 {
@@ -152,7 +153,7 @@ An error response includes a JSON object like the following:
 
 **DELETE** `/api/v1/dataset`
 
-Deletes datasets by their IDs.
+Deletes datasets by ids.
 
 ### Request
 
@@ -168,7 +169,7 @@ Deletes datasets by their IDs.
 #### Request example
 
 ```bash
-# Specify either "ids" or "names", NOT both.
+# Either id or name must be provided, but not both.
 curl --request DELETE \
 --url http://{address}/api/v1/dataset \
 --header 'Content-Type: application/json' \
@@ -180,13 +181,13 @@ curl --request DELETE \
 
 #### Request parameters
 
-- `"ids"`: (*Body parameter*)
-  IDs of the datasets to delete.
+- `"ids"`: (*Body parameter*)
+  Dataset IDs to delete.
 
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
 ```json
 {
@@ -198,7 +199,7 @@ A successful response includes a JSON object like the following:
   `0`: The operation succeeds.
 
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
 ```json
 {
@@ -211,7 +212,7 @@ An error response includes a JSON object like the following:
 
 **PUT** `/api/v1/dataset/{dataset_id}`
 
-Updates a dataset by its ID.
+Updates a dataset by its id.
 
 ### Request
 
@@ -226,14 +227,14 @@ Updates a dataset by its ID.
 #### Request example
 
 ```bash
-# "id": REQUIRED
-# "name": If you update "name", it cannot be duplicated.
-# "tenant_id": If you update "tenant_id", it cannot be changed
-# "embedding_model": If you update "embedding_model", it cannot be changed.
-# "chunk_count": If you update "chunk_count", it cannot be changed.
-# "document_count": If you update "document_count", it cannot be changed.
-# "parse_method": If you update "parse_method", "chunk_count" must be 0.
-# "naive": General.
+# "id": id is required.
+# "name": If you update name, it can't be duplicated.
+# "tenant_id": If you update tenant_id, it can't be changed
+# "embedding_model": If you update embedding_model, it can't be changed.
+# "chunk_count": If you update chunk_count, it can't be changed.
+# "document_count": If you update document_count, it can't be changed.
+# "parse_method": If you update parse_method, chunk_count must be 0.
+# "navie" means general.
 curl --request PUT \
 --url http://{address}/api/v1/dataset/{dataset_id} \
 --header 'Content-Type: application/json' \
@@ -244,18 +245,17 @@ curl --request PUT \
     "embedding_model": "BAAI/bge-zh-v1.5",
     "chunk_count": 0,
     "document_count": 0,
-    "parse_method": "naive"
+    "parse_method": "navie"
 }'
 ```
 
 #### Request parameters
-
-See the "Create Dataset" for the complete structure of the request parameters.
+(Refer to the "Create Dataset" for the complete structure of the request parameters.)
 
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
 ```json
 {
@@ -267,7 +267,7 @@ A successful response includes a JSON object like the following:
   `0`: The operation succeeds.
 
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
 ```json
 {
@@ -321,7 +321,7 @@ curl --request GET \
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
 ```json
 {
@@ -365,7 +365,7 @@ A successful response includes a JSON object like the following:
 ```
 
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
 ```json
 {
@@ -392,12 +392,12 @@ Uploads files to a dataset.
 
 #### Request example
 
-```shell
+```bash
 curl --request POST \
 --url http://{address}/api/v1/dataset/{dataset_id}/document \
 --header 'Content-Type: multipart/form-data' \
---header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
---form 'file=@test.txt'
+--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+--form 'file=@./test.txt'
 ```
 
 #### Request parameters
@@ -409,9 +409,9 @@ curl --request POST \
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
-```shell
+```json
 {
     "code": 0
 }
@@ -421,12 +421,12 @@ A successful response includes a JSON object like the following:
   `0`: The operation succeeds.
 
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't connect database"
+    "code": 101,
+    "message": "No file part!"
 }
 ```
 
@@ -447,7 +447,7 @@ Downloads files from a dataset.
 - '{FILE_NAME}'
 #### Request example
 
-```shell
+```bash
 curl --request GET \
 --url http://{address}/api/v1/dataset/{dataset_id}/document/{documents_id} \
 --header 'Content-Type: application/json' \
@@ -464,31 +464,29 @@ curl --request GET \
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
-```shell
-{
-    "code": 0
-}
+```text
+test_2.
 ```
 
 - `"error_code"`: `integer`
   `0`: The operation succeeds.
 
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't connect database"
+    "code": 102,
+    "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
 }
 ```
 
 
 ## List files of a dataset
 
-**GET** `/api/v1/dataset/{dataset_id}/info?keywords={keyword}&page={page}&page_size={limit}&orderby={orderby}&desc={desc}&name={name}`
+**GET** `/api/v1/dataset/{dataset_id}/info?offset={offset}&limit={limit}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}`
 
 List files to a dataset.
 
@@ -502,48 +500,47 @@ List files to a dataset.
 
 #### Request example
 
-```shell
+```bash
 curl --request GET \
---url http://{address}/api/v1/dataset/{dataset_id}/info?keywords=rag&page=0&page_size=10&orderby=create_time&desc=yes \
---header 'Content-Type: application/json' \
---header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+--url http://{address}/api/v1/dataset/{dataset_id}/info?offset={offset}&limit={limit}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id} \
+--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 ```
 
 #### Request parameters
 
 - `"dataset_id"`: (*PATH parameter*)
   The dataset id
+- `offset`: (*Filter parameter*)
+  The beginning number of records for paging.
 - `keywords`: (*Filter parameter*)
   The keywords matches the search key workds;
-- `page`: (*Filter parameter*)
-  The current page number to retrieve from the paginated data. This parameter determines which set of records will be fetched.
-- `page_size`: (*Filter parameter*)
-  The number of records to retrieve per page. This controls how many records will be included in each page.
+- `limit`: (*Filter parameter*)
+  Records number to return.
 - `orderby`: (*Filter parameter*)
   The field by which the records should be sorted. This specifies the attribute or column used to order the results.
 - `desc`: (*Filter parameter*)
  A boolean flag indicating whether the sorting should be in descending order.
-- `name`: (*Filter parameter*)
-  File name.
+- `id`: (*Filter parameter*)
+  The id of the document to be got.
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
-```shell
+```json
 {
     "code": 0,
     "data": {
         "docs": [
            {
                 "chunk_count": 0,
-                "create_date": "Wed, 18 Sep 2024 08:20:49 GMT",
-                "create_time": 1726647649379,
-                "created_by": "134408906b6811efbcd20242ac120005",
-                "id": "e970a94a759611efae5b0242ac120004",
-                "knowledgebase_id": "e95f574e759611efbc850242ac120004",
-                "location": "Test Document222.txt",
-                "name": "Test Document222.txt",
+                "create_date": "Mon, 14 Oct 2024 09:11:01 GMT",
+                "create_time": 1728897061948,
+                "created_by": "69736c5e723611efb51b0242ac120007",
+                "id": "3bcfbf8a8a0c11ef8aba0242ac120006",
+                "knowledgebase_id": "7898da028a0511efbf750242ac120005",
+                "location": "Test_2.txt",
+                "name": "Test_2.txt",
                 "parser_config": {
                     "chunk_token_count": 128,
                     "delimiter": "\n!?。;!?",
@@ -556,48 +553,18 @@ A successful response includes a JSON object like the following:
                 "progress": 0.0,
                 "progress_msg": "",
                 "run": "0",
-                "size": 46,
+                "size": 7,
                 "source_type": "local",
                 "status": "1",
                 "thumbnail": null,
                 "token_count": 0,
                 "type": "doc",
-                "update_date": "Wed, 18 Sep 2024 08:20:49 GMT",
-                "update_time": 1726647649379
-            },
-            {
-                "chunk_count": 0,
-                "create_date": "Wed, 18 Sep 2024 08:20:49 GMT",
-                "create_time": 1726647649340,
-                "created_by": "134408906b6811efbcd20242ac120005",
-                "id": "e96aad9c759611ef9ab60242ac120004",
-                "knowledgebase_id": "e95f574e759611efbc850242ac120004",
-                "location": "Test Document111.txt",
-                "name": "Test Document111.txt",
-                "parser_config": {
-                    "chunk_token_count": 128,
-                    "delimiter": "\n!?。;!?",
-                    "layout_recognize": true,
-                    "task_page_size": 12
-                },
-                "parser_method": "naive",
-                "process_begin_at": null,
-                "process_duation": 0.0,
-                "progress": 0.0,
-                "progress_msg": "",
-                "run": "0",
-                "size": 46,
-                "source_type": "local",
-                "status": "1",
-                "thumbnail": null,
-                "token_count": 0,
-                "type": "doc",
-                "update_date": "Wed, 18 Sep 2024 08:20:49 GMT",
-                "update_time": 1726647649340
+                "update_date": "Mon, 14 Oct 2024 09:11:01 GMT",
+                "update_time": 1728897061948
             }
         ],
-        "total": 2
-    },
+        "total": 1
+    }
 }
 ```
 
@@ -605,12 +572,12 @@ A successful response includes a JSON object like the following:
   `0`: The operation succeeds.
 
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't connect database"
+    "code": 102,
+    "message": "You don't own the dataset 7898da028a0511efbf750242ac1220005. "
 }
 ```
 
@@ -623,56 +590,114 @@ Update a file in a dataset
 ### Request
 
 - Method: PUT
-- URL: `/api/v1/dataset/{dataset_id}/document`
+- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 
 #### Request example
 
-```shell
+```bash
 curl --request PUT \
---url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \
---header 'Content-Type: application/json' \
---header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
---raw '{
-    "document_id": "f6b170ac758811efa0660242ac120004",
-    "document_name": "manual.txt",
-    "thumbnail": null,
-    "knowledgebase_id": "779333c0758611ef910f0242ac120004",
-    "parser_method": "manual",
-    "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12},
-    "source_type": "local", "type": "doc",
-    "created_by": "134408906b6811efbcd20242ac120005",
-    "size": 0, "token_count": 0, "chunk_count": 0,
-    "progress": 0.0,
-    "progress_msg": "",
-    "process_begin_at": null,
-    "process_duration": 0.0
-}'
+--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
+--header 'Authorization: Bearer {YOUR_ACCESS TOKEN}' \
+--header 'Content-Type: application/json' \
+--data '{
+    "name": "manual.txt",
+    "thumbnail": null,
+    "knowledgebase_id": "779333c0758611ef910f0242ac120004",
+    "parser_method": "manual",
+    "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12},
+    "source_type": "local", "type": "doc",
+    "created_by": "134408906b6811efbcd20242ac120005",
+    "size": 0, "token_count": 0, "chunk_count": 0,
+    "progress": 0.0,
+    "progress_msg": "",
+    "process_begin_at": null,
+    "process_duration": 0.0
+}'
+
 ```
 
 #### Request parameters
 
-- `"document_id"`: (*Body parameter*)
-- `"document_name"`: (*Body parameter*)
+- `"thumbnail"`: (*Body parameter*)
+  Thumbnail image of the document.
+  - `""`
+
+- `"knowledgebase_id"`: (*Body parameter*)
+  Knowledge base ID related to the document.
+  - `""`
+
+- `"parser_method"`: (*Body parameter*)
+  Method used to parse the document.
+  - `""`
+
+- `"parser_config"`: (*Body parameter*)
+  Configuration object for the parser.
+  - If the value is `None`, a dictionary with default values will be generated.
+
+- `"source_type"`: (*Body parameter*)
+  Source type of the document.
+  - `""`
+
+- `"type"`: (*Body parameter*)
+  Type or category of the document.
+  - `""`
+
+- `"created_by"`: (*Body parameter*)
+  Creator of the document.
+  - `""`
+
+- `"name"`: (*Body parameter*)
+  Name or title of the document.
+  - `""`
+
+- `"size"`: (*Body parameter*)
+  Size of the document in bytes or some other unit.
+  - `0`
+
+- `"token_count"`: (*Body parameter*)
+  Number of tokens in the document.
+  - `0`
+
+- `"chunk_count"`: (*Body parameter*)
+  Number of chunks the document is split into.
+  - `0`
+
+- `"progress"`: (*Body parameter*)
+  Current processing progress as a percentage.
+  - `0.0`
+
+- `"progress_msg"`: (*Body parameter*)
+  Message indicating current progress status.
+  - `""`
+
+- `"process_begin_at"`: (*Body parameter*)
+  Start time of the document processing.
+  - `None`
+
+- `"process_duration"`: (*Body parameter*)
+  Duration of the processing in seconds or minutes.
+  - `0.0`
+
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
-```shell
+```json
 {
     "code": 0
 }
 ```
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't connect database"
+    "code": 102,
+    "message": "The dataset not own the document."
 }
 ```
 
@@ -710,7 +735,7 @@ curl --request POST \
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
 ```shell
 {
@@ -718,7 +743,7 @@ A successful response includes a JSON object like the following:
 }
 ```
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
 ```shell
 {
@@ -761,7 +786,7 @@ curl --request DELETE \
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
 ```shell
 {
@@ -769,7 +794,7 @@ A successful response includes a JSON object like the following:
 }
 ```
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
 ```shell
 {
@@ -808,7 +833,7 @@ curl --request GET \
 
 ### Response
 
-A successful response includes a JSON object like the following:
+The successful response includes a JSON object like the following:
 
 ```shell
 {
@@ -863,7 +888,7 @@ A successful response includes a JSON object like the following:
 }
 ```
 
-An error response includes a JSON object like the following:
+The error response includes a JSON object like the following:
 
 ```shell
 {
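
The upload endpoint above is documented with curl; an equivalent sketch using Python's `requests` for the same multipart call (address, token, and dataset id are placeholders, exactly as in the documentation):

```python
import requests

ADDRESS = "{address}"
TOKEN = "{YOUR_ACCESS_TOKEN}"
DATASET_ID = "{dataset_id}"

# POST /api/v1/dataset/{dataset_id}/document with a multipart field named "file".
with open("./test.txt", "rb") as fh:
    resp = requests.post(
        f"http://{ADDRESS}/api/v1/dataset/{DATASET_ID}/document",
        headers={"Authorization": f"Bearer {TOKEN}"},
        files={"file": fh},
    )
print(resp.json())  # {"code": 0} on success, {"code": 101, ...} when no file part is sent
```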
api/python_api_reference.md CHANGED
@@ -24,7 +24,7 @@ Creates a knowledge base (dataset).
24
 
25
  ### Parameters
26
 
27
- #### name: *Required*
28
 
29
  The unique name of the dataset to create. It must adhere to the following requirements:
30
 
@@ -36,81 +36,70 @@ The unique name of the dataset to create. It must adhere to the following requir
36
  - Maximum 65,535 characters.
37
  - Case-insensitive.
38
 
39
- #### avatar
40
 
41
  Base64 encoding of the avatar. Defaults to `""`
42
 
43
- #### tenant_id
44
 
45
  The id of the tenant associated with the created dataset is used to identify different users. Defaults to `None`.
46
 
47
- - When creating a dataset, `tenant_id` must not be provided.
48
- - When updating a dataset, `tenant_id` cannot be changed.
49
 
50
- #### description
51
 
52
  The description of the created dataset. Defaults to `""`.
53
 
54
- #### language
55
 
56
- The language setting of the created dataset. Defaults to `"English"`.
57
 
58
- #### embedding_model
59
 
60
  The specific model used by the dataset to generate vector embeddings. Defaults to `""`.
61
 
62
- - When creating a dataset, `embedding_model` must not be provided.
63
- - When updating a dataset, `embedding_model` cannot be changed.
64
 
65
- #### permission
66
 
67
- The person who can operate on the dataset. Defaults to `"me"`.
68
 
69
- #### document_count
70
 
71
  The number of documents associated with the dataset. Defaults to `0`.
72
 
73
- :::tip NOTE
74
- When updating a dataset, `document_count` cannot be changed.
75
- :::
76
 
77
- #### chunk_count
78
 
79
  The number of data chunks generated or processed by the created dataset. Defaults to `0`.
80
 
81
- :::tip NOTE
82
- When updating a dataset, `chunk_count` cannot be changed.
83
- :::
84
-
85
- #### parse_method
86
 
87
- The method used by the dataset to parse and process data. Defaults to `"naive"`.
88
 
89
- :::tip NOTE
90
- When updating `parse_method` in a dataset, `chunk_count` must be greater than 0.
91
- :::
92
 
93
- #### parser_config
94
 
95
- The parser configuration of the dataset. A `ParserConfig` object contains the following attributes:
96
 
97
- - `chunk_token_count`: Defaults to `128`.
98
- - `layout_recognize`: Defaults to `True`.
99
- - `delimiter`: Defaults to `'\n!?。;!?'`.
100
- - `task_page_size`: Defaults to `12`.
101
 
102
  ### Returns
103
-
104
- - Success: A `dataset` object.
105
- - Failure: `Exception`
106
-
107
  ### Examples
108
 
109
  ```python
110
  from ragflow import RAGFlow
111
 
112
- rag_object = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
113
- ds = rag_object.create_dataset(name="kb_1")
114
  ```
115
 
116
  ---
@@ -118,27 +107,28 @@ ds = rag_object.create_dataset(name="kb_1")
118
  ## Delete knowledge bases
119
 
120
  ```python
121
- RAGFlow.delete_datasets(ids: list[str] = None)
122
  ```
123
-
124
- Deletes knowledge bases by name or ID.
125
-
126
  ### Parameters
127
 
128
- #### ids
 
 
129
 
130
- The IDs of the knowledge bases to delete.
131
 
132
  ### Returns
133
 
134
- - Success: No value is returned.
135
- - Failure: `Exception`
 
136
 
137
  ### Examples
138
 
139
- #### Delete knowledge bases by name
140
-
141
  ```python
 
 
 
142
  rag.delete_datasets(ids=["id_1","id_2"])
143
  ```
144
 
@@ -154,84 +144,76 @@ RAGFlow.list_datasets(
154
  desc: bool = True,
155
  id: str = None,
156
  name: str = None
157
- ) -> list[DataSet]
158
  ```
159
 
160
- Lists all knowledge bases.
161
 
162
  ### Parameters
163
 
164
- #### page
165
 
166
  The current page number to retrieve from the paginated data. This parameter determines which set of records will be fetched. Defaults to `1`.
167
 
168
- #### page_size
169
 
170
  The number of records to retrieve per page. This controls how many records will be included in each page. Defaults to `1024`.
171
 
172
- #### order_by
173
 
174
- The attribute by which the results are sorted. Defaults to `"create_time"`.
175
 
176
- #### desc
177
 
178
- Indicates whether to sort the results in descending order. Defaults to `True`.
179
 
180
- #### id
181
 
182
- The ID of the dataset to retrieve. Defaults to `None`.
183
 
184
- #### name
185
 
186
- The name of the dataset to retrieve. Defaults to `None`.
187
 
188
  ### Returns
189
 
190
- - Success: A list of `DataSet` objects representing the retrieved knowledge bases.
191
- - Failure: `Exception`.
192
-
193
- ### Examples
194
-
195
- #### Retrieve a list of knowledge bases associated with the current user
196
-
197
  ```python
198
- for ds in rag_object.list_datasets():
199
- print(ds.name)
200
  ```
201
 
202
- #### Retrieve a knowledge base by ID
203
 
204
  ```python
205
- ds = rag_object.list_datasets(id = "id_1")
206
- print(ds.name)
 
 
 
207
  ```
208
 
209
  ---
210
 
211
- ## Update knowledge base
 
212
 
213
  ```python
214
  DataSet.update(update_message: dict)
215
  ```
216
 
217
- Updates the current knowledge base.
218
-
219
- ### Parameters
220
-
221
- #### update_message
222
-
223
  ### Returns
224
 
225
- - Success: No value is returned.
226
- - Failure: `Exception`
 
227
 
228
  ### Examples
229
 
230
  ```python
231
  from ragflow import RAGFlow
232
 
233
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
234
- ds = rag.list_datasets(name="kb_1")
235
  ds.update({"parse_method":"manual", ...}}
236
  ```
237
 
@@ -249,8 +231,6 @@ RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool
249
 
250
  ### Parameters
251
 
252
- #### ds
253
-
254
  #### name
255
 
256
  #### blob
@@ -354,7 +334,7 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`.
354
  ```python
355
  from ragflow import RAGFlow
356
 
357
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
358
  doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt')
359
  print(doc)
360
  ```
@@ -376,7 +356,7 @@ bool
376
  ```python
377
  from ragflow import RAGFlow
378
 
379
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
380
  doc = rag.get_document(id="wdfxb5t547d")
381
  doc.parser_method= "manual"
382
  doc.save()
@@ -399,7 +379,7 @@ bytes of the document.
399
  ```python
400
  from ragflow import RAGFlow
401
 
402
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
403
  doc = rag.get_document(id="wdfxb5t547d")
404
  open("~/ragflow.txt", "w+").write(doc.download())
405
  print(doc)
@@ -410,7 +390,7 @@ print(doc)
410
  ## List documents
411
 
412
  ```python
413
- Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> list[Document]
414
  ```
415
 
416
  ### Parameters
@@ -425,18 +405,18 @@ The beginning number of records for paging. Defaults to `0`.
425
 
426
  #### limit: `int`
427
 
428
- Records number to return, -1 means all of them.
429
 
430
  ### Returns
431
 
432
- list[Document]
433
 
434
  ### Examples
435
 
436
  ```python
437
  from ragflow import RAGFlow
438
 
439
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
440
  ds = rag.create_dataset(name="kb_1")
441
 
442
  filename1 = "~/ragflow.txt"
@@ -466,7 +446,7 @@ description: delete success or not
466
  ```python
467
  from ragflow import RAGFlow
468
 
469
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
470
  ds = rag.create_dataset(name="kb_1")
471
 
472
  filename1 = "~/ragflow.txt"
@@ -599,7 +579,7 @@ chunk
599
  ```python
600
  from ragflow import RAGFlow
601
 
602
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
603
  doc = rag.get_document(id="wdfxb5t547d")
604
  chunk = doc.add_chunk(content="xxxxxxx")
605
  ```
@@ -621,7 +601,7 @@ bool
621
  ```python
622
  from ragflow import RAGFlow
623
 
624
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
625
  doc = rag.get_document(id="wdfxb5t547d")
626
  chunk = doc.add_chunk(content="xxxxxxx")
627
  chunk.delete()
@@ -644,7 +624,7 @@ bool
644
  ```python
645
  from ragflow import RAGFlow
646
 
647
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
648
  doc = rag.get_document(id="wdfxb5t547d")
649
  chunk = doc.add_chunk(content="xxxxxxx")
650
  chunk.content = "sdfx"
@@ -656,7 +636,7 @@ chunk.save()
656
  ## Retrieval
657
 
658
  ```python
659
- RAGFlow.retrieval(question:str, datasets:list[Dataset], document=list[Document]=None, offset:int=0, limit:int=6, similarity_threshold:float=0.1, vector_similarity_weight:float=0.3, top_k:int=1024) -> list[Chunk]
660
  ```
661
 
662
  ### Parameters
@@ -665,11 +645,11 @@ RAGFlow.retrieval(question:str, datasets:list[Dataset], document=list[Document]=
665
 
666
  The user query or query keywords. Defaults to `""`.
667
 
668
- #### datasets: `list[Dataset]`, *Required*
669
 
670
  The scope of datasets.
671
 
672
- #### document: `list[Document]`
673
 
674
  The scope of document. `None` means no limitation. Defaults to `None`.
675
 
@@ -695,14 +675,14 @@ Number of records engaged in vector cosine computaton. Defaults to `1024`.
695
 
696
  ### Returns
697
 
698
- list[Chunk]
699
 
700
  ### Examples
701
 
702
  ```python
703
  from ragflow import RAGFlow
704
 
705
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
706
  ds = rag.get_dataset(name="ragflow")
707
  name = 'ragflow_test.txt'
708
  path = 'test_data/ragflow_test.txt'
@@ -733,7 +713,7 @@ Chat APIs
733
  RAGFlow.create_chat(
734
  name: str = "assistant",
735
  avatar: str = "path",
736
- knowledgebases: list[DataSet] = ["kb1"],
737
  llm: Chat.LLM = None,
738
  prompt: Chat.Prompt = None
739
  ) -> Chat
@@ -754,7 +734,7 @@ The name of the created chat. Defaults to `"assistant"`.
754
 
755
  The icon of the created chat. Defaults to `"path"`.
756
 
757
- #### knowledgebases: `list[DataSet]`
758
 
759
  Select knowledgebases associated. Defaults to `["kb1"]`.
760
 
@@ -796,7 +776,7 @@ You are an intelligent assistant. Please summarize the content of the knowledge
796
  ```python
797
  from ragflow import RAGFlow
798
 
799
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
800
  kb = rag.get_dataset(name="kb_1")
801
  assi = rag.create_chat("Miss R", knowledgebases=[kb])
802
  ```
@@ -820,7 +800,7 @@ no return
820
  ```python
821
  from ragflow import RAGFlow
822
 
823
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
824
  kb = rag.get_knowledgebase(name="kb_1")
825
  assi = rag.create_chat("Miss R", knowledgebases=[kb])
826
  assi.update({"temperature":0.8})
@@ -831,7 +811,7 @@ assi.update({"temperature":0.8})
831
  ## Delete chats
832
 
833
  ```python
834
- RAGFlow.delete_chats(ids: list[str] = None)
835
  ```
836
  ### Parameters
837
 
@@ -851,7 +831,7 @@ no return
851
  ```python
852
  from ragflow import RAGFlow
853
 
854
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
855
  rag.delete_chats(ids=["id_1","id_2"])
856
  ```
857
 
@@ -867,7 +847,7 @@ RAGFlow.list_chats(
867
  desc: bool = True,
868
  id: str = None,
869
  name: str = None
870
- ) -> list[Chat]
871
  ```
872
 
873
  ### Parameters
@@ -910,7 +890,7 @@ A list of chat objects.
910
  ```python
911
  from ragflow import RAGFlow
912
 
913
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
914
  for assi in rag.list_chats():
915
  print(assi)
916
  ```
@@ -940,7 +920,7 @@ The id of the created session is used to identify different sessions.
940
 
941
  The name of the created session. Defaults to `"New session"`.
942
 
943
- #### messages: `list[Message]`
944
 
945
  The messages of the created session.
946
  - messages cannot be provided.
@@ -963,7 +943,7 @@ The id of associated chat
963
  ```python
964
  from ragflow import RAGFlow
965
 
966
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
967
  assi = rag.list_chats(name="Miss R")
968
  assi = assi[0]
969
  sess = assi.create_session()
@@ -985,7 +965,7 @@ no return
985
  ```python
986
  from ragflow import RAGFlow
987
 
988
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
989
  assi = rag.list_chats(name="Miss R")
990
  assi = assi[0]
991
  sess = assi.create_session("new_session")
@@ -1023,7 +1003,7 @@ The id of the message. `id` is automatically generated. Defaults to `None`. ????
1023
 
1024
  The content of the message. Defaults to `"Hi! I am your assistant, can I help you?"`.
1025
 
1026
- #### reference: `list[Chunk]`
1027
 
1028
  The auto-generated reference of the message. Each `chunk` object includes the following attributes:
1029
 
@@ -1045,7 +1025,7 @@ The auto-generated reference of the message. Each `chunk` object includes the fo
1045
  A similarity score based on vector representations. This score is obtained by converting texts, words, or objects into vectors and then calculating the cosine similarity or other distance measures between these vectors to determine the similarity in vector space. A higher value indicates greater similarity in the vector space. Defaults to `None`. ?????????????????????????????????
1046
  - **term_similarity**: `float`
1047
  The similarity score based on terms or keywords. This score is calculated by comparing the similarity of key terms between texts or datasets, typically measuring how similar two words or phrases are in meaning or context. A higher value indicates a stronger similarity between terms. Defaults to `None`. ???????????????????
1048
- - **position**: `list[string]`
1049
  Indicates the position or index of keywords or specific terms within the text. An array is typically used to mark the location of keywords or specific elements, facilitating precise operations or analysis of the text. Defaults to `None`. ??????????????
1050
 
1051
  ### Examples
@@ -1053,7 +1033,7 @@ The auto-generated reference of the message. Each `chunk` object includes the fo
1053
  ```python
1054
  from ragflow import RAGFlow
1055
 
1056
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
1057
  assi = rag.list_chats(name="Miss R")
1058
  assi = assi[0]
1059
  sess = assi.create_session()
@@ -1084,12 +1064,12 @@ Chat.list_sessions(
1084
  desc: bool = True,
1085
  id: str = None,
1086
  name: str = None
1087
- ) -> list[Session]
1088
  ```
1089
 
1090
  ### Returns
1091
 
1092
- list[Session]
1093
  description: a list of session objects, each containing information about one session.
1094
 
1095
  ### Examples
@@ -1097,7 +1077,7 @@ description: the List contains information about multiple assistant object, with
1097
  ```python
1098
  from ragflow import RAGFlow
1099
 
1100
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
1101
  assi = rag.list_chats(name="Miss R")
1102
  assi = assi[0]
1103
  for sess in assi.list_sessions():
@@ -1140,7 +1120,7 @@ The name of the chat to be retrieved.
1140
  ## Delete session
1141
 
1142
  ```python
1143
- Chat.delete_sessions(ids:list[str] = None)
1144
  ```
1145
 
1146
  ### Returns
@@ -1152,13 +1132,13 @@ no return
1152
  ```python
1153
  from ragflow import RAGFlow
1154
 
1155
- rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
1156
  assi = rag.list_chats(name="Miss R")
1157
  assi = assi[0]
1158
  assi.delete_sessions(ids=["id_1","id_2"])
1159
  ```
1160
  ### Parameters
1161
- #### ids: `list[string]`
1162
  IDs of the sessions to be deleted.
1163
  - Defaults to `None`.
1164
 
 
24
 
25
  ### Parameters
26
 
27
+ #### name: `str`, *Required*
28
 
29
  The unique name of the dataset to create. It must adhere to the following requirements:
30
 
 
36
  - Maximum 65,535 characters.
37
  - Case-insensitive.
38
 
39
+ #### avatar: `str`
40
 
41
  Base64 encoding of the avatar. Defaults to `""`.
42
 
43
+ #### tenant_id: `str`
44
 
45
  The ID of the tenant associated with the dataset, used to identify different users. Defaults to `None`.
46
 
47
+ - If creating a dataset, tenant_id must not be provided.
48
+ - If updating a dataset, tenant_id can't be changed.
49
 
50
+ #### description: `str`
51
 
52
  The description of the created dataset. Defaults to `""`.
53
 
54
+ #### language: `str`
55
 
56
+ The language setting of the created dataset. Defaults to `"English"`.
57
 
58
+ #### embedding_model: `str`
59
 
60
  The specific model used by the dataset to generate vector embeddings. Defaults to `""`.
61
 
62
+ - If creating a dataset, embedding_model must not be provided.
63
+ - If updating a dataset, embedding_model can be changed only when chunk_count is 0.
64
 
65
+ #### permission: `str`
66
 
67
+ Specifies who can operate on the dataset. Defaults to `"me"`.
68
 
69
+ #### document_count: `int`
70
 
71
  The number of documents associated with the dataset. Defaults to `0`.
72
 
73
+ - If updating a dataset, `document_count` can't be changed.
 
 
74
 
75
+ #### chunk_count: `int`
76
 
77
  The number of data chunks generated or processed by the created dataset. Defaults to `0`.
78
 
79
+ - If updating a dataset, chunk_count can't be changed.
 
 
 
 
80
 
81
+ #### parse_method: `str`
82
 
83
+ The method used by the dataset to parse and process data.
 
 
84
 
85
+ - If updating a dataset, parse_method can be changed only when chunk_count is 0. Defaults to `"naive"`.
86
 
87
+ #### parser_config: `Dataset.ParserConfig`
88
 
89
+ The configuration settings for the parser used by the dataset.
 
 
 
90
 
91
  ### Returns
92
+ ```python
93
+ DataSet
94
+ description: the created dataset object
95
+ ```
96
  ### Examples
97
 
98
  ```python
99
  from ragflow import RAGFlow
100
 
101
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
102
+ ds = rag.create_dataset(name="kb_1")
103
  ```
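For reference, a sketch of creating a dataset with several of the optional parameters above, assuming the SDK accepts them as keyword arguments as the parameter list suggests (all values are placeholders):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
# description, language and permission are documented parameters; values are illustrative.
ds = rag.create_dataset(
    name="kb_2",
    description="Product manuals",
    language="English",
    permission="me",
)
print(ds)  # the returned DataSet object
```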
104
 
105
  ---
 
107
  ## Delete knowledge bases
108
 
109
  ```python
110
+ RAGFlow.delete_datasets(ids: List[str] = None)
111
  ```
112
+ Deletes knowledge bases.
 
 
113
  ### Parameters
114
 
115
+ #### ids: `List[str]`
116
+
117
+ The IDs of the datasets to be deleted.
118
 
 
119
 
120
  ### Returns
121
 
122
+ ```python
123
+ no return
124
+ ```
125
 
126
  ### Examples
127
 
 
 
128
  ```python
129
+ from ragflow import RAGFlow
130
+
131
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
132
  rag.delete_datasets(ids=["id_1","id_2"])
133
  ```
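The ids need not be hard-coded; they can come from a lookup. A sketch, assuming each returned `DataSet` exposes an `id` attribute:

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
# Find the datasets by name first, then delete them by id.
stale = rag.list_datasets(name="kb_1")
rag.delete_datasets(ids=[ds.id for ds in stale])
```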
134
 
 
144
  desc: bool = True,
145
  id: str = None,
146
  name: str = None
147
+ ) -> List[DataSet]
148
  ```
149
 
150
+ Lists all knowledge bases in the RAGFlow system.
151
 
152
  ### Parameters
153
 
154
+ #### page: `int`
155
 
156
  The current page number to retrieve from the paginated data. This parameter determines which set of records will be fetched. Defaults to `1`.
157
 
158
+ #### page_size: `int`
159
 
160
  The number of records to retrieve per page. This controls how many records will be included in each page. Defaults to `1024`.
161
 
162
+ #### orderby: `str`
163
 
164
+ The field by which the records should be sorted. This specifies the attribute or column used to order the results. Defaults to `"create_time"`.
165
 
166
+ #### desc: `bool`
167
 
168
+ Whether the sorting should be in descending order. Defaults to `True`.
169
 
170
+ #### id: `str`
171
 
172
+ The ID of the dataset to retrieve. Defaults to `None`.
173
 
174
+ #### name: `str`
175
 
176
+ The name of the dataset to retrieve. Defaults to `None`.
177
 
178
  ### Returns
179
 
 
 
 
 
 
 
 
180
  ```python
181
+ List[DataSet]
182
+ description: the list of datasets.
183
  ```
184
 
185
+ ### Examples
186
 
187
  ```python
188
+ from ragflow import RAGFlow
189
+
190
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
191
+ for ds in rag.list_datasets():
192
+ print(ds)
193
  ```
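The pagination and sorting parameters combine as expected. A sketch using the parameter names documented above (and assuming `DataSet` exposes a `name` attribute):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
# Second page of ten datasets, oldest first.
for ds in rag.list_datasets(page=2, page_size=10, orderby="create_time", desc=False):
    print(ds.name)
```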
194
 
195
  ---
196
 
197
+
198
+ ## Update knowledge base
199
 
200
  ```python
201
  DataSet.update(update_message: dict)
202
  ```
203
 
 
 
 
 
 
 
204
  ### Returns
205
 
206
+ ```python
207
+ no return
208
+ ```
209
 
210
  ### Examples
211
 
212
  ```python
213
  from ragflow import RAGFlow
214
 
215
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
216
+ ds = rag.get_dataset(name="kb_1")
217
  ds.update({"parse_method":"manual", ...}}
218
  ```
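A concrete version of the placeholder above; a sketch assuming the dataset's chunk_count is still 0 (otherwise parse_method cannot be changed, per the constraint noted earlier):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.get_dataset(name="kb_1")
# Only fields documented as mutable should appear in the update message.
ds.update({"name": "kb_1_renamed", "parse_method": "manual"})
```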
219
 
 
231
 
232
  ### Parameters
233
 
 
 
234
  #### name
235
 
236
  #### blob
 
334
  ```python
335
  from ragflow import RAGFlow
336
 
337
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
338
  doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt')
339
  print(doc)
340
  ```
 
356
  ```python
357
  from ragflow import RAGFlow
358
 
359
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
360
  doc = rag.get_document(id="wdfxb5t547d")
361
  doc.parser_method = "manual"
362
  doc.save()
 
379
  ```python
380
  from ragflow import RAGFlow
381
 
382
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
383
  doc = rag.get_document(id="wdfxb5t547d")
384
  open("~/ragflow.txt", "w+").write(doc.download())
385
  print(doc)
 
390
  ## List documents
391
 
392
  ```python
393
+ Dataset.list_docs(keywords: str = None, offset: int = 0, limit: int = -1) -> List[Document]
394
  ```
395
 
396
  ### Parameters
 
405
 
406
  #### limit: `int`
407
 
408
+ The number of records to return. `-1` means all of them.
409
 
410
  ### Returns
411
 
412
+ List[Document]
413
 
414
  ### Examples
415
 
416
  ```python
417
  from ragflow import RAGFlow
418
 
419
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
420
  ds = rag.create_dataset(name="kb_1")
421
 
422
  filename1 = "~/ragflow.txt"
 
446
  ```python
447
  from ragflow import RAGFlow
448
 
449
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
450
  ds = rag.create_dataset(name="kb_1")
451
 
452
  filename1 = "~/ragflow.txt"
 
579
  ```python
580
  from ragflow import RAGFlow
581
 
582
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
583
  doc = rag.get_document(id="wdfxb5t547d")
584
  chunk = doc.add_chunk(content="xxxxxxx")
585
  ```
 
601
  ```python
602
  from ragflow import RAGFlow
603
 
604
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
605
  doc = rag.get_document(id="wdfxb5t547d")
606
  chunk = doc.add_chunk(content="xxxxxxx")
607
  chunk.delete()
 
624
  ```python
625
  from ragflow import RAGFlow
626
 
627
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
628
  doc = rag.get_document(id="wdfxb5t547d")
629
  chunk = doc.add_chunk(content="xxxxxxx")
630
  chunk.content = "sdfx"
 
636
  ## Retrieval
637
 
638
  ```python
639
+ RAGFlow.retrieval(question: str, datasets: List[Dataset], document: List[Document] = None, offset: int = 0, limit: int = 6, similarity_threshold: float = 0.1, vector_similarity_weight: float = 0.3, top_k: int = 1024) -> List[Chunk]
640
  ```
641
 
642
  ### Parameters
 
645
 
646
  The user query or query keywords. Defaults to `""`.
647
 
648
+ #### datasets: `List[Dataset]`, *Required*
649
 
650
  The scope of datasets.
651
 
652
+ #### document: `List[Document]`
653
 
654
  The scope of documents. `None` means no limitation. Defaults to `None`.
655
 
 
675
 
676
  ### Returns
677
 
678
+ List[Chunk]
679
 
680
  ### Examples
681
 
682
  ```python
683
  from ragflow import RAGFlow
684
 
685
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
686
  ds = rag.get_dataset(name="ragflow")
687
  name = 'ragflow_test.txt'
688
  path = 'test_data/ragflow_test.txt'
 
713
  RAGFlow.create_chat(
714
  name: str = "assistant",
715
  avatar: str = "path",
716
+ knowledgebases: List[DataSet] = ["kb1"],
717
  llm: Chat.LLM = None,
718
  prompt: Chat.Prompt = None
719
  ) -> Chat
 
734
 
735
  The icon of the created chat. Defaults to `"path"`.
736
 
737
+ #### knowledgebases: `List[DataSet]`
738
 
739
  The knowledge bases associated with the chat. Defaults to `["kb1"]`.
740
 
 
776
  ```python
777
  from ragflow import RAGFlow
778
 
779
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
780
  kb = rag.get_dataset(name="kb_1")
781
  assi = rag.create_chat("Miss R", knowledgebases=[kb])
782
  ```
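Since `knowledgebases` takes a list, a chat can draw on several datasets at once. A sketch reusing the calls shown above:

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
# Attach two existing datasets to one chat assistant.
kbs = [rag.get_dataset(name="kb_1"), rag.get_dataset(name="kb_2")]
assi = rag.create_chat("Miss R", knowledgebases=kbs)
```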
 
800
  ```python
801
  from ragflow import RAGFlow
802
 
803
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
804
  kb = rag.get_dataset(name="kb_1")
805
  assi = rag.create_chat("Miss R", knowledgebases=[kb])
806
  assi.update({"temperature":0.8})
 
811
  ## Delete chats
812
 
813
  ```python
814
+ RAGFlow.delete_chats(ids: List[str] = None)
815
  ```
816
  ### Parameters
817
 
 
831
  ```python
832
  from ragflow import RAGFlow
833
 
834
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
835
  rag.delete_chats(ids=["id_1","id_2"])
836
  ```
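As with datasets, the ids can come from a lookup. A sketch, assuming each `Chat` exposes an `id` attribute:

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
# Delete every chat named "Miss R".
stale = rag.list_chats(name="Miss R")
rag.delete_chats(ids=[assi.id for assi in stale])
```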
837
 
 
847
  desc: bool = True,
848
  id: str = None,
849
  name: str = None
850
+ ) -> List[Chat]
851
  ```
852
 
853
  ### Parameters
 
890
  ```python
891
  from ragflow import RAGFlow
892
 
893
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
894
  for assi in rag.list_chats():
895
  print(assi)
896
  ```
 
920
 
921
  The name of the created session. Defaults to `"New session"`.
922
 
923
+ #### messages: `List[Message]`
924
 
925
  The messages of the created session.
926
  - `messages` cannot be provided when creating a session.
 
943
  ```python
944
  from ragflow import RAGFlow
945
 
946
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
947
  assi = rag.list_chats(name="Miss R")
948
  assi = assi[0]
949
  sess = assi.create_session()
 
965
  ```python
966
  from ragflow import RAGFlow
967
 
968
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
969
  assi = rag.list_chats(name="Miss R")
970
  assi = assi[0]
971
  sess = assi.create_session("new_session")
 
1003
 
1004
  The content of the message. Defaults to `"Hi! I am your assistant, can I help you?"`.
1005
 
1006
+ #### reference: `List[Chunk]`
1007
 
1008
  The auto-generated reference of the message. Each `chunk` object includes the following attributes:
1009
 
 
1025
  A similarity score based on vector representations. This score is obtained by converting texts, words, or objects into vectors and then calculating the cosine similarity or other distance measures between these vectors to determine the similarity in vector space. A higher value indicates greater similarity in the vector space. Defaults to `None`.
1026
  - **term_similarity**: `float`
1027
  The similarity score based on terms or keywords. This score is calculated by comparing the similarity of key terms between texts or datasets, typically measuring how similar two words or phrases are in meaning or context. A higher value indicates a stronger similarity between terms. Defaults to `None`.
1028
+ - **position**: `List[str]`
1029
  Indicates the position or index of keywords or specific terms within the text. An array is typically used to mark the location of keywords or specific elements, facilitating precise operations or analysis of the text. Defaults to `None`.
1030
 
1031
  ### Examples
 
1033
  ```python
1034
  from ragflow import RAGFlow
1035
 
1036
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
1037
  assi = rag.list_chats(name="Miss R")
1038
  assi = assi[0]
1039
  sess = assi.create_session()
 
1064
  desc: bool = True,
1065
  id: str = None,
1066
  name: str = None
1067
+ ) -> List[Session]
1068
  ```
1069
 
1070
  ### Returns
1071
 
1072
+ List[Session]
1073
  description: a list of session objects, each containing information about one session.
1074
 
1075
  ### Examples
 
1077
  ```python
1078
  from ragflow import RAGFlow
1079
 
1080
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
1081
  assi = rag.list_chats(name="Miss R")
1082
  assi = assi[0]
1083
  for sess in assi.list_sessions():
 
1120
  ## Delete session
1121
 
1122
  ```python
1123
+ Chat.delete_sessions(ids: List[str] = None)
1124
  ```
1125
 
1126
  ### Returns
 
1132
  ```python
1133
  from ragflow import RAGFlow
1134
 
1135
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
1136
  assi = rag.list_chats(name="Miss R")
1137
  assi = assi[0]
1138
  assi.delete_sessions(ids=["id_1","id_2"])
1139
  ```
1140
  ### Parameters
1141
+ #### ids: `List[str]`
1142
  IDs of the sessions to be deleted.
1143
  - Defaults to `None`.
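Putting the session APIs together, a round-trip sketch using only the calls documented above (and assuming `Session` exposes an `id` attribute):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
assi = rag.list_chats(name="Miss R")[0]
sess = assi.create_session("scratch")
# ... use the session ...
assi.delete_sessions(ids=[s.id for s in assi.list_sessions(name="scratch")])
```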
1144
 
sdk/python/ragflow/modules/base.py CHANGED
@@ -18,8 +18,8 @@ class Base(object):
18
  pr[name] = value
19
  return pr
20
 
21
- def post(self, path, json, stream=False):
22
- res = self.rag.post(path, json, stream=stream)
23
  return res
24
 
25
  def get(self, path, params):
 
18
  pr[name] = value
19
  return pr
20
 
21
+ def post(self, path, json=None, stream=False, files=None):
22
+ res = self.rag.post(path, json, stream=stream,files=files)
23
  return res
24
 
25
  def get(self, path, params):
sdk/python/ragflow/modules/dataset.py CHANGED
@@ -1,5 +1,7 @@
1
  from typing import Optional, List
2
 
 
 
3
  from .document import Document
4
 
5
  from .base import Base
@@ -39,39 +41,27 @@ class DataSet(Base):
39
  if res.get("code") != 0:
40
  raise Exception(res["message"])
41
 
 
 
 
 
 
 
 
42
 
43
- def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
44
- """
45
- List the documents in the dataset, optionally filtering by keywords, with pagination support.
46
-
47
- Args:
48
- keywords (Optional[str]): A string of keywords to filter the documents. Defaults to None.
49
- offset (int): The starting point for pagination. Defaults to 0.
50
- limit (int): The maximum number of documents to return. Defaults to -1 (no limit).
51
-
52
- Returns:
53
- List[Document]: A list of Document objects.
54
- """
55
- # Construct the request payload for listing documents
56
- payload = {
57
- "knowledgebase_id": self.id,
58
- "keywords": keywords,
59
- "offset": offset,
60
- "limit": limit
61
- }
62
-
63
- # Send the request to the server to list documents
64
- res = self.get(f'/doc/dataset/{self.id}/documents', payload)
65
- res_json = res.json()
66
-
67
- # Handle response and error checking
68
- if res_json.get("retmsg") != "success":
69
- raise Exception(res_json.get("retmsg"))
70
-
71
- # Parse the document data from the response
72
  documents = []
73
- for doc_data in res_json["data"].get("docs", []):
74
- doc = Document(self.rag, doc_data)
75
- documents.append(doc)
 
 
 
 
 
 
 
 
76
 
77
- return documents
 
1
  from typing import Optional, List
2
 
3
4
5
  from .document import Document
6
 
7
  from .base import Base
 
41
  if res.get("code") != 0:
42
  raise Exception(res["message"])
43
 
44
+ def upload_documents(self,document_list: List[dict]):
45
+ url = f"/dataset/{self.id}/document"
46
+ files = [("file",(ele["name"],ele["blob"])) for ele in document_list]
47
+ res = self.post(path=url,json=None,files=files)
48
+ res = res.json()
49
+ if res.get("code") != 0:
50
+ raise Exception(res.get("message"))
51
 
52
+ def list_documents(self, id: str = None, keywords: str = None, offset: int =1, limit: int = 1024, orderby: str = "create_time", desc: bool = True):
53
+ res = self.get(f"/dataset/{self.id}/info",params={"id": id,"keywords": keywords,"offset": offset,"limit": limit,"orderby": orderby,"desc": desc})
54
+ res = res.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  documents = []
56
+ if res.get("code") == 0:
57
+ for document in res["data"].get("docs"):
58
+ documents.append(Document(self.rag,document))
59
+ return documents
60
+ raise Exception(res["message"])
61
+
62
+ def delete_documents(self,ids: List[str] = None):
63
+ res = self.rm(f"/dataset/{self.id}/document",{"ids":ids})
64
+ res = res.json()
65
+ if res.get("code") != 0:
66
+ raise Exception(res["message"])
67
 
 
sdk/python/ragflow/modules/document.py CHANGED
@@ -29,18 +29,14 @@ class Document(Base):
29
  res_dict.pop(k)
30
  super().__init__(rag, res_dict)
31
 
32
- def save(self) -> bool:
33
  """
34
  Save the document details to the server.
35
  """
36
- res = self.post('/doc/save',
37
- {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "knowledgebase_id": self.knowledgebase_id,
38
- "parser_method": self.parser_method, "parser_config": self.parser_config.to_json(),
39
- })
40
  res = res.json()
41
- if res.get("retmsg") == "success":
42
- return True
43
- raise Exception(res["retmsg"])
44
 
45
  def delete(self) -> bool:
46
  """
@@ -60,8 +56,7 @@ class Document(Base):
60
  :return: The downloaded document content in bytes.
61
  """
62
  # Construct the URL for the API request using the document ID and knowledge base ID
63
- res = self.get(f"/doc/{self.id}",
64
- {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})
65
 
66
  # Check the response status code to ensure the request was successful
67
  if res.status_code == 200:
 
29
  res_dict.pop(k)
30
  super().__init__(rag, res_dict)
31
 
32
+ def update(self,update_message:dict) -> bool:
33
  """
34
  Save the document details to the server.
35
  """
36
+ res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message)
 
 
 
37
  res = res.json()
38
+ if res.get("code") != 0:
39
+ raise Exception(res["message"])
 
40
 
41
  def delete(self) -> bool:
42
  """
 
56
  :return: The downloaded document content in bytes.
57
  """
58
  # Construct the URL for the API request using the document ID and knowledge base ID
59
+ res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}")
 
60
 
61
  # Check the response status code to ensure the request was successful
62
  if res.status_code == 200:
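For reference, a sketch of exercising the refactored `Document` methods, assuming `doc` was obtained via `DataSet.list_documents`:

```python
# Rename the document and switch its parser, then pull the raw bytes back down.
doc.update({"name": "manual.txt", "parser_method": "manual"})
with open("manual.txt", "wb") as f:
    f.write(doc.download())
```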
sdk/python/ragflow/ragflow.py CHANGED
@@ -32,12 +32,12 @@ class RAGFlow:
32
  self.api_url = f"{base_url}/api/{version}"
33
  self.authorization_header = {"Authorization": "{} {}".format("Bearer", self.user_key)}
34
 
35
- def post(self, path, json, stream=False):
36
- res = requests.post(url=self.api_url + path, json=json, headers=self.authorization_header, stream=stream)
37
  return res
38
 
39
- def get(self, path, params=None):
40
- res = requests.get(url=self.api_url + path, params=params, headers=self.authorization_header)
41
  return res
42
 
43
  def delete(self, path, json):
@@ -151,31 +151,7 @@ class RAGFlow:
151
  return result_list
152
  raise Exception(res["message"])
153
 
154
- def create_document(self, ds: DataSet, name: str, blob: bytes) -> bool:
155
- url = f"/doc/dataset/{ds.id}/documents/upload"
156
- files = {
157
- 'file': (name, blob)
158
- }
159
- headers = {
160
- 'Authorization': f"Bearer {ds.rag.user_key}"
161
- }
162
 
163
- response = requests.post(self.api_url + url, files=files,
164
- headers=headers)
165
-
166
- if response.status_code == 200 and response.json().get('retmsg') == 'success':
167
- return True
168
- else:
169
- raise Exception(f"Upload failed: {response.json().get('retmsg')}")
170
-
171
- return False
172
-
173
- def get_document(self, id: str = None, name: str = None) -> Document:
174
- res = self.get("/doc/infos", {"id": id, "name": name})
175
- res = res.json()
176
- if res.get("retmsg") == "success":
177
- return Document(self, res['data'])
178
- raise Exception(res["retmsg"])
179
 
180
  def async_parse_documents(self, doc_ids):
181
  """
 
32
  self.api_url = f"{base_url}/api/{version}"
33
  self.authorization_header = {"Authorization": "{} {}".format("Bearer", self.user_key)}
34
 
35
+ def post(self, path, json=None, stream=False, files=None):
36
+ res = requests.post(url=self.api_url + path, json=json, headers=self.authorization_header, stream=stream,files=files)
37
  return res
38
 
39
+ def get(self, path, params=None, json=None):
40
+ res = requests.get(url=self.api_url + path, params=params, headers=self.authorization_header,json=json)
41
  return res
42
 
43
  def delete(self, path, json):
 
151
  return result_list
152
  raise Exception(res["message"])
153
 
 
 
 
 
 
 
 
 
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  def async_parse_documents(self, doc_ids):
157
  """
sdk/python/test/t_document.py CHANGED
@@ -21,22 +21,16 @@ class TestDocument(TestSdk):
21
 
22
  # Step 2: Create a new document
23
  # The blob is the actual file content or a placeholder in this case
24
- name = "TestDocument.txt"
25
  blob = b"Sample document content for ingestion test."
26
-
27
- res = rag.create_document(ds, name=name, blob=blob)
28
-
 
 
 
 
29
  # Ensure document ingestion was successful
30
- assert res is True, f"Failed to create document, error: {res}"
31
-
32
- def test_get_detail_document_with_success(self):
33
- """
34
- Test getting a document's detail with success
35
- """
36
- rag = RAGFlow(API_KEY, HOST_ADDRESS)
37
- doc = rag.get_document(name="TestDocument.txt")
38
- assert isinstance(doc, Document), f"Failed to get dataset, error: {doc}."
39
- assert doc.name == "TestDocument.txt", "Name does not match"
40
 
41
  def test_update_document_with_success(self):
42
  """
@@ -44,12 +38,13 @@ class TestDocument(TestSdk):
44
  Update name or parser_method are supported
45
  """
46
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
47
- doc = rag.get_document(name="TestDocument.txt")
 
 
 
48
  if isinstance(doc, Document):
49
- doc.parser_method = "manual"
50
- doc.name = "manual.txt"
51
- res = doc.save()
52
- assert res is True, f"Failed to update document, error: {res}"
53
  else:
54
  assert False, f"Failed to get document, error: {doc}"
55
 
@@ -61,8 +56,10 @@ class TestDocument(TestSdk):
61
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
62
 
63
  # Retrieve a document
64
- doc = rag.get_document(name="manual.txt")
65
-
 
 
66
  # Check if the retrieved document is of type Document
67
  if isinstance(doc, Document):
68
  # Download the document content and save it to a file
@@ -81,7 +78,7 @@ class TestDocument(TestSdk):
81
  # If the document retrieval fails, assert failure
82
  assert False, f"Failed to get document, error: {doc}"
83
 
84
- def test_list_all_documents_in_dataset_with_success(self):
85
  """
86
  Test list all documents into a dataset with success.
87
  """
@@ -101,12 +98,10 @@ class TestDocument(TestSdk):
101
  blob1 = b"Sample document content for ingestion test111."
102
  name2 = "Test Document222.txt"
103
  blob2 = b"Sample document content for ingestion test222."
104
-
105
- rag.create_document(ds, name=name1, blob=blob1)
106
- rag.create_document(ds, name=name2, blob=blob2)
107
  for d in ds.list_docs(keywords="test", offset=0, limit=12):
108
- assert isinstance(d, Document)
109
- print(d)
110
 
111
  def test_delete_documents_in_dataset_with_success(self):
112
  """
 
21
 
22
  # Step 2: Create a new document
23
  # The blob is the actual file content or a placeholder in this case
 
24
  blob = b"Sample document content for ingestion test."
25
+ blob_2 = b"test_2."
26
+ list_1 = []
27
+ list_1.append({"name":"Test_1.txt",
28
+ "blob":blob})
29
+ list_1.append({"name":"Test_2.txt",
30
+ "blob":blob_2})
31
+ res = ds.upload_documents(list_1)
32
  # Ensure document ingestion was successful
33
+ assert res is None, f"Failed to create document, error: {res}"
 
 
 
 
 
 
 
 
 
34
 
35
  def test_update_document_with_success(self):
36
  """
 
38
  Update name or parser_method are supported
39
  """
40
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
41
+ ds = rag.list_datasets(name="God")
42
+ ds = ds[0]
43
+ doc = ds.list_documents()
44
+ doc = doc[0]
45
  if isinstance(doc, Document):
46
+ res = doc.update({"parser_method":"manual","name":"manual.txt"})
47
+ assert res is None, f"Failed to update document, error: {res}"
 
 
48
  else:
49
  assert False, f"Failed to get document, error: {doc}"
50
 
 
56
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
57
 
58
  # Retrieve a document
59
+ ds = rag.list_datasets(name="God")
60
+ ds = ds[0]
61
+ doc = ds.list_documents(keywords="manual.txt")
62
+ doc = doc[0]
63
  # Check if the retrieved document is of type Document
64
  if isinstance(doc, Document):
65
  # Download the document content and save it to a file
 
78
  # If the document retrieval fails, assert failure
79
  assert False, f"Failed to get document, error: {doc}"
80
 
81
+ def test_list_documents_in_dataset_with_success(self):
82
  """
83
  Test list all documents into a dataset with success.
84
  """
 
98
  blob1 = b"Sample document content for ingestion test111."
99
  name2 = "Test Document222.txt"
100
  blob2 = b"Sample document content for ingestion test222."
101
+ list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
102
+ ds.upload_documents(list_1)
 
103
  for d in ds.list_documents(keywords="test", offset=0, limit=12):
104
+ assert isinstance(d, Document), "Failed to upload documents"
 
105
 
106
  def test_delete_documents_in_dataset_with_success(self):
107
  """