liuhua and Kevin Hu committed
Commit 3d9274d · 1 parent: ab87187

Refactor Chunk API (#2855)


### What problem does this PR solve?

Refactor Chunk API
#2846
### Type of change


- [x] Refactoring

---------

Co-authored-by: liuhua <[email protected]>
Co-authored-by: Kevin Hu <[email protected]>

api/apps/sdk/doc.py CHANGED
@@ -119,13 +119,11 @@ def update_doc(tenant_id, dataset_id, document_id):
119
  if informs:
120
  e, file = FileService.get_by_id(informs[0].file_id)
121
  FileService.update_by_id(file.id, {"name": req["name"]})
 
 
122
  if "parser_method" in req:
123
  if doc.parser_id.lower() == req["parser_method"].lower():
124
- if "parser_config" in req:
125
- if req["parser_config"] == doc.parser_config:
126
- return get_result(retcode=RetCode.SUCCESS)
127
- else:
128
- return get_result(retcode=RetCode.SUCCESS)
129
 
130
  if doc.type == FileType.VISUAL or re.search(
131
  r"\.(ppt|pptx|pages)$", doc.name):
@@ -146,8 +144,6 @@ def update_doc(tenant_id, dataset_id, document_id):
146
  return get_error_data_result(retmsg="Tenant not found!")
147
  ELASTICSEARCH.deleteByQuery(
148
  Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
149
- if "parser_config" in req:
150
- DocumentService.update_parser_config(doc.id, req["parser_config"])
151
 
152
  return get_result()
153
 
@@ -258,6 +254,8 @@ def parse(tenant_id,dataset_id):
258
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
259
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
260
  req = request.json
 
 
261
  for id in req["document_ids"]:
262
  if not DocumentService.query(id=id,kb_id=dataset_id):
263
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
@@ -283,9 +281,14 @@ def stop_parsing(tenant_id,dataset_id):
283
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
284
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
285
  req = request.json
 
 
286
  for id in req["document_ids"]:
287
- if not DocumentService.query(id=id,kb_id=dataset_id):
 
288
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
 
 
289
  info = {"run": "2", "progress": 0}
290
  DocumentService.update_by_id(id, info)
291
  # if str(req["run"]) == TaskStatus.CANCEL.value:
@@ -297,7 +300,7 @@ def stop_parsing(tenant_id,dataset_id):
297
 
298
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
299
  @token_required
300
- def list_chunk(tenant_id,dataset_id,document_id):
301
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
302
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
303
  doc=DocumentService.query(id=document_id, kb_id=dataset_id)
@@ -309,57 +312,58 @@ def list_chunk(tenant_id,dataset_id,document_id):
309
  page = int(req.get("offset", 1))
310
  size = int(req.get("limit", 30))
311
  question = req.get("keywords", "")
312
- try:
313
- query = {
314
- "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
 
 
315
  }
316
- if "available_int" in req:
317
- query["available_int"] = int(req["available_int"])
318
- sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
319
- res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
320
-
321
- origin_chunks = []
322
- for id in sres.ids:
323
- d = {
324
- "chunk_id": id,
325
- "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
326
- id].get(
327
- "content_with_weight", ""),
328
- "doc_id": sres.field[id]["doc_id"],
329
- "docnm_kwd": sres.field[id]["docnm_kwd"],
330
- "important_kwd": sres.field[id].get("important_kwd", []),
331
- "img_id": sres.field[id].get("img_id", ""),
332
- "available_int": sres.field[id].get("available_int", 1),
333
- "positions": sres.field[id].get("position_int", "").split("\t")
334
- }
335
- if len(d["positions"]) % 5 == 0:
336
- poss = []
337
- for i in range(0, len(d["positions"]), 5):
338
- poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
339
- float(d["positions"][i + 3]), float(d["positions"][i + 4])])
340
- d["positions"] = poss
341
-
342
- origin_chunks.append(d)
343
- ##rename keys
344
- for chunk in origin_chunks:
345
- key_mapping = {
346
- "chunk_id": "id",
347
- "content_with_weight": "content",
348
- "doc_id": "document_id",
349
- "important_kwd": "important_keywords",
350
- "img_id": "image_id",
351
- }
352
- renamed_chunk = {}
353
- for key, value in chunk.items():
354
- new_key = key_mapping.get(key, key)
355
- renamed_chunk[new_key] = value
356
- res["chunks"].append(renamed_chunk)
357
- return get_result(data=res)
358
- except Exception as e:
359
- if str(e).find("not_found") > 0:
360
- return get_result(retmsg=f'No chunk found!',
361
- retcode=RetCode.DATA_ERROR)
362
- return server_error_response(e)
363
 
364
 
365
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
@@ -374,6 +378,9 @@ def create(tenant_id,dataset_id,document_id):
374
  req = request.json
375
  if not req.get("content"):
376
  return get_error_data_result(retmsg="`content` is required")
 
 
 
377
  md5 = hashlib.md5()
378
  md5.update((req["content"] + document_id).encode("utf-8"))
379
 
@@ -381,8 +388,8 @@ def create(tenant_id,dataset_id,document_id):
381
  d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
382
  "content_with_weight": req["content"]}
383
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
384
- d["important_kwd"] = req.get("important_kwd", [])
385
- d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
386
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
387
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
388
  d["kb_id"] = [doc.kb_id]
@@ -432,12 +439,12 @@ def rm_chunk(tenant_id,dataset_id,document_id):
432
  req = request.json
433
  if not req.get("chunk_ids"):
434
  return get_error_data_result("`chunk_ids` is required")
 
 
 
435
  for chunk_id in req.get("chunk_ids"):
436
- res = ELASTICSEARCH.get(
437
- chunk_id, search.index_name(
438
- tenant_id))
439
- if not res.get("found"):
440
- return server_error_response(f"Chunk {chunk_id} not found")
441
  if not ELASTICSEARCH.deleteByQuery(
442
  Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
443
  return get_error_data_result(retmsg="Index updating failure")
@@ -451,24 +458,36 @@ def rm_chunk(tenant_id,dataset_id,document_id):
451
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
452
  @token_required
453
  def set(tenant_id,dataset_id,document_id,chunk_id):
454
- res = ELASTICSEARCH.get(
 
455
  chunk_id, search.index_name(
456
  tenant_id))
457
- if not res.get("found"):
458
- return get_error_data_result(f"Chunk {chunk_id} not found")
459
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
460
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
461
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
462
  if not doc:
463
  return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
 
 
 
 
 
 
 
464
  req = request.json
 
465
  d = {
466
  "id": chunk_id,
467
- "content_with_weight": req.get("content",res.get["content_with_weight"])}
468
- d["content_ltks"] = rag_tokenizer.tokenize(req["content"])
469
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
470
- d["important_kwd"] = req.get("important_keywords",[])
471
- d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
 
 
 
472
  if "available" in req:
473
  d["available_int"] = req["available"]
474
  embd_id = DocumentService.get_embd_id(document_id)
@@ -478,7 +497,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
478
  arr = [
479
  t for t in re.split(
480
  r"[\n\t]",
481
- req["content"]) if len(t) > 1]
482
  if len(arr) != 2:
483
  return get_error_data_result(
484
  retmsg="Q&A must be separated by TAB/ENTER key.")
@@ -486,7 +505,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
486
  d = beAdoc(d, arr[0], arr[1], not any(
487
  [rag_tokenizer.is_chinese(t) for t in q + a]))
488
 
489
- v, c = embd_mdl.encode([doc.name, req["content"]])
490
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
491
  d["q_%d_vec" % len(v)] = v.tolist()
492
  ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
@@ -505,7 +524,7 @@ def retrieval_test(tenant_id):
505
  for id in kb_id:
506
  if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
507
  return get_error_data_result(f"You don't own the dataset {id}.")
508
- if "question" not in req_json:
509
  return get_error_data_result("`question` is required.")
510
  page = int(req.get("offset", 1))
511
  size = int(req.get("limit", 30))
 
119
  if informs:
120
  e, file = FileService.get_by_id(informs[0].file_id)
121
  FileService.update_by_id(file.id, {"name": req["name"]})
122
+ if "parser_config" in req:
123
+ DocumentService.update_parser_config(doc.id, req["parser_config"])
124
  if "parser_method" in req:
125
  if doc.parser_id.lower() == req["parser_method"].lower():
126
+ return get_result()
 
 
 
 
127
 
128
  if doc.type == FileType.VISUAL or re.search(
129
  r"\.(ppt|pptx|pages)$", doc.name):
 
144
  return get_error_data_result(retmsg="Tenant not found!")
145
  ELASTICSEARCH.deleteByQuery(
146
  Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
 
 
147
 
148
  return get_result()
149
 
 
254
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
255
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
256
  req = request.json
257
+ if not req.get("document_ids"):
258
+ return get_error_data_result("`document_ids` is required")
259
  for id in req["document_ids"]:
260
  if not DocumentService.query(id=id,kb_id=dataset_id):
261
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
 
281
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
282
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
283
  req = request.json
284
+ if not req.get("document_ids"):
285
+ return get_error_data_result("`document_ids` is required")
286
  for id in req["document_ids"]:
287
+ doc = DocumentService.query(id=id, kb_id=dataset_id)
288
+ if not doc:
289
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
290
+ if doc[0].progress == 100.0 or doc[0].progress == 0.0:
291
+ return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
292
  info = {"run": "2", "progress": 0}
293
  DocumentService.update_by_id(id, info)
294
  # if str(req["run"]) == TaskStatus.CANCEL.value:
 
300
 
301
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
302
  @token_required
303
+ def list_chunks(tenant_id,dataset_id,document_id):
304
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
305
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
306
  doc=DocumentService.query(id=document_id, kb_id=dataset_id)
 
312
  page = int(req.get("offset", 1))
313
  size = int(req.get("limit", 30))
314
  question = req.get("keywords", "")
315
+ query = {
316
+ "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
317
+ }
318
+ sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
319
+ res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
320
+ origin_chunks = []
321
+ sign = 0
322
+ for id in sres.ids:
323
+ d = {
324
+ "chunk_id": id,
325
+ "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
326
+ id].get(
327
+ "content_with_weight", ""),
328
+ "doc_id": sres.field[id]["doc_id"],
329
+ "docnm_kwd": sres.field[id]["docnm_kwd"],
330
+ "important_kwd": sres.field[id].get("important_kwd", []),
331
+ "img_id": sres.field[id].get("img_id", ""),
332
+ "available_int": sres.field[id].get("available_int", 1),
333
+ "positions": sres.field[id].get("position_int", "").split("\t")
334
  }
335
+ if len(d["positions"]) % 5 == 0:
336
+ poss = []
337
+ for i in range(0, len(d["positions"]), 5):
338
+ poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
339
+ float(d["positions"][i + 3]), float(d["positions"][i + 4])])
340
+ d["positions"] = poss
341
+
342
+ origin_chunks.append(d)
343
+ if req.get("id"):
344
+ if req.get("id") == id:
345
+ origin_chunks.clear()
346
+ origin_chunks.append(d)
347
+ sign = 1
348
+ break
349
+ if req.get("id"):
350
+ if sign == 0:
351
+ return get_error_data_result(f"Can't find this chunk {req.get('id')}")
352
+ for chunk in origin_chunks:
353
+ key_mapping = {
354
+ "chunk_id": "id",
355
+ "content_with_weight": "content",
356
+ "doc_id": "document_id",
357
+ "important_kwd": "important_keywords",
358
+ "img_id": "image_id",
359
+ }
360
+ renamed_chunk = {}
361
+ for key, value in chunk.items():
362
+ new_key = key_mapping.get(key, key)
363
+ renamed_chunk[new_key] = value
364
+ res["chunks"].append(renamed_chunk)
365
+ return get_result(data=res)
366
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
 
369
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
 
378
  req = request.json
379
  if not req.get("content"):
380
  return get_error_data_result(retmsg="`content` is required")
381
+ if "important_keywords" in req:
382
+ if type(req["important_keywords"]) != list:
383
+ return get_error_data_result("`important_keywords` is required to be a list")
384
  md5 = hashlib.md5()
385
  md5.update((req["content"] + document_id).encode("utf-8"))
386
 
 
388
  d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
389
  "content_with_weight": req["content"]}
390
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
391
+ d["important_kwd"] = req.get("important_keywords", [])
392
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", [])))
393
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
394
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
395
  d["kb_id"] = [doc.kb_id]
 
439
  req = request.json
440
  if not req.get("chunk_ids"):
441
  return get_error_data_result("`chunk_ids` is required")
442
+ query = {
443
+ "doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
444
+ sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
445
  for chunk_id in req.get("chunk_ids"):
446
+ if chunk_id not in sres.ids:
447
+ return get_error_data_result(f"Chunk {chunk_id} not found")
 
 
 
448
  if not ELASTICSEARCH.deleteByQuery(
449
  Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
450
  return get_error_data_result(retmsg="Index updating failure")
 
458
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
459
  @token_required
460
  def set(tenant_id,dataset_id,document_id,chunk_id):
461
+ try:
462
+ res = ELASTICSEARCH.get(
463
  chunk_id, search.index_name(
464
  tenant_id))
465
+ except Exception as e:
466
+ return get_error_data_result(f"Can't find this chunk {chunk_id}")
467
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
468
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
469
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
470
  if not doc:
471
  return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
472
+ doc = doc[0]
473
+ query = {
474
+ "doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True
475
+ }
476
+ sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
477
+ if chunk_id not in sres.ids:
478
+ return get_error_data_result(f"You don't own the chunk {chunk_id}")
479
  req = request.json
480
+ content=res["_source"].get("content_with_weight")
481
  d = {
482
  "id": chunk_id,
483
+ "content_with_weight": req.get("content",content)}
484
+ d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
485
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
486
+ if "important_keywords" in req:
487
+ if type(req["important_keywords"]) != list:
488
+ return get_error_data_result("`important_keywords` is required to be a list")
489
+ d["important_kwd"] = req.get("important_keywords")
490
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
491
  if "available" in req:
492
  d["available_int"] = req["available"]
493
  embd_id = DocumentService.get_embd_id(document_id)
 
497
  arr = [
498
  t for t in re.split(
499
  r"[\n\t]",
500
+ d["content_with_weight"]) if len(t) > 1]
501
  if len(arr) != 2:
502
  return get_error_data_result(
503
  retmsg="Q&A must be separated by TAB/ENTER key.")
 
505
  d = beAdoc(d, arr[0], arr[1], not any(
506
  [rag_tokenizer.is_chinese(t) for t in q + a]))
507
 
508
+ v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
509
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
510
  d["q_%d_vec" % len(v)] = v.tolist()
511
  ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
 
524
  for id in kb_id:
525
  if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
526
  return get_error_data_result(f"You don't own the dataset {id}.")
527
+ if "question" not in req:
528
  return get_error_data_result("`question` is required.")
529
  page = int(req.get("offset", 1))
530
  size = int(req.get("limit", 30))
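For orientation, here is a minimal client-side sketch of the refactored chunk routes defined above. This is editorial, not part of the diff; the base URL, port, token, and IDs are placeholders, and the response shapes follow the documented examples later in this commit.

```python
# Illustrative sketch only: exercises the chunk routes registered above.
# Assumes a RAGFlow server at base_url and a valid API token (placeholders).
import requests

base_url = "http://127.0.0.1:9380/api/v1"          # assumed address/port
headers = {"Authorization": "Bearer YOUR_ACCESS_TOKEN"}
dataset_id, document_id = "DATASET_ID", "DOCUMENT_ID"
chunk_url = f"{base_url}/dataset/{dataset_id}/document/{document_id}/chunk"

# GET: list chunks, optionally filtered by keywords/offset/limit/id.
chunks = requests.get(chunk_url, headers=headers,
                      params={"keywords": "ragflow", "offset": 1, "limit": 30}).json()

# POST: insert a chunk; `content` is required and `important_keywords`,
# if given, must be a list.
created = requests.post(chunk_url, headers=headers,
                        json={"content": "ragflow content",
                              "important_keywords": []}).json()
chunk_id = created["data"]["chunk"]["id"]

# PUT: update a chunk by id; omitted fields keep their current values.
requests.put(f"{chunk_url}/{chunk_id}", headers=headers,
             json={"content": "updated content", "available": 1})

# DELETE: remove chunks; `chunk_ids` is required.
requests.delete(chunk_url, headers=headers, json={"chunk_ids": [chunk_id]})
```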
api/apps/sdk/session.py CHANGED
@@ -24,10 +24,9 @@ from api.utils import get_uuid
24
  from api.utils.api_utils import get_error_data_result
25
  from api.utils.api_utils import get_result, token_required
26
 
27
-
28
  @manager.route('/chat/<chat_id>/session', methods=['POST'])
29
  @token_required
30
- def create(tenant_id, chat_id):
31
  req = request.json
32
  req["dialog_id"] = chat_id
33
  dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value)
@@ -51,14 +50,13 @@ def create(tenant_id, chat_id):
51
  del conv["reference"]
52
  return get_result(data=conv)
53
 
54
-
55
  @manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT'])
56
  @token_required
57
- def update(tenant_id, chat_id, session_id):
58
  req = request.json
59
  req["dialog_id"] = chat_id
60
  conv_id = session_id
61
- conv = ConversationService.query(id=conv_id, dialog_id=chat_id)
62
  if not conv:
63
  return get_error_data_result(retmsg="Session does not exist")
64
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
@@ -74,16 +72,30 @@ def update(tenant_id, chat_id, session_id):
74
  return get_result()
75
 
76
 
77
- @manager.route('/chat/<chat_id>/session/<session_id>/completion', methods=['POST'])
78
  @token_required
79
- def completion(tenant_id, chat_id, session_id):
80
  req = request.json
81
  # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
82
  # {"role": "user", "content": "上海有吗?"}
83
  # ]}
 
 
 
84
  if not req.get("question"):
85
  return get_error_data_result(retmsg="Please input your question.")
86
- conv = ConversationService.query(id=session_id, dialog_id=chat_id)
87
  if not conv:
88
  return get_error_data_result(retmsg="Session does not exist")
89
  conv = conv[0]
@@ -117,17 +129,18 @@ def completion(tenant_id, chat_id, session_id):
117
  conv.message[-1] = {"role": "assistant", "content": ans["answer"],
118
  "id": message_id, "prompt": ans.get("prompt", "")}
119
  ans["id"] = message_id
 
120
 
121
  def stream():
122
  nonlocal dia, msg, req, conv
123
  try:
124
  for ans in chat(dia, msg, **req):
125
  fillin_conv(ans)
126
- yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n"
127
  ConversationService.update_by_id(conv.id, conv.to_dict())
128
  except Exception as e:
129
  yield "data:" + json.dumps({"code": 500, "message": str(e),
130
- "data": {"answer": "**ERROR**: " + str(e), "reference": []}},
131
  ensure_ascii=False) + "\n\n"
132
  yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n"
133
 
@@ -148,15 +161,14 @@ def completion(tenant_id, chat_id, session_id):
148
  break
149
  return get_result(data=answer)
150
 
151
-
152
  @manager.route('/chat/<chat_id>/session', methods=['GET'])
153
  @token_required
154
- def list(chat_id, tenant_id):
155
  if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value):
156
  return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.")
157
  id = request.args.get("id")
158
  name = request.args.get("name")
159
- session = ConversationService.query(id=id, name=name, dialog_id=chat_id)
160
  if not session:
161
  return get_error_data_result(retmsg="The session doesn't exist")
162
  page_number = int(request.args.get("page", 1))
@@ -166,7 +178,7 @@ def list(chat_id, tenant_id):
166
  desc = False
167
  else:
168
  desc = True
169
- convs = ConversationService.get_list(chat_id, page_number, items_per_page, orderby, desc, id, name)
170
  if not convs:
171
  return get_result(data=[])
172
  for conv in convs:
@@ -201,17 +213,16 @@ def list(chat_id, tenant_id):
201
  del conv["reference"]
202
  return get_result(data=convs)
203
 
204
-
205
  @manager.route('/chat/<chat_id>/session', methods=["DELETE"])
206
  @token_required
207
- def delete(tenant_id, chat_id):
208
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
209
  return get_error_data_result(retmsg="You don't own the chat")
210
  ids = request.json.get("ids")
211
  if not ids:
212
  return get_error_data_result(retmsg="`ids` is required in deleting operation")
213
  for id in ids:
214
- conv = ConversationService.query(id=id, dialog_id=chat_id)
215
  if not conv:
216
  return get_error_data_result(retmsg="The chat doesn't own the session")
217
  ConversationService.delete_by_id(id)
 
24
  from api.utils.api_utils import get_error_data_result
25
  from api.utils.api_utils import get_result, token_required
26
 
 
27
  @manager.route('/chat/<chat_id>/session', methods=['POST'])
28
  @token_required
29
+ def create(tenant_id,chat_id):
30
  req = request.json
31
  req["dialog_id"] = chat_id
32
  dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value)
 
50
  del conv["reference"]
51
  return get_result(data=conv)
52
 
 
53
  @manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT'])
54
  @token_required
55
+ def update(tenant_id,chat_id,session_id):
56
  req = request.json
57
  req["dialog_id"] = chat_id
58
  conv_id = session_id
59
+ conv = ConversationService.query(id=conv_id,dialog_id=chat_id)
60
  if not conv:
61
  return get_error_data_result(retmsg="Session does not exist")
62
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
 
72
  return get_result()
73
 
74
 
75
+ @manager.route('/chat/<chat_id>/completion', methods=['POST'])
76
  @token_required
77
+ def completion(tenant_id,chat_id):
78
  req = request.json
79
  # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
80
  # {"role": "user", "content": "上海有吗?"}
81
  # ]}
82
+ if not req.get("session_id"):
83
+ conv = {
84
+ "id": get_uuid(),
85
+ "dialog_id": chat_id,
86
+ "name": req.get("name", "New session"),
87
+ "message": [{"role": "assistant", "content": "Hi! I am your assistant, can I help you?"}]
88
+ }
89
+ if not conv.get("name"):
90
+ return get_error_data_result(retmsg="Name can not be empty.")
91
+ ConversationService.save(**conv)
92
+ e, conv = ConversationService.get_by_id(conv["id"])
93
+ session_id=conv.id
94
+ else:
95
+ session_id = req.get("session_id")
96
  if not req.get("question"):
97
  return get_error_data_result(retmsg="Please input your question.")
98
+ conv = ConversationService.query(id=session_id,dialog_id=chat_id)
99
  if not conv:
100
  return get_error_data_result(retmsg="Session does not exist")
101
  conv = conv[0]
 
129
  conv.message[-1] = {"role": "assistant", "content": ans["answer"],
130
  "id": message_id, "prompt": ans.get("prompt", "")}
131
  ans["id"] = message_id
132
+ ans["session_id"]=session_id
133
 
134
  def stream():
135
  nonlocal dia, msg, req, conv
136
  try:
137
  for ans in chat(dia, msg, **req):
138
  fillin_conv(ans)
139
+ yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n"
140
  ConversationService.update_by_id(conv.id, conv.to_dict())
141
  except Exception as e:
142
  yield "data:" + json.dumps({"code": 500, "message": str(e),
143
+ "data": {"answer": "**ERROR**: " + str(e),"reference": []}},
144
  ensure_ascii=False) + "\n\n"
145
  yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n"
146
 
 
161
  break
162
  return get_result(data=answer)
163
 
 
164
  @manager.route('/chat/<chat_id>/session', methods=['GET'])
165
  @token_required
166
+ def list(chat_id,tenant_id):
167
  if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value):
168
  return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.")
169
  id = request.args.get("id")
170
  name = request.args.get("name")
171
+ session = ConversationService.query(id=id,name=name,dialog_id=chat_id)
172
  if not session:
173
  return get_error_data_result(retmsg="The session doesn't exist")
174
  page_number = int(request.args.get("page", 1))
 
178
  desc = False
179
  else:
180
  desc = True
181
+ convs = ConversationService.get_list(chat_id,page_number,items_per_page,orderby,desc,id,name)
182
  if not convs:
183
  return get_result(data=[])
184
  for conv in convs:
 
213
  del conv["reference"]
214
  return get_result(data=convs)
215
 
 
216
  @manager.route('/chat/<chat_id>/session', methods=["DELETE"])
217
  @token_required
218
+ def delete(tenant_id,chat_id):
219
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
220
  return get_error_data_result(retmsg="You don't own the chat")
221
  ids = request.json.get("ids")
222
  if not ids:
223
  return get_error_data_result(retmsg="`ids` is required in deleting operation")
224
  for id in ids:
225
+ conv = ConversationService.query(id=id,dialog_id=chat_id)
226
  if not conv:
227
  return get_error_data_result(retmsg="The chat doesn't own the session")
228
  ConversationService.delete_by_id(id)
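The relocated completion route above makes `session_id` optional: when it is omitted, the server creates a session and returns its id with each streamed answer. A minimal sketch of the client side (editorial, not part of the diff; address, token, and chat id are placeholders):

```python
# Illustrative sketch only: call the new /chat/{chat_id}/completion route and
# read the SSE-style "data:" lines it streams. Placeholders throughout.
import json
import requests

base_url = "http://127.0.0.1:9380/api/v1"          # assumed address/port
headers = {"Authorization": "Bearer YOUR_ACCESS_TOKEN"}

resp = requests.post(f"{base_url}/chat/CHAT_ID/completion", headers=headers,
                     json={"question": "What is RAGFlow?", "stream": True},
                     stream=True)

session_id = None
for line in resp.iter_lines():
    if not line.startswith(b"data:"):
        continue
    payload = json.loads(line[len(b"data:"):])
    # The final sentinel carries data=True; answers carry a dict.
    if payload["code"] == 0 and isinstance(payload["data"], dict):
        session_id = payload["data"].get("session_id")  # reuse on later calls
        print(payload["data"]["answer"])
```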
api/db/services/document_service.py CHANGED
@@ -61,14 +61,13 @@ class DocumentService(CommonService):
61
  docs = docs.where(
62
  fn.LOWER(cls.model.name).contains(keywords.lower())
63
  )
64
- count = docs.count()
65
  if desc:
66
  docs = docs.order_by(cls.model.getter_by(orderby).desc())
67
  else:
68
  docs = docs.order_by(cls.model.getter_by(orderby).asc())
69
 
70
  docs = docs.paginate(page_number, items_per_page)
71
-
72
  return list(docs.dicts()), count
73
 
74
 
 
61
  docs = docs.where(
62
  fn.LOWER(cls.model.name).contains(keywords.lower())
63
  )
 
64
  if desc:
65
  docs = docs.order_by(cls.model.getter_by(orderby).desc())
66
  else:
67
  docs = docs.order_by(cls.model.getter_by(orderby).asc())
68
 
69
  docs = docs.paginate(page_number, items_per_page)
70
+ count = docs.count()
71
  return list(docs.dicts()), count
72
 
73
 
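A behavioral note on this hunk (an editorial observation, not asserted by the commit): in peewee 3, `count()` keeps the query's LIMIT/OFFSET unless called with `clear_limit=True`, so counting after `paginate()` yields the size of the current page rather than the total number of matches. A minimal sketch, assuming peewee 3 semantics:

```python
# Sketch illustrating count-before vs. count-after paginate in peewee 3.
from peewee import CharField, Model, SqliteDatabase

db = SqliteDatabase(":memory:")

class Doc(Model):
    name = CharField()

    class Meta:
        database = db

db.create_tables([Doc])
Doc.insert_many([{"name": f"doc{i}"} for i in range(45)]).execute()

q = Doc.select()
print(q.count())                                   # 45: total rows
print(q.paginate(1, 30).count())                   # 30: LIMIT still applies
print(q.paginate(1, 30).count(clear_limit=True))   # 45: LIMIT cleared
```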
api/http_api.md CHANGED
@@ -432,18 +432,71 @@ The error response includes a JSON object like the following:
432
  }
433
  ```
434
 
 
 
 
435
  ## Download a file from a dataset
436
 
437
  **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}`
438
 
439
- Downloads files from a dataset.
440
 
441
  ### Request
442
 
443
  - Method: GET
444
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}`
445
  - Headers:
446
- - `content-Type: application/json`
447
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
448
  - Output:
449
  - '{FILE_NAME}'
@@ -451,10 +504,9 @@ Downloads files from a dataset.
451
 
452
  ```bash
453
  curl --request GET \
454
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{documents_id} \
455
- --header 'Content-Type: application/json' \
456
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
457
- --output '{FILE_NAME}'
458
  ```
459
 
460
  #### Request parameters
@@ -466,7 +518,7 @@ curl --request GET \
466
 
467
  ### Response
468
 
469
- The successful response includes a JSON object like the following:
470
 
471
  ```text
472
  test_2.
@@ -596,92 +648,39 @@ Update a file in a dataset
596
  - Headers:
597
  - `content-Type: application/json`
598
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
599
-
 
 
 
600
  #### Request example
601
 
602
  ```bash
603
  curl --request PUT \
604
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
605
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
606
  --header 'Content-Type: application/json' \
607
  --data '{
608
  "name": "manual.txt",
609
- "thumbnail": null,
610
- "knowledgebase_id": "779333c0758611ef910f0242ac120004",
611
  "parser_method": "manual",
612
- "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12},
613
- "source_type": "local", "type": "doc",
614
- "created_by": "134408906b6811efbcd20242ac120005",
615
- "size": 0, "token_count": 0, "chunk_count": 0,
616
- "progress": 0.0,
617
- "progress_msg": "",
618
- "process_begin_at": null,
619
- "process_duration": 0.0
620
  }'
621
 
622
  ```
623
 
624
  #### Request parameters
625
 
626
- - `"thumbnail"`: (*Body parameter*)
627
- Thumbnail image of the document.
628
- - `""`
629
-
630
- - `"knowledgebase_id"`: (*Body parameter*)
631
- Knowledge base ID related to the document.
632
- - `""`
633
-
634
  - `"parser_method"`: (*Body parameter*)
635
  Method used to parse the document.
636
- - `""`
637
 
638
  - `"parser_config"`: (*Body parameter*)
639
  Configuration object for the parser.
640
  - If the value is `None`, a dictionary with default values will be generated.
641
 
642
- - `"source_type"`: (*Body parameter*)
643
- Source type of the document.
644
- - `""`
645
-
646
- - `"type"`: (*Body parameter*)
647
- Type or category of the document.
648
- - `""`
649
-
650
- - `"created_by"`: (*Body parameter*)
651
- Creator of the document.
652
- - `""`
653
-
654
  - `"name"`: (*Body parameter*)
655
  Name or title of the document.
656
- - `""`
657
-
658
- - `"size"`: (*Body parameter*)
659
- Size of the document in bytes or some other unit.
660
- - `0`
661
-
662
- - `"token_count"`: (*Body parameter*)
663
- Number of tokens in the document.
664
- - `0`
665
-
666
- - `"chunk_count"`: (*Body parameter*)
667
- Number of chunks the document is split into.
668
- - `0`
669
 
670
- - `"progress"`: (*Body parameter*)
671
- Current processing progress as a percentage.
672
- - `0.0`
673
 
674
- - `"progress_msg"`: (*Body parameter*)
675
- Message indicating current progress status.
676
- - `""`
677
-
678
- - `"process_begin_at"`: (*Body parameter*)
679
- Start time of the document processing.
680
- - `None`
681
-
682
- - `"process_duration"`: (*Body parameter*)
683
- Duration of the processing in seconds or minutes.
684
- - `0.0`
685
 
686
 
687
  ### Response
@@ -712,34 +711,34 @@ Parse files into chunks in a dataset
712
  ### Request
713
 
714
  - Method: POST
715
- - URL: `/api/v1/dataset/{dataset_id}/chunk`
716
  - Headers:
717
  - `content-Type: application/json`
718
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 
 
719
 
720
  #### Request example
721
 
722
- ```shell
723
  curl --request POST \
724
- --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
725
- --header 'Content-Type: application/json' \
726
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
727
- --raw '{
728
- "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
729
- }'
730
  ```
731
 
732
  #### Request parameters
733
 
734
  - `"dataset_id"`: (*Path parameter*)
735
- - `"documents"`: (*Body parameter*)
736
- - Documents to parse
737
 
738
  ### Response
739
 
740
  The successful response includes a JSON object like the following:
741
 
742
- ```shell
743
  {
744
  "code": 0
745
  }
@@ -747,10 +746,10 @@ The successful response includes a JSON object like the following:
747
 
748
  The error response includes a JSON object like the following:
749
 
750
- ```shell
751
  {
752
- "code": 3016,
753
- "message": "Can't connect database"
754
  }
755
  ```
756
 
@@ -762,35 +761,35 @@ Stop file parsing
762
 
763
  ### Request
764
 
765
- - Method: POST
766
- - URL: `/api/v1/dataset/{dataset_id}/chunk`
767
  - Headers:
768
  - `content-Type: application/json`
769
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
770
-
 
771
  #### Request example
772
 
773
- ```shell
774
  curl --request DELETE \
775
- --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
776
- --header 'Content-Type: application/json' \
777
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
778
- --raw '{
779
- "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
780
- }'
781
  ```
782
 
783
  #### Request parameters
784
 
785
  - `"dataset_id"`: (*Path parameter*)
786
- - `"documents"`: (*Body parameter*)
787
- - Documents to stop parsing
 
788
 
789
  ### Response
790
 
791
  The successful response includes a JSON object like the following:
792
 
793
- ```shell
794
  {
795
  "code": 0
796
  }
@@ -798,104 +797,98 @@ The successful response includes a JSON object like the following:
798
 
799
  The error response includes a JSON object like the following:
800
 
801
- ```shell
802
  {
803
- "code": 3016,
804
- "message": "Can't connect database"
805
  }
806
  ```
807
 
808
  ## Get document chunk list
809
 
810
- **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
811
 
812
  Get document chunk list
813
 
814
  ### Request
815
 
816
  - Method: GET
817
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
818
  - Headers:
819
- - `content-Type: application/json`
820
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
821
 
822
  #### Request example
823
 
824
- ```shell
825
  curl --request GET \
826
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
827
- --header 'Content-Type: application/json' \
828
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
829
  ```
830
 
831
  #### Request parameters
832
 
833
  - `"dataset_id"`: (*Path parameter*)
834
  - `"document_id"`: (*Path parameter*)
835
-
 
 
 
 
 
 
 
836
  ### Response
837
 
838
  The successful response includes a JSON object like the following:
839
 
840
- ```shell
841
  {
842
- "code": 0
843
  "data": {
844
- "chunks": [
845
- {
846
- "available_int": 1,
847
- "content": "<em>advantag</em>of ragflow increas accuraci and relev:by incorpor retriev inform , ragflow can gener respons that are more accur",
848
- "document_keyword": "ragflow_test.txt",
849
- "document_id": "77df9ef4759a11ef8bdd0242ac120004",
850
- "id": "4ab8c77cfac1a829c8d5ed022a0808c0",
851
- "image_id": "",
852
- "important_keywords": [],
853
- "positions": [
854
- ""
855
- ]
856
- }
857
- ],
858
  "doc": {
859
- "chunk_count": 5,
860
- "create_date": "Wed, 18 Sep 2024 08:46:16 GMT",
861
- "create_time": 1726649176833,
862
- "created_by": "134408906b6811efbcd20242ac120005",
863
- "id": "77df9ef4759a11ef8bdd0242ac120004",
864
- "knowledgebase_id": "77d9d24e759a11ef880c0242ac120004",
865
- "location": "ragflow_test.txt",
866
- "name": "ragflow_test.txt",
867
  "parser_config": {
868
- "chunk_token_count": 128,
869
- "delimiter": "\n!?。;!?",
870
- "layout_recognize": true,
871
- "task_page_size": 12
 
 
872
  },
873
- "parser_method": "naive",
874
- "process_begin_at": "Wed, 18 Sep 2024 08:46:16 GMT",
875
- "process_duation": 7.3213,
876
- "progress": 1.0,
877
- "progress_msg": "\nTask has been received.\nStart to parse.\nFinish parsing.\nFinished slicing files(5). Start to embedding the content.\nFinished embedding(6.16)! Start to build index!\nDone!",
878
- "run": "3",
879
- "size": 4209,
880
  "source_type": "local",
881
  "status": "1",
882
  "thumbnail": null,
883
- "token_count": 746,
884
  "type": "doc",
885
- "update_date": "Wed, 18 Sep 2024 08:46:23 GMT",
886
- "update_time": 1726649183321
887
  },
888
- "total": 1
889
- },
890
  }
891
  ```
892
 
893
  The error response includes a JSON object like the following:
894
 
895
- ```shell
896
  {
897
- "code": 3016,
898
- "message": "Can't connect database"
899
  }
900
  ```
901
 
@@ -908,55 +901,96 @@ Delete document chunks
908
  ### Request
909
 
910
  - Method: DELETE
911
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
912
  - Headers:
913
  - `content-Type: application/json`
914
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 
 
915
 
916
  #### Request example
917
 
918
- ```shell
919
  curl --request DELETE \
920
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
921
- --header 'Content-Type: application/json' \
922
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
923
- --raw '{
924
- "chunks": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
925
- }'
926
  ```
 
 
 
 
 
927
 
928
  ## Update document chunk
929
 
930
- **PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
931
 
932
  Update document chunk
933
 
934
  ### Request
935
 
936
  - Method: PUT
937
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
938
  - Headers:
939
  - `content-Type: application/json`
940
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
941
-
 
 
 
942
  #### Request example
943
 
944
- ```shell
945
  curl --request PUT \
946
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
947
- --header 'Content-Type: application/json' \
948
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
949
- --raw '{
950
- "chunk_id": "d87fb0b7212c15c18d0831677552d7de",
951
- "knowledgebase_id": null,
952
- "name": "",
953
- "content": "ragflow123",
954
- "important_keywords": [],
955
- "document_id": "e6bbba92759511efaa900242ac120004",
956
- "status": "1"
957
- }'
958
  ```
 
 
 
 
 
 
 
959
 
 
 
 
960
  ## Insert document chunks
961
 
962
  **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
@@ -966,50 +1000,187 @@ Insert document chunks
966
  ### Request
967
 
968
  - Method: POST
969
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
970
  - Headers:
971
  - `content-Type: application/json`
972
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
973
-
 
 
974
  #### Request example
975
 
976
- ```shell
977
  curl --request POST \
978
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
979
- --header 'Content-Type: application/json' \
980
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
981
- --raw '{
982
- "document_id": "97ad64b6759811ef9fc30242ac120004",
983
- "content": ["ragflow content", "ragflow content"]
984
- }'
985
  ```
 
 
 
 
 
986
 
 
 
 
 
987
  ## Dataset retrieval test
988
 
989
- **GET** `/api/v1/dataset/{dataset_id}/retrieval`
990
 
991
  Retrieval test of a dataset
992
 
993
  ### Request
994
 
995
- - Method: GET
996
- - URL: `/api/v1/dataset/{dataset_id}/retrieval`
997
  - Headers:
998
  - `content-Type: application/json`
999
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1000
-
 
 
 
 
 
 
 
 
 
 
 
1001
  #### Request example
1002
 
1003
- ```shell
1004
- curl --request GET \
1005
- --url http://{address}/api/v1/dataset/{dataset_id}/retrieval \
1006
- --header 'Content-Type: application/json' \
1007
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1008
- --raw '{
1009
- "query_text": "This is a cat."
1010
- }'
 
 
 
 
 
 
1011
  ```
1012
 
 
 
 
 
 
 
1013
  ## Create chat
1014
 
1015
  **POST** `/api/v1/chat`
@@ -1708,26 +1879,27 @@ Error
1708
 
1709
  ## Chat with a chat session
1710
 
1711
- **POST** `/api/v1/chat/{chat_id}/session/{session_id}/completion`
1712
 
1713
  Chat with a chat session
1714
 
1715
  ### Request
1716
 
1717
  - Method: POST
1718
- - URL: `http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion`
1719
  - Headers:
1720
  - `content-Type: application/json`
1721
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1722
  - Body:
1723
  - `question`: string
1724
  - `stream`: bool
 
1725
 
1726
 
1727
  #### Request example
1728
  ```bash
1729
  curl --request POST \
1730
- --url http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion \
1731
  --header 'Content-Type: application/json' \
1732
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1733
  --data-binary '{
@@ -1743,6 +1915,8 @@ curl --request POST \
1743
  - `stream`: (*Body Parameter*)
1744
  The approach of streaming text generation.
1745
  `False`
 
 
1746
  ### Response
1747
  Success
1748
  ```json
 
432
  }
433
  ```
434
 
435
+ ## Delete files from a dataset
436
+
437
+ **DELETE** `/api/v1/dataset/{dataset_id}/document`
438
+
439
+ Delete files from a dataset
440
+
441
+ ### Request
442
+
443
+ - Method: DELETE
444
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document`
445
+ - Headers:
446
+ - 'Content-Type: application/json'
447
+ - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
448
+ - Body:
449
+ - `ids`: List[str]
450
+ #### Request example
451
+
452
+ ```bash
453
+ curl --request DELETE \
454
+ --url http://{address}/api/v1/dataset/{dataset_id}/document \
455
+ --header 'Content-Type: application/json' \
456
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
457
+ --data '{
458
+ "ids": ["id_1","id_2"]
459
+ }'
460
+ ```
461
+
462
+ #### Request parameters
463
+
464
+ - `"ids"`: (*Body parameter*)
465
+ The IDs of the documents to be deleted.
466
+ ### Response
467
+
468
+ The successful response includes a JSON object like the following:
469
+
470
+ ```json
471
+ {
472
+ "code": 0
473
+ }
474
+ ```
475
+
476
+ - `"code"`: `integer`
477
+ `0`: The operation succeeds.
478
+
479
+
480
+ The error response includes a JSON object like the following:
481
+
482
+ ```json
483
+ {
484
+ "code": 102,
485
+ "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
486
+ }
487
+ ```
488
+
489
  ## Download a file from a dataset
490
 
491
  **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}`
492
 
493
+ Downloads a file from a dataset.
494
 
495
  ### Request
496
 
497
  - Method: GET
498
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}`
499
  - Headers:
 
500
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
501
  - Output:
502
  - '{FILE_NAME}'
 
504
 
505
  ```bash
506
  curl --request GET \
507
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
508
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
509
+ --output ./ragflow.txt
 
510
  ```
511
 
512
  #### Request parameters
 
518
 
519
  ### Response
520
 
521
+ The successful response includes a text object like the following:
522
 
523
  ```text
524
  test_2.
 
648
  - Headers:
649
  - `content-Type: application/json`
650
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
651
+ - Body:
652
+ - `name`:`string`
653
+ - `parser_method`:`string`
654
+ - `parser_config`:`dict`
655
  #### Request example
656
 
657
  ```bash
658
  curl --request PUT \
659
+ --url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \
660
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
661
  --header 'Content-Type: application/json' \
662
  --data '{
663
  "name": "manual.txt",
 
 
664
  "parser_method": "manual",
665
+ "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}
 
 
 
 
 
 
 
666
  }'
667
 
668
  ```
669
 
670
  #### Request parameters
671
 
 
 
 
 
 
 
 
 
672
  - `"parser_method"`: (*Body parameter*)
673
  Method used to parse the document.
674
+
675
 
676
  - `"parser_config"`: (*Body parameter*)
677
  Configuration object for the parser.
678
  - If the value is `None`, a dictionary with default values will be generated.
679
 
 
 
 
680
  - `"name"`: (*Body parameter*)
681
  Name or title of the document.
 
 
 
 
682
 
 
 
 
683
 
 
 
 
684
 
685
 
686
  ### Response
 
711
  ### Request
712
 
713
  - Method: POST
714
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
715
  - Headers:
716
  - `content-Type: application/json`
717
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
718
+ - Body:
719
+ - `document_ids`: List[str]
720
 
721
  #### Request example
722
 
723
+ ```bash
724
  curl --request POST \
725
+ --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
726
+ --header 'Content-Type: application/json' \
727
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
728
+ --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
 
 
729
  ```
730
 
731
  #### Request parameters
732
 
733
  - `"dataset_id"`: (*Path parameter*)
734
+ - `"document_ids"`: (*Body parameter*)
735
+ The IDs of the documents to be parsed.
736
 
737
  ### Response
738
 
739
  The successful response includes a JSON object like the following:
740
 
741
+ ```json
742
  {
743
  "code": 0
744
  }
 
746
 
747
  The error response includes a JSON object like the following:
748
 
749
+ ```json
750
  {
751
+ "code": 102,
752
+ "message": "`document_ids` is required"
753
  }
754
  ```
755
 
 
761
 
762
  ### Request
763
 
764
+ - Method: DELETE
765
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
766
  - Headers:
767
  - `content-Type: application/json`
768
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
769
+ - Body:
770
+ - `document_ids`: List[str]
771
  #### Request example
772
 
773
+ ```bash
774
  curl --request DELETE \
775
+ --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
776
+ --header 'Content-Type: application/json' \
777
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
778
+ --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
 
 
779
  ```
780
 
781
  #### Request parameters
782
 
783
  - `"dataset_id"`: (*Path parameter*)
784
+ - `"document_ids"`: (*Body parameter*)
785
+ The IDs of the documents for which parsing should be stopped.
786
+
787
 
788
  ### Response
789
 
790
  The successful response includes a JSON object like the following:
791
 
792
+ ```json
793
  {
794
  "code": 0
795
  }
 
797
 
798
  The error response includes a JSON object like the following:
799
 
800
+ ```json
801
  {
802
+ "code": 102,
803
+ "message": "`document_ids` is required"
804
  }
805
  ```
806
 
807
  ## Get document chunk list
808
 
809
+ **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
810
 
811
  Get document chunk list
812
 
813
  ### Request
814
 
815
  - Method: GET
816
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
817
  - Headers:
 
818
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
819
 
820
  #### Request example
821
 
822
+ ```bash
823
  curl --request GET \
824
+ --url 'http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}' \
825
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 
826
  ```
827
 
828
  #### Request parameters
829
 
830
  - `"dataset_id"`: (*Path parameter*)
831
  - `"document_id"`: (*Path parameter*)
832
+ - `"offset"`: (*Filter parameter*)
833
+ The starting record number for paging.
834
+ - `"keywords"`: (*Filter parameter*)
835
+ List chunks whose content contains the given keywords.
836
+ - `"limit"`: (*Filter parameter*)
837
+ The number of records to return.
838
+ - `"id"`: (*Filter parameter*)
839
+ The ID of the chunk to retrieve.
840
  ### Response
841
 
842
  The successful response includes a JSON object like the following:
843
 
844
+ ```json
845
  {
846
+ "code": 0,
847
  "data": {
848
+ "chunks": [],
 
 
 
 
849
  "doc": {
850
+ "chunk_num": 0,
851
+ "create_date": "Sun, 29 Sep 2024 03:47:29 GMT",
852
+ "create_time": 1727581649216,
853
+ "created_by": "69736c5e723611efb51b0242ac120007",
854
+ "id": "8cb781ec7e1511ef98ac0242ac120006",
855
+ "kb_id": "c7ee74067a2c11efb21c0242ac120006",
856
+ "location": "明天的天气是晴天.txt",
857
+ "name": "明天的天气是晴天.txt",
858
  "parser_config": {
859
+ "pages": [
860
+ [
861
+ 1,
862
+ 1000000
863
+ ]
864
+ ]
865
  },
866
+ "parser_id": "naive",
867
+ "process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT",
868
+ "process_duation": 1435.37,
869
+ "progress": 0.0370833,
870
+ "progress_msg": "\nTask has been received.",
871
+ "run": "1",
872
+ "size": 24,
873
  "source_type": "local",
874
  "status": "1",
875
  "thumbnail": null,
876
+ "token_num": 0,
877
  "type": "doc",
878
+ "update_date": "Tue, 15 Oct 2024 10:47:46 GMT",
879
+ "update_time": 1728989266371
880
  },
881
+ "total": 0
882
+ }
883
  }
884
  ```
885
 
886
  The error response includes a JSON object like the following:
887
 
888
+ ```json
889
  {
890
+ "code": 102,
891
+ "message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5."
892
  }
893
  ```
894
 
 
901
  ### Request
902
 
903
  - Method: DELETE
904
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
905
  - Headers:
906
  - `content-Type: application/json`
907
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
908
+ - Body:
909
+ - `chunk_ids`: List[str]
910
 
911
  #### Request example
912
 
913
+ ```bash
914
  curl --request DELETE \
915
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
916
+ --header 'Content-Type: application/json' \
917
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
918
+ --data '{
919
+ "chunk_ids": ["test_1", "test_2"]
920
+ }'
921
  ```
922
+ #### Request parameters
923
+
924
+ - `"chunk_ids"`: (*Body parameter*)
925
+ The IDs of the chunks to be deleted.
926
+
927
+ ### Response
928
+ Success
929
+ ```json
930
+ {
931
+ "code": 0
932
+ }
933
+ ```
934
+ Error
935
+ ```json
936
+ {
937
+ "code": 102,
938
+ "message": "`chunk_ids` is required"
939
+ }
940
+ ```
941
+
942
 
943
  ## Update document chunk
944
 
945
+ **PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
946
 
947
  Update document chunk
948
 
949
  ### Request
950
 
951
  - Method: PUT
952
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
953
  - Headers:
954
  - `content-Type: application/json`
955
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
956
+ - Body:
957
+ - `content`: str
958
+ - `important_keywords`: List[str]
959
+ - `available`: int
960
  #### Request example
961
 
962
+ ```bash
963
  curl --request PUT \
964
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \
965
+ --header 'Content-Type: application/json' \
966
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
967
+ --data '{
968
+ "content": "ragflow123",
969
+ "important_keywords": []
970
+ }'
 
 
 
 
 
971
  ```
972
+ #### Request parameters
973
+ - `"content"`: (*Body parameter*)
974
+ Contains the main text or information of the chunk.
975
+ - `"important_keywords"`: (*Body parameter*)
976
+ Lists the key terms or phrases that are significant or central to the chunk's content.
977
+ - `"available"`: (*Body parameter*)
978
+ Indicates the availability status: `0` means unavailable and `1` means available.
979
 
980
+ ### Response
981
+ Success
982
+ ```json
983
+ {
984
+ "code": 0
985
+ }
986
+ ```
987
+ Error
988
+ ```json
989
+ {
990
+ "code": 102,
991
+ "message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2"
992
+ }
993
+ ```
994
  ## Insert document chunks
995
 
996
  **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
 
1000
  ### Request
1001
 
1002
  - Method: POST
1003
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
1004
  - Headers:
1005
  - `content-Type: application/json`
1006
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1007
+ - Body:
1008
+ - `content`: str
1009
+ - `important_keywords`: List[str]
1010
  #### Request example
1011
 
1012
+ ```bash
1013
  curl --request POST \
1014
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
1015
+ --header 'Content-Type: application/json' \
1016
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1017
+ --data '{
1018
+ "content": "ragflow content"
1019
+ }'
 
1020
  ```
1021
+ #### Request parameters
1022
+ - `content`: (*Body parameter*)
1023
+ Contains the main text or information of the chunk.
1024
+ - `important_keywords`: (*Body parameter*)
1025
+ Lists the key terms or phrases that are significant or central to the chunk's content.
1026
 
1027
+ ### Response
1028
+ Success
1029
+ ```json
1030
+ {
1031
+ "code": 0,
1032
+ "data": {
1033
+ "chunk": {
1034
+ "content": "ragflow content",
1035
+ "create_time": "2024-10-16 08:05:04",
1036
+ "create_timestamp": 1729065904.581025,
1037
+ "dataset_id": [
1038
+ "c7ee74067a2c11efb21c0242ac120006"
1039
+ ],
1040
+ "document_id": "5c5999ec7be811ef9cab0242ac120005",
1041
+ "id": "d78435d142bd5cf6704da62c778795c5",
1042
+ "important_keywords": []
1043
+ }
1044
+ }
1045
+ }
1046
+ ```
1047
+
1048
+ Error
1049
+ ```json
1050
+ {
1051
+ "code": 102,
1052
+ "message": "`content` is required"
1053
+ }
1054
+ ```
1055
  ## Dataset retrieval test
1056
 
1057
+ **POST** `/api/v1/retrieval`
1058
 
1059
  Retrieval test of a dataset
1060
 
1061
  ### Request
1062
 
1063
+ - Method: POST
1064
+ - URL: `http://{address}/api/v1/retrieval`
1065
  - Headers:
1066
  - `content-Type: application/json`
1067
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1068
+ - Body:
1069
+ - `question`: str
1070
+ - `datasets`: List[str]
1071
+ - `documents`: List[str]
1072
+ - `offset`: int
1073
+ - `limit`: int
1074
+ - `similarity_threshold`: float
1075
+ - `vector_similarity_weight`: float
1076
+ - `top_k`: int
1077
+ - `rerank_id`: string
1078
+ - `keyword`: bool
1079
+ - `highlight`: bool
1080
  #### Request example
1081
 
1082
+ ```bash
1083
+ curl --request POST \
1084
+ --url http://{address}/api/v1/retrieval \
1085
+ --header 'Content-Type: application/json' \
1086
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1087
+ --data '{
1088
+ "question": "What is advantage of ragflow?",
1089
+ "datasets": [
1090
+ "b2a62730759d11ef987d0242ac120004"
1091
+ ],
1092
+ "documents": [
1093
+ "77df9ef4759a11ef8bdd0242ac120004"
1094
+ ]
1095
+ }'
1096
  ```
1097
 
1098
+ #### Request parameters
1099
+ - `"question"`: (*Body parameter*)
1100
+ The user's question or search keywords.
1101
+ `""`
1102
+ - `"datasets"`: (*Body parameter*)
1103
+ The datasets to search within.
1104
+ `None`
1105
+ - `"documents"`: (*Body parameter*)
1106
+ The documents to search within. `None` means no limitation.
1107
+ `None`
1108
+ - `"offset"`: (*Body parameter*)
1109
+ The starting point of the retrieved records.
1110
+ `1`
1111
+
1112
+ - `"limit"`: (*Body parameter*)
1113
+ The maximum number of records to return.
1114
+ `30`
1115
+
1116
+ - `"similarity_threshold"`: (*Body parameter*)
1117
+ The minimum similarity score
1118
+ `0.2`
1119
+
1120
+ - `"vector_similarity_weight"`: (*Body parameter*)
1121
+ The weight of vector cosine similarity, `1 - x` is the term similarity weight
1122
+ `0.3`
1123
+
1124
+ - `"top_k"`: (*Body parameter*)
1125
+ The number of records involved in the vector cosine computation.
1126
+ `1024`
1127
+
1128
+ - `"rerank_id"`: (*Body parameter*)
1129
+ ID of the rerank model
1130
+ `None`
1131
+
1132
+ - `"keyword"`: (*Body parameter*)
1133
+ Whether keyword-based matching is enabled
1134
+ `False`
1135
+
1136
+ - `"highlight"`: (*Body parameter*)
1137
+ Whether to enable highlighting of matched terms in the results
1138
+ `False`
1139
+ ### Response
1140
+ Success
1141
+ ```json
1142
+ {
1143
+ "code": 0,
1144
+ "data": {
1145
+ "chunks": [
1146
+ {
1147
+ "content": "ragflow content",
1148
+ "content_ltks": "ragflow content",
1149
+ "document_id": "5c5999ec7be811ef9cab0242ac120005",
1150
+ "document_keyword": "1.txt",
1151
+ "highlight": "<em>ragflow</em> content",
1152
+ "id": "d78435d142bd5cf6704da62c778795c5",
1153
+ "img_id": "",
1154
+ "important_keywords": [
1155
+ ""
1156
+ ],
1157
+ "kb_id": "c7ee74067a2c11efb21c0242ac120006",
1158
+ "positions": [
1159
+ ""
1160
+ ],
1161
+ "similarity": 0.9669436601210759,
1162
+ "term_similarity": 1.0,
1163
+ "vector_similarity": 0.8898122004035864
1164
+ }
1165
+ ],
1166
+ "doc_aggs": [
1167
+ {
1168
+ "count": 1,
1169
+ "doc_id": "5c5999ec7be811ef9cab0242ac120005",
1170
+ "doc_name": "1.txt"
1171
+ }
1172
+ ],
1173
+ "total": 1
1174
+ }
1175
+ }
1176
+ ```
1177
+ Error
1178
+ ```json
1179
+ {
1180
+ "code": 102,
1181
+ "message": "`datasets` is required."
1182
+ }
1183
+ ```
1184
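+ The same request as a minimal Python sketch using `requests` (not part of the original docs; the address and token are placeholders):
+ ```python
+ import requests
+
+ # Placeholder values; substitute your own.
+ address = "ragflow.example.com:9380"
+
+ res = requests.post(
+     f"http://{address}/api/v1/retrieval",
+     headers={
+         "Content-Type": "application/json",
+         "Authorization": "Bearer YOUR_ACCESS_TOKEN",
+     },
+     json={
+         "question": "What is advantage of ragflow?",
+         "datasets": ["b2a62730759d11ef987d0242ac120004"],
+         # Optional knobs; the two values below are the documented defaults.
+         "similarity_threshold": 0.2,
+         "vector_similarity_weight": 0.3,
+         "highlight": True,  # wrap matched terms in <em> tags
+     },
+ )
+ body = res.json()
+ if body.get("code") != 0:
+     raise Exception(body.get("message"))
+ for chunk in body["data"]["chunks"]:
+     print(chunk["similarity"], chunk["content"])
+ ```
+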
  ## Create chat
1185
 
1186
  **POST** `/api/v1/chat`
 
1879
 
1880
  ## Chat with a chat session
1881
 
1882
+ **POST** `/api/v1/chat/{chat_id}/completion`
1883
 
1884
  Chat with a chat session
1885
 
1886
  ### Request
1887
 
1888
  - Method: POST
1889
+ - URL: `http://{address}/api/v1/chat/{chat_id}/completion`
1890
  - Headers:
1891
  - `content-Type: application/json`
1892
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1893
  - Body:
1894
  - `question`: string
1895
  - `stream`: bool
1896
+ - `session_id`: str
1897
 
1898
 
1899
  #### Request example
1900
  ```bash
1901
  curl --request POST \
1902
+ --url http://{address}/api/v1/chat/{chat_id}/completion \
1903
  --header 'Content-Type: application/json' \
1904
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1905
  --data-binary '{
 
1915
  - `stream`: (*Body Parameter*)
1916
  The approach of streaming text generation.
1917
  `False`
1918
+ - `session_id`: (*Body Parameter*)
1919
+ The ID of the session. If not provided, a new session will be created.
1920
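+ A minimal Python sketch of a streaming call (not part of the original docs; placeholders throughout, and the line-by-line JSON handling assumes the same response framing that the SDK's `Session.ask` loop parses):
+ ```python
+ import json
+ import requests
+
+ # Placeholder values; substitute your own.
+ address = "ragflow.example.com:9380"
+ chat_id = "YOUR_CHAT_ID"
+
+ res = requests.post(
+     f"http://{address}/api/v1/chat/{chat_id}/completion",
+     headers={
+         "Content-Type": "application/json",
+         "Authorization": "Bearer YOUR_ACCESS_TOKEN",
+     },
+     # Omit session_id on the first call; a new session is created for you.
+     json={"question": "What is RAGFlow?", "stream": True},
+     stream=True,
+ )
+ for raw in res.iter_lines():
+     line = raw.decode("utf-8")
+     if not line.startswith("{"):
+         continue  # skip empty/non-JSON keep-alive lines
+     payload = json.loads(line)
+     if payload.get("code") != 0:
+         raise Exception(payload.get("message"))
+     data = payload.get("data")
+     if isinstance(data, dict):  # the final frame may not carry an answer dict
+         print(data.get("answer", ""))
+ ```
+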
  ### Response
1921
  Success
1922
  ```json
api/python_api_reference.md CHANGED
@@ -244,42 +244,117 @@ File management inside knowledge base
244
  ## Upload document
245
 
246
  ```python
247
- RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool
248
  ```
249
 
250
  ### Parameters
251
 
252
- #### name
 
253
 
254
- #### blob
255
 
 
 
 
 
 
 
 
256
 
 
 
 
 
 
257
 
258
  ### Returns
259
 
 
260
 
261
  ### Examples
262
 
 
 
263
  ---
264
 
265
- ## Retrieve document
266
 
267
  ```python
268
- RAGFlow.get_document(id:str=None,name:str=None) -> Document
269
  ```
270
 
271
  ### Parameters
272
 
273
- #### id: `str`, *Required*
274
 
275
- ID of the document to retrieve.
276
 
277
- #### name: `str`
 
 
 
 
 
 
278
 
279
- Name or title of the document.
 
 
280
 
 
 
 
 
 
281
  ### Returns
282
 
 
 
283
  A document object containing the following attributes:
284
 
285
  #### id: `str`
@@ -352,98 +427,14 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`.
352
  ```python
353
  from ragflow import RAGFlow
354
 
355
- rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
356
- doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt')
357
- print(doc)
358
- ```
359
-
360
- ---
361
-
362
- ## Save document settings
363
-
364
- ```python
365
- Document.save() -> bool
366
- ```
367
-
368
- ### Returns
369
-
370
- bool
371
-
372
- ### Examples
373
-
374
- ```python
375
- from ragflow import RAGFlow
376
-
377
- rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
378
- doc = rag.get_document(id="wdfxb5t547d")
379
- doc.parser_method= "manual"
380
- doc.save()
381
- ```
382
-
383
- ---
384
-
385
- ## Download document
386
-
387
- ```python
388
- Document.download() -> bytes
389
- ```
390
-
391
- ### Returns
392
-
393
- bytes of the document.
394
-
395
- ### Examples
396
-
397
- ```python
398
- from ragflow import RAGFlow
399
-
400
- rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
401
- doc = rag.get_document(id="wdfxb5t547d")
402
- open("~/ragflow.txt", "w+").write(doc.download())
403
- print(doc)
404
- ```
405
-
406
- ---
407
-
408
- ## List documents
409
-
410
- ```python
411
- Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> List[Document]
412
- ```
413
-
414
- ### Parameters
415
-
416
- #### keywords: `str`
417
-
418
- List documents whose name has the given keywords. Defaults to `None`.
419
-
420
- #### offset: `int`
421
-
422
- The beginning number of records for paging. Defaults to `0`.
423
-
424
- #### limit: `int`
425
-
426
- Records number to return, -1 means all of them. Records number to return, -1 means all of them.
427
-
428
- ### Returns
429
-
430
- List[Document]
431
-
432
- ### Examples
433
-
434
- ```python
435
- from ragflow import RAGFlow
436
-
437
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
438
  ds = rag.create_dataset(name="kb_1")
439
 
440
  filename1 = "~/ragflow.txt"
441
- rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read())
442
-
443
- filename2 = "~/infinity.txt"
444
- rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read())
445
-
446
- for d in ds.list_docs(keywords="rag", offset=0, limit=12):
447
  print(d)
448
  ```
449
 
@@ -452,12 +443,11 @@ for d in ds.list_docs(keywords="rag", offset=0, limit=12):
452
  ## Delete documents
453
 
454
  ```python
455
- Document.delete() -> bool
456
  ```
457
  ### Returns
458
 
459
- bool
460
- description: delete success or not
461
 
462
  ### Examples
463
 
@@ -465,119 +455,87 @@ description: delete success or not
465
  from ragflow import RAGFlow
466
 
467
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
468
- ds = rag.create_dataset(name="kb_1")
469
-
470
- filename1 = "~/ragflow.txt"
471
- rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read())
472
-
473
- filename2 = "~/infinity.txt"
474
- rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read())
475
- for d in ds.list_docs(keywords="rag", offset=0, limit=12):
476
- d.delete()
477
  ```
478
 
479
  ---
480
 
481
- ## Parse document
482
 
483
  ```python
484
- Document.async_parse() -> None
485
- RAGFLOW.async_parse_documents() -> None
486
  ```
487
 
488
  ### Parameters
489
 
 
 
490
  ????????????????????????????????????????????????????
491
 
492
  ### Returns
493
-
494
  ????????????????????????????????????????????????????
495
 
496
  ### Examples
497
 
498
- ```python
499
- #document parse and cancel
500
- rag = RAGFlow(API_KEY, HOST_ADDRESS)
501
- ds = rag.create_dataset(name="dataset_name")
502
- name3 = 'ai.pdf'
503
- path = 'test_data/ai.pdf'
504
- rag.create_document(ds, name=name3, blob=open(path, "rb").read())
505
- doc = rag.get_document(name="ai.pdf")
506
- doc.async_parse()
507
- print("Async parsing initiated")
508
- ```
509
-
510
- ---
511
-
512
- ## Cancel document parsing
513
-
514
- ```python
515
- rag.async_cancel_parse_documents(ids)
516
- RAGFLOW.async_cancel_parse_documents()-> None
517
- ```
518
-
519
- ### Parameters
520
-
521
- #### ids, `list[]`
522
-
523
- ### Returns
524
-
525
- ?????????????????????????????????????????????????
526
-
527
- ### Examples
528
-
529
  ```python
530
  #documents parse and cancel
531
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
532
  ds = rag.create_dataset(name="God5")
533
  documents = [
534
- {'name': 'test1.txt', 'path': 'test_data/test1.txt'},
535
- {'name': 'test2.txt', 'path': 'test_data/test2.txt'},
536
- {'name': 'test3.txt', 'path': 'test_data/test3.txt'}
537
  ]
538
-
539
- # Create documents in bulk
540
- for doc_info in documents:
541
- with open(doc_info['path'], "rb") as file:
542
- created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read())
543
- docs = [rag.get_document(name=doc_info['name']) for doc_info in documents]
544
- ids = [doc.id for doc in docs]
545
-
546
- rag.async_parse_documents(ids)
547
  print("Async bulk parsing initiated")
548
-
549
- for doc in docs:
550
- for progress, msg in doc.join(interval=5, timeout=10):
551
- print(f"{doc.name}: Progress: {progress}, Message: {msg}")
552
-
553
- cancel_result = rag.async_cancel_parse_documents(ids)
554
  print("Async bulk parsing cancelled")
555
  ```
556
 
557
- ---
558
-
559
- ## Join document
560
-
561
- ??????????????????
562
-
563
  ```python
564
- Document.join(interval=15, timeout=3600) -> iteral[Tuple[float, str]]
565
  ```
566
-
567
  ### Parameters
568
 
569
- #### interval: `int`
 
 
570
 
571
- Time interval in seconds for progress report. Defaults to `15`.
 
 
572
 
573
- #### timeout: `int`
574
-
575
- Timeout in seconds. Defaults to `3600`.
576
 
 
 
 
577
  ### Returns
 
578
 
579
- iteral[Tuple[float, str]]
 
 
580
 
 
 
 
581
  ## Add chunk
582
 
583
  ```python
@@ -587,6 +545,9 @@ Document.add_chunk(content:str) -> Chunk
587
  ### Parameters
588
 
589
  #### content: `str`, *Required*
 
 
 
590
 
591
  ### Returns
592
 
@@ -598,7 +559,10 @@ chunk
598
  from ragflow import RAGFlow
599
 
600
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
601
- doc = rag.get_document(id="wdfxb5t547d")
 
 
 
602
  chunk = doc.add_chunk(content="xxxxxxx")
603
  ```
604
 
@@ -607,12 +571,15 @@ chunk = doc.add_chunk(content="xxxxxxx")
607
  ## Delete chunk
608
 
609
  ```python
610
- Chunk.delete() -> bool
611
  ```
 
 
 
612
 
613
  ### Returns
614
 
615
- bool
616
 
617
  ### Examples
618
 
@@ -620,22 +587,34 @@ bool
620
  from ragflow import RAGFlow
621
 
622
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
623
- doc = rag.get_document(id="wdfxb5t547d")
 
 
 
624
  chunk = doc.add_chunk(content="xxxxxxx")
625
- chunk.delete()
626
  ```
627
 
628
  ---
629
 
630
- ## Save chunk contents
631
 
632
  ```python
633
- Chunk.save() -> bool
634
  ```
 
 
 
 
635
 
636
  ### Returns
637
 
638
- bool
639
 
640
  ### Examples
641
 
@@ -643,10 +622,12 @@ bool
643
  from ragflow import RAGFlow
644
 
645
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
646
- doc = rag.get_document(id="wdfxb5t547d")
 
 
 
647
  chunk = doc.add_chunk(content="xxxxxxx")
648
- chunk.content = "sdfx"
649
- chunk.save()
650
  ```
651
 
652
  ---
@@ -654,7 +635,7 @@ chunk.save()
654
  ## Retrieval
655
 
656
  ```python
657
- RAGFlow.retrieval(question:str, datasets:List[Dataset], document=List[Document]=None, offset:int=0, limit:int=6, similarity_threshold:float=0.1, vector_similarity_weight:float=0.3, top_k:int=1024) -> List[Chunk]
658
  ```
659
 
660
  ### Parameters
@@ -691,6 +672,15 @@ The weight of vector cosine similarity, 1 - x is the term similarity weight. Def
691
 
692
 Number of records engaged in vector cosine computation. Defaults to `1024`.
693
 
 
 
 
694
  ### Returns
695
 
696
  List[Chunk]
@@ -701,18 +691,17 @@ List[Chunk]
701
  from ragflow import RAGFlow
702
 
703
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
704
- ds = rag.get_dataset(name="ragflow")
 
705
  name = 'ragflow_test.txt'
706
- path = 'test_data/ragflow_test.txt'
707
  rag.create_document(ds, name=name, blob=open(path, "rb").read())
708
- doc = rag.get_document(name=name)
709
- doc.async_parse()
710
- # Wait for parsing to complete
711
- for progress, msg in doc.join(interval=5, timeout=30):
712
- print(progress, msg)
713
- for c in rag.retrieval(question="What's ragflow?",
714
- datasets=[ds], documents=[doc],
715
- offset=0, limit=6, similarity_threshold=0.1,
716
  vector_similarity_weight=0.3,
717
  top_k=1024
718
  ):
 
244
  ## Upload document
245
 
246
  ```python
247
+ DataSet.upload_documents(document_list: List[dict])
248
  ```
249
 
250
  ### Parameters
251
 
252
+ #### document_list: `List[dict]`
253
+ A list of dicts, each containing a `name` and a `blob`.
254
 
 
255
 
256
+ ### Returns
257
+ No return value.
258
+
259
+ ### Examples
260
+ ```python
261
+ from ragflow import RAGFlow
262
+
263
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
264
+ ds = rag.create_dataset(name="kb_1")
265
+ ds.upload_documents([{name="1.txt", blob="123"}, ...] }
266
+ ```
267
+ ---
268
+
269
+ ## Update document
270
+
271
+ ```python
272
+ Document.update(update_message:dict)
273
+ ```
274
+
275
+ ### Parameters
276
+
277
+ #### update_message:`dict`
278
+ Only `name`, `parser_config`, and `parser_method` can be changed.
279
+
280
+ ### Returns
281
+
282
+ No return value.
283
+
284
+ ### Examples
285
+
286
+ ```python
287
+ from ragflow import RAGFlow
288
+
289
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
290
+ ds=rag.list_datasets(id='id')
291
+ ds=ds[0]
292
+ doc = ds.list_documents(id="wdfxb5t547d")
293
+ doc = doc[0]
294
+ doc.update([{"parser_method": "manual"...}])
295
+ ```
296
+
297
+ ---
298
 
299
+ ## Download document
300
+
301
+ ```python
302
+ Document.download() -> bytes
303
+ ```
304
 
305
  ### Returns
306
 
307
+ bytes of the document.
308
 
309
  ### Examples
310
 
311
+ ```python
312
+ from ragflow import RAGFlow
313
+
314
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
315
+ ds=rag.list_datasets(id="id")
316
+ ds=ds[0]
317
+ doc = ds.list_documents(id="wdfxb5t547d")
318
+ doc = doc[0]
319
+ open("~/ragflow.txt", "wb+").write(doc.download())
320
+ print(doc)
321
+ ```
322
+
323
  ---
324
 
325
+ ## List documents
326
 
327
  ```python
328
+ Dataset.list_documents(id: str = None, keywords: str = None, offset: int = 0, limit: int = 1024, orderby: str = "create_time", desc: bool = True) -> List[Document]
329
  ```
330
 
331
  ### Parameters
332
 
333
+ #### id: `str`
334
 
335
+ The ID of the document to retrieve. Defaults to `None`.
336
 
337
+ #### keywords: `str`
338
+
339
+ List documents whose name has the given keywords. Defaults to `None`.
340
+
341
+ #### offset: `int`
342
+
343
+ The beginning number of records for paging. Defaults to `0`.
344
 
345
+ #### limit: `int`
346
+
347
+ Number of records to return; `-1` means all of them. Defaults to `1024`.
348
 
349
+ #### orderby: `str`
350
+ The field by which the records are sorted. Defaults to `"create_time"`.
351
+
352
+ #### desc: `bool`
353
+ Whether the records are sorted in descending order. Defaults to `True`.
354
  ### Returns
355
 
356
+ List[Document]
357
+
358
  A document object containing the following attributes:
359
 
360
  #### id: `str`
 
427
  ```python
428
  from ragflow import RAGFlow
429
 
 
 
 
 
430
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
431
  ds = rag.create_dataset(name="kb_1")
432
 
433
  filename1 = "~/ragflow.txt"
434
+ blob=open(filename1 , "rb").read()
435
+ list_files=[{"name":filename1,"blob":blob}]
436
+ ds.upload_documents(list_files)
437
+ for d in ds.list_documents(keywords="rag", offset=0, limit=12):
 
 
438
  print(d)
439
  ```
440
 
 
443
  ## Delete documents
444
 
445
  ```python
446
+ DataSet.delete_documents(ids: List[str] = None)
447
  ```
448
  ### Returns
449
 
450
+ No return value.
 
451
 
452
  ### Examples
453
 
 
455
  from ragflow import RAGFlow
456
 
457
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
458
+ ds = rag.list_datasets(name="kb_1")
459
+ ds = ds[0]
460
+ ds.delete_documents(ids=["id_1","id_2"])
 
 
 
 
 
 
461
  ```
462
 
463
  ---
464
 
465
+ ## Parse and stop parsing documents
466
 
467
  ```python
468
+ DataSet.async_parse_documents(document_ids: List[str]) -> None
469
+ DataSet.async_cancel_parse_documents(document_ids: List[str]) -> None
470
  ```
471
 
472
  ### Parameters
473
 
474
+ #### document_ids: `List[str]`
475
+ The IDs of the documents to be parsed.
476
477
 
478
  ### Returns
479
+ No return value.
480
481
 
482
  ### Examples
483
 
 
 
 
 
484
  ```python
485
  #documents parse and cancel
486
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
487
  ds = rag.create_dataset(name="God5")
488
  documents = [
489
+ {'name': 'test1.txt', 'blob': open('./test_data/test1.txt',"rb").read()},
490
+ {'name': 'test2.txt', 'blob': open('./test_data/test2.txt',"rb").read()},
491
+ {'name': 'test3.txt', 'blob': open('./test_data/test3.txt',"rb").read()}
492
  ]
493
+ ds.upload_documents(documents)
494
+ documents=ds.list_documents(keywords="test")
495
+ ids=[]
496
+ for document in documents:
497
+ ids.append(document.id)
498
+ ds.async_parse_documents(ids)
 
 
 
499
  print("Async bulk parsing initiated")
500
+ ds.async_cancel_parse_documents(ids)
 
 
 
 
 
501
  print("Async bulk parsing cancelled")
502
  ```
503
 
504
+ ## List chunks
 
 
 
 
 
505
  ```python
506
+ Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = 30, id: str = None) -> List[Chunk]
507
  ```
 
508
  ### Parameters
509
 
510
+ - `keywords`: `str`
511
+ List chunks whose content contains the given keywords.
512
+ Defaults to `None`.
513

514
+ - `offset`: `int`
515
+ The beginning number of records for paging.
516
+ Defaults to `0`.
517

518
+ - `limit`: `int`
519
+ Number of records to return.
520
+ Defaults to `30`.
521

522
+ - `id`: `str`
523
+ The ID of the chunk to retrieve.
524
+ Defaults to `None`.
525
  ### Returns
526
+ List[Chunk]
527
 
528
+ ### Examples
529
+ ```python
530
+ from ragflow import RAGFlow
531
 
532
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
533
+ ds = rag.list_datasets("123")
534
+ ds = ds[0]
535
+ ds.async_parse_documents(["wdfxb5t547d"])
536
+ for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
537
+ print(c)
538
+ ```
539
  ## Add chunk
540
 
541
  ```python
 
545
  ### Parameters
546
 
547
  #### content: `str`, *Required*
548
+ Contains the main text or information of the chunk.
549
+ #### important_keywords: `List[str]`
550
+ List the key terms or phrases that are significant or central to the chunk's content.
551
 
552
  ### Returns
553
 
 
559
  from ragflow import RAGFlow
560
 
561
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
562
+ ds = rag.list_datasets(id="123")
563
+ ds = ds[0]
564
+ doc = ds.list_documents(id="wdfxb5t547d")
565
+ doc = doc[0]
566
  chunk = doc.add_chunk(content="xxxxxxx")
567
  ```
568
 
 
571
  ## Delete chunk
572
 
573
  ```python
574
+ Document.delete_chunks(chunk_ids: List[str])
575
  ```
576
+ ### Parameters
577
+ #### chunk_ids: `List[str]`
578
+ A list of IDs of the chunks to delete.
579
 
580
  ### Returns
581
 
582
+ No return value.
583
 
584
  ### Examples
585
 
 
587
  from ragflow import RAGFlow
588
 
589
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
590
+ ds = rag.list_datasets(id="123")
591
+ ds = ds[0]
592
+ doc = ds.list_documents(id="wdfxb5t547d")
593
+ doc = doc[0]
594
  chunk = doc.add_chunk(content="xxxxxxx")
595
+ doc.delete_chunks(["id_1","id_2"])
596
  ```
597
 
598
  ---
599
 
600
+ ## Update chunk
601
 
602
  ```python
603
+ Chunk.update(update_message: dict)
604
  ```
605
+ ### Parameters
606
+ - `content`: `str`
606
+ Contains the main text or information of the chunk.
607
+
608
+ - `important_keywords`: `List[str]`
609
+ List the key terms or phrases that are significant or central to the chunk's content.
610
+
611
+ - `available`: `int`
612
+ The availability status: `0` means unavailable and `1` means available.
614
 
615
  ### Returns
616
 
617
+ No return value.
618
 
619
  ### Examples
620
 
 
622
  from ragflow import RAGFlow
623
 
624
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
625
+ ds = rag.list_datasets(id="123")
626
+ ds = ds[0]
627
+ doc = ds.list_documents(id="wdfxb5t547d")
628
+ doc = doc[0]
629
  chunk = doc.add_chunk(content="xxxxxxx")
630
+ chunk.update({"content":"sdfx...})
 
631
  ```
632
 
633
  ---
 
635
  ## Retrieval
636
 
637
  ```python
638
+ RAGFlow.retrieve(question:str="", datasets:List[str]=None, document=List[str]=None, offset:int=1, limit:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,higlight:bool=False) -> List[Chunk]
639
  ```
640
 
641
  ### Parameters
 
672
 
673
 Number of records engaged in vector cosine computation. Defaults to `1024`.
674
 
675
+ #### rerank_id: `str`
676
+ ID of the rerank model. Defaults to `None`.
677
+
678
+ #### keyword: `bool`
679
+ Whether keyword-based matching is enabled (`True`) or disabled (`False`). Defaults to `False`.
680
+
681
+ #### highlight: `bool`
682
+
683
+ Whether to enable highlighting of matched terms in the results (`True`) or not (`False`). Defaults to `False`.
684
  ### Returns
685
 
686
  List[Chunk]
 
691
  from ragflow import RAGFlow
692
 
693
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
694
+ ds = rag.list_datasets(name="ragflow")
695
+ ds = ds[0]
696
  name = 'ragflow_test.txt'
697
+ path = './test_data/ragflow_test.txt'
698
  ds.upload_documents([{"name": name, "blob": open(path, "rb").read()}])
699
+ doc = ds.list_documents(name=name)
700
+ doc = doc[0]
701
+ ds.async_parse_documents([doc.id])
702
+ for c in rag.retrieve(question="What's ragflow?",
703
+ datasets=[ds.id], documents=[doc.id],
704
+ offset=1, limit=30, similarity_threshold=0.2,
 
 
705
  vector_similarity_weight=0.3,
706
  top_k=1024
707
  ):
sdk/python/ragflow/modules/chunk.py CHANGED
@@ -17,32 +17,11 @@ class Chunk(Base):
17
  res_dict.pop(k)
18
  super().__init__(rag, res_dict)
19
 
20
- def delete(self) -> bool:
21
- """
22
- Delete the chunk in the document.
23
- """
24
- res = self.post('/doc/chunk/rm',
25
- {"document_id": self.document_id, 'chunk_ids': [self.id]})
26
- res = res.json()
27
- if res.get("retmsg") == "success":
28
- return True
29
- raise Exception(res["retmsg"])
30
 
31
- def save(self) -> bool:
32
- """
33
- Save the document details to the server.
34
- """
35
- res = self.post('/doc/chunk/set',
36
- {"chunk_id": self.id,
37
- "knowledgebase_id": self.knowledgebase_id,
38
- "name": self.document_name,
39
- "content": self.content,
40
- "important_keywords": self.important_keywords,
41
- "document_id": self.document_id,
42
- "available": self.available,
43
- })
44
  res = res.json()
45
- if res.get("retmsg") == "success":
46
- return True
47
- raise Exception(res["retmsg"])
48
 
 
17
  res_dict.pop(k)
18
  super().__init__(rag, res_dict)
19
 
 
 
20
 
21
+ def update(self, update_message: dict):
22
+ res = self.put(f"/dataset/{self.knowledgebase_id}/document/{self.document_id}/chunk/{self.id}", update_message)
 
 
 
 
23
  res = res.json()
24
+ if res.get("code") != 0 :
25
+ raise Exception(res["message"])
26
+
27
 
sdk/python/ragflow/modules/dataset.py CHANGED
@@ -65,3 +65,14 @@ class DataSet(Base):
65
  if res.get("code") != 0:
66
  raise Exception(res["message"])
67
 
 
 
 
65
  if res.get("code") != 0:
66
  raise Exception(res["message"])
67
 
68
+ def async_parse_documents(self, document_ids):
69
+ res = self.post(f"/dataset/{self.id}/chunk", {"document_ids": document_ids})
70
+ res = res.json()
71
+ if res.get("code") != 0:
72
+ raise Exception(res.get("message"))
73
+
74
+ def async_cancel_parse_documents(self, document_ids):
75
+ res = self.rm(f"/dataset/{self.id}/chunk", {"document_ids": document_ids})
76
+ res = res.json()
77
+ if res.get("code") != 0:
78
+ raise Exception(res.get("message"))
sdk/python/ragflow/modules/document.py CHANGED
@@ -1,7 +1,10 @@
1
  import time
2
 
 
 
3
  from .base import Base
4
  from .chunk import Chunk
 
5
 
6
 
7
  class Document(Base):
@@ -29,160 +32,28 @@ class Document(Base):
29
  res_dict.pop(k)
30
  super().__init__(rag, res_dict)
31
 
32
- def update(self,update_message:dict) -> bool:
33
- """
34
- Save the document details to the server.
35
- """
36
- res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message)
37
- res = res.json()
38
- if res.get("code") != 0:
39
- raise Exception(res["message"])
40
-
41
- def delete(self) -> bool:
42
- """
43
- Delete the document from the server.
44
- """
45
- res = self.rm('/doc/delete',
46
- {"document_id": self.id})
47
  res = res.json()
48
- if res.get("retmsg") == "success":
49
- return True
50
- raise Exception(res["retmsg"])
51
-
52
- def download(self) -> bytes:
53
- """
54
- Download the document content from the server using the Flask API.
55
-
56
- :return: The downloaded document content in bytes.
57
- """
58
- # Construct the URL for the API request using the document ID and knowledge base ID
59
- res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}")
60
-
61
- # Check the response status code to ensure the request was successful
62
- if res.status_code == 200:
63
- # Return the document content as bytes
64
- return res.content
65
- else:
66
- # Handle the error and raise an exception
67
- raise Exception(
68
- f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
69
- )
70
-
71
- def async_parse(self):
72
- """
73
- Initiate document parsing asynchronously without waiting for completion.
74
- """
75
- try:
76
- # Construct request data including document ID and run status (assuming 1 means to run)
77
- data = {"document_ids": [self.id], "run": 1}
78
-
79
- # Send a POST request to the specified parsing status endpoint to start parsing
80
- res = self.post(f'/doc/run', data)
81
-
82
- # Check the server response status code
83
- if res.status_code != 200:
84
- raise Exception(f"Failed to start async parsing: {res.text}")
85
-
86
- print("Async parsing started successfully.")
87
-
88
- except Exception as e:
89
- # Catch and handle exceptions
90
- print(f"Error occurred during async parsing: {str(e)}")
91
- raise
92
-
93
- import time
94
-
95
- def join(self, interval=5, timeout=3600):
96
- """
97
- Wait for the asynchronous parsing to complete and yield parsing progress periodically.
98
-
99
- :param interval: The time interval (in seconds) for progress reports.
100
- :param timeout: The timeout (in seconds) for the parsing operation.
101
- :return: An iterator yielding parsing progress and messages.
102
- """
103
- start_time = time.time()
104
- while time.time() - start_time < timeout:
105
- # Check the parsing status
106
- res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]})
107
- res_data = res.json()
108
- data = res_data.get("data", [])
109
-
110
- # Retrieve progress and status message
111
- progress = data.get("progress", 0)
112
- progress_msg = data.get("status", "")
113
 
114
- yield progress, progress_msg # Yield progress and message
115
-
116
- if progress == 100: # Parsing completed
117
- break
118
-
119
- time.sleep(interval)
120
-
121
- def cancel(self):
122
- """
123
- Cancel the parsing task for the document.
124
- """
125
- try:
126
- # Construct request data, including document ID and action to cancel (assuming 2 means cancel)
127
- data = {"document_ids": [self.id], "run": 2}
128
-
129
- # Send a POST request to the specified parsing status endpoint to cancel parsing
130
- res = self.post(f'/doc/run', data)
131
-
132
- # Check the server response status code
133
- if res.status_code != 200:
134
- print("Failed to cancel parsing. Server response:", res.text)
135
- else:
136
- print("Parsing cancelled successfully.")
137
-
138
- except Exception as e:
139
- print(f"Error occurred during async parsing cancellation: {str(e)}")
140
- raise
141
-
142
- def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available_int=None):
143
- """
144
- List all chunks associated with this document by calling the external API.
145
-
146
- Args:
147
- page (int): The page number to retrieve (default 1).
148
- size (int): The number of chunks per page (default 30).
149
- keywords (str): Keywords for searching specific chunks (default "").
150
- available_int (int): Filter for available chunks (optional).
151
-
152
- Returns:
153
- list: A list of chunks returned from the API.
154
- """
155
- data = {
156
- "document_id": self.id,
157
- "page": page,
158
- "size": size,
159
- "keywords": keywords,
160
- "offset":offset,
161
- "limit":limit
162
- }
163
-
164
- if available_int is not None:
165
- data["available_int"] = available_int
166
-
167
- res = self.post(f'/doc/chunk/list', data)
168
- if res.status_code == 200:
169
- res_data = res.json()
170
- if res_data.get("retmsg") == "success":
171
- chunks=[]
172
- for chunk_data in res_data["data"].get("chunks", []):
173
- chunk=Chunk(self.rag,chunk_data)
174
- chunks.append(chunk)
175
- return chunks
176
- else:
177
- raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
178
- else:
179
- raise Exception(f"API request failed with status code {res.status_code}")
180
 
181
  def add_chunk(self, content: str):
182
- res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content})
183
- if res.status_code == 200:
184
- res_data = res.json().get("data")
185
- chunk_data = res_data.get("chunk")
186
- return Chunk(self.rag,chunk_data)
187
- else:
188
- raise Exception(f"Failed to add chunk: {res.status_code} {res.text}")
 
 
 
 
 
1
  import time
2
 
3
5
  from .base import Base
6
  from .chunk import Chunk
7
+ from typing import List
8
 
9
 
10
  class Document(Base):
 
32
  res_dict.pop(k)
33
  super().__init__(rag, res_dict)
34
 
35
+ def list_chunks(self, offset=0, limit=30, keywords="", id: str = None):
36
+ data = {"document_id": self.id, "keywords": keywords, "offset": offset, "limit": limit, "id": id}
37
+ res = self.get(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', data)
 
 
 
 
38
  res = res.json()
39
+ if res.get("code") == 0:
40
+ chunks = []
41
+ for chunk_data in res["data"].get("chunks"):
42
+ chunk = Chunk(self.rag, chunk_data)
43
+ chunks.append(chunk)
44
+ return chunks
45
+ raise Exception(res.get("message"))
 
 
 
 
46
 
 
 
 
47
 
48
  def add_chunk(self, content: str):
49
+ res = self.post(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', {"content":content})
50
+ res = res.json()
51
+ if res.get("code") == 0:
52
+ return Chunk(self.rag,res["data"].get("chunk"))
53
+ raise Exception(res.get("message"))
54
+
55
+ def delete_chunks(self, ids: List[str]):
56
+ res = self.rm(f"/dataset/{self.knowledgebase_id}/document/{self.id}/chunk", {"ids": ids})
57
+ res = res.json()
58
+ if res.get("code")!=0:
59
+ raise Exception(res.get("message"))
sdk/python/ragflow/modules/session.py CHANGED
@@ -15,8 +15,8 @@ class Session(Base):
15
  for message in self.messages:
16
  if "reference" in message:
17
  message.pop("reference")
18
- res = self.post(f"/chat/{self.chat_id}/session/{self.id}/completion",
19
- {"question": question, "stream": True}, stream=stream)
20
  for line in res.iter_lines():
21
  line = line.decode("utf-8")
22
  if line.startswith("{"):
@@ -82,3 +82,4 @@ class Chunk(Base):
82
  self.term_similarity = None
83
  self.positions = None
84
  super().__init__(rag, res_dict)
 
 
15
  for message in self.messages:
16
  if "reference" in message:
17
  message.pop("reference")
18
+ res = self.post(f"/chat/{self.chat_id}/completion",
19
+ {"question": question, "stream": True,"session_id":self.id}, stream=stream)
20
  for line in res.iter_lines():
21
  line = line.decode("utf-8")
22
  if line.startswith("{"):
 
82
  self.term_similarity = None
83
  self.positions = None
84
  super().__init__(rag, res_dict)
85
+
sdk/python/ragflow/ragflow.py CHANGED
@@ -158,105 +158,30 @@ class RAGFlow:
158
  raise Exception(res["message"])
159
 
160
 
161
-
162
- def async_parse_documents(self, doc_ids):
163
- """
164
- Asynchronously start parsing multiple documents without waiting for completion.
165
-
166
- :param doc_ids: A list containing multiple document IDs.
167
- """
168
- try:
169
- if not doc_ids or not isinstance(doc_ids, list):
170
- raise ValueError("doc_ids must be a non-empty list of document IDs")
171
-
172
- data = {"document_ids": doc_ids, "run": 1}
173
-
174
- res = self.post(f'/doc/run', data)
175
-
176
- if res.status_code != 200:
177
- raise Exception(f"Failed to start async parsing for documents: {res.text}")
178
-
179
- print(f"Async parsing started successfully for documents: {doc_ids}")
180
-
181
- except Exception as e:
182
- print(f"Error occurred during async parsing for documents: {str(e)}")
183
- raise
184
-
185
- def async_cancel_parse_documents(self, doc_ids):
186
- """
187
- Cancel the asynchronous parsing of multiple documents.
188
-
189
- :param doc_ids: A list containing multiple document IDs.
190
- """
191
- try:
192
- if not doc_ids or not isinstance(doc_ids, list):
193
- raise ValueError("doc_ids must be a non-empty list of document IDs")
194
- data = {"document_ids": doc_ids, "run": 2}
195
- res = self.post(f'/doc/run', data)
196
-
197
- if res.status_code != 200:
198
- raise Exception(f"Failed to cancel async parsing for documents: {res.text}")
199
-
200
- print(f"Async parsing canceled successfully for documents: {doc_ids}")
201
-
202
- except Exception as e:
203
- print(f"Error occurred during canceling parsing for documents: {str(e)}")
204
- raise
205
-
206
- def retrieval(self,
207
- question,
208
- datasets=None,
209
- documents=None,
210
- offset=0,
211
- limit=6,
212
- similarity_threshold=0.1,
213
- vector_similarity_weight=0.3,
214
- top_k=1024):
215
- """
216
- Perform document retrieval based on the given parameters.
217
-
218
- :param question: The query question.
219
- :param datasets: A list of datasets (optional, as documents may be provided directly).
220
- :param documents: A list of documents (if specific documents are provided).
221
- :param offset: Offset for the retrieval results.
222
- :param limit: Maximum number of retrieval results.
223
- :param similarity_threshold: Similarity threshold.
224
- :param vector_similarity_weight: Weight of vector similarity.
225
- :param top_k: Number of top most similar documents to consider (for pre-filtering or ranking).
226
-
227
- Note: This is a hypothetical implementation and may need adjustments based on the actual backend service API.
228
- """
229
- try:
230
- data = {
231
- "question": question,
232
- "datasets": datasets if datasets is not None else [],
233
- "documents": [doc.id if hasattr(doc, 'id') else doc for doc in
234
- documents] if documents is not None else [],
235
  "offset": offset,
236
  "limit": limit,
237
  "similarity_threshold": similarity_threshold,
238
  "vector_similarity_weight": vector_similarity_weight,
239
  "top_k": top_k,
240
  "knowledgebase_id": datasets,
 
 
 
 
 
 
 
241
  }
242
 
243
  # Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
244
- res = self.post(f'/doc/retrieval_test', data)
245
-
246
- # Check the response status code
247
- if res.status_code == 200:
248
- res_data = res.json()
249
- if res_data.get("retmsg") == "success":
250
- chunks = []
251
- for chunk_data in res_data["data"].get("chunks", []):
252
- chunk = Chunk(self, chunk_data)
253
- chunks.append(chunk)
254
- return chunks
255
- else:
256
- raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
257
- else:
258
- raise Exception(f"API request failed with status code {res.status_code}")
259
-
260
- except Exception as e:
261
- print(f"An error occurred during retrieval: {e}")
262
- raise
 
158
  raise Exception(res["message"])
159
 
160
 
161
+ def retrieve(self, question="",datasets=None,documents=None, offset=1, limit=30, similarity_threshold=0.2,vector_similarity_weight=0.3,top_k=1024,rerank_id:str=None,keyword:bool=False,):
162
+ data_params = {
 
 
 
 
163
  "offset": offset,
164
  "limit": limit,
165
  "similarity_threshold": similarity_threshold,
166
  "vector_similarity_weight": vector_similarity_weight,
167
  "top_k": top_k,
168
  "knowledgebase_id": datasets,
169
+ "rerank_id":rerank_id,
170
+ "keyword":keyword
171
+ }
172
+ data_json = {
173
+ "question": question,
174
+ "datasets": datasets,
175
+ "documents": documents
176
  }
177
 
178
  # Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
179
+ res = self.get('/retrieval', data_params, data_json)
180
+ res = res.json()
181
+ if res.get("code") ==0:
182
+ chunks=[]
183
+ for chunk_data in res["data"].get("chunks"):
184
+ chunk=Chunk(self,chunk_data)
185
+ chunks.append(chunk)
186
+ return chunks
187
+ raise Exception(res.get("message"))
 
 
 
 
sdk/python/test/t_document.py CHANGED
@@ -63,17 +63,13 @@ class TestDocument(TestSdk):
63
  # Check if the retrieved document is of type Document
64
  if isinstance(doc, Document):
65
  # Download the document content and save it to a file
66
- try:
67
- with open("ragflow.txt", "wb+") as file:
68
- file.write(doc.download())
69
- # Print the document object for debugging
70
- print(doc)
71
-
72
- # Assert that the download was successful
73
- assert True, "Document downloaded successfully."
74
- except Exception as e:
75
- # If an error occurs, raise an assertion error
76
- assert False, f"Failed to download document, error: {str(e)}"
77
  else:
78
  # If the document retrieval fails, assert failure
79
  assert False, f"Failed to get document, error: {doc}"
@@ -100,7 +96,7 @@ class TestDocument(TestSdk):
100
  blob2 = b"Sample document content for ingestion test222."
101
  list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
102
  ds.upload_documents(list_1)
103
- for d in ds.list_docs(keywords="test", offset=0, limit=12):
104
  assert isinstance(d, Document), "Failed to upload documents"
105
 
106
  def test_delete_documents_in_dataset_with_success(self):
@@ -123,16 +119,11 @@ class TestDocument(TestSdk):
123
  blob1 = b"Sample document content for ingestion test333."
124
  name2 = "Test Document444.txt"
125
  blob2 = b"Sample document content for ingestion test444."
126
- name3 = 'test.txt'
127
- path = 'test_data/test.txt'
128
- rag.create_document(ds, name=name3, blob=open(path, "rb").read())
129
- rag.create_document(ds, name=name1, blob=blob1)
130
- rag.create_document(ds, name=name2, blob=blob2)
131
- for d in ds.list_docs(keywords="document", offset=0, limit=12):
132
  assert isinstance(d, Document)
133
- d.delete()
134
- print(d)
135
- remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
136
  assert len(remaining_docs) == 0, "Documents were not properly deleted."
137
 
138
  def test_parse_and_cancel_document(self):
@@ -144,16 +135,15 @@ class TestDocument(TestSdk):
144
 
145
  # Define the document name and path
146
  name3 = 'westworld.pdf'
147
- path = 'test_data/westworld.pdf'
148
 
149
  # Create a document in the dataset using the file path
150
- rag.create_document(ds, name=name3, blob=open(path, "rb").read())
151
 
152
  # Retrieve the document by name
153
- doc = rag.get_document(name="westworld.pdf")
154
-
155
- # Initiate asynchronous parsing
156
- doc.async_parse()
157
 
158
  # Print message to confirm asynchronous parsing has been initiated
159
  print("Async parsing initiated")
 
63
  # Check if the retrieved document is of type Document
64
  if isinstance(doc, Document):
65
  # Download the document content and save it to a file
66
+ with open("./ragflow.txt", "wb+") as file:
67
+ file.write(doc.download())
68
+ # Print the document object for debugging
69
+ print(doc)
70
+
71
+ # Assert that the download was successful
72
+ assert True, f"Failed to download document, error: {doc}"
 
 
 
 
73
  else:
74
  # If the document retrieval fails, assert failure
75
  assert False, f"Failed to get document, error: {doc}"
 
96
  blob2 = b"Sample document content for ingestion test222."
97
  list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
98
  ds.upload_documents(list_1)
99
+ for d in ds.list_documents(keywords="test", offset=0, limit=12):
100
  assert isinstance(d, Document), "Failed to upload documents"
101
 
102
  def test_delete_documents_in_dataset_with_success(self):
 
119
  blob1 = b"Sample document content for ingestion test333."
120
  name2 = "Test Document444.txt"
121
  blob2 = b"Sample document content for ingestion test444."
122
+ ds.upload_documents([{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}])
123
+ for d in ds.list_documents(keywords="document", offset=0, limit=12):
 
 
 
 
124
  assert isinstance(d, Document)
125
+ ds.delete_documents([d.id])
126
+ remaining_docs = ds.list_documents(keywords="rag", offset=0, limit=12)
 
127
  assert len(remaining_docs) == 0, "Documents were not properly deleted."
128
 
129
  def test_parse_and_cancel_document(self):
 
135
 
136
  # Define the document name and path
137
  name3 = 'westworld.pdf'
138
+ path = './test_data/westworld.pdf'
139
 
140
  # Create a document in the dataset using the file path
141
+ ds.upload_documents({"name":name3, "blob":open(path, "rb").read()})
142
 
143
  # Retrieve the document by name
144
+ doc = rag.list_documents(name="westworld.pdf")
145
+ doc = doc[0]
146
+ ds.async_parse_documents(document_ids=[])
 
147
 
148
  # Print message to confirm asynchronous parsing has been initiated
149
  print("Async parsing initiated")