liuhua and Kevin Hu committed
Commit 3d9274d · 1 parent: ab87187

Refactor Chunk API (#2855)


### What problem does this PR solve?

Refactor Chunk API
#2846
### Type of change


- [x] Refactoring

---------

Co-authored-by: liuhua <[email protected]>
Co-authored-by: Kevin Hu <[email protected]>

api/apps/sdk/doc.py CHANGED
@@ -119,13 +119,11 @@ def update_doc(tenant_id, dataset_id, document_id):
119
  if informs:
120
  e, file = FileService.get_by_id(informs[0].file_id)
121
  FileService.update_by_id(file.id, {"name": req["name"]})
 
 
122
  if "parser_method" in req:
123
  if doc.parser_id.lower() == req["parser_method"].lower():
124
- if "parser_config" in req:
125
- if req["parser_config"] == doc.parser_config:
126
- return get_result(retcode=RetCode.SUCCESS)
127
- else:
128
- return get_result(retcode=RetCode.SUCCESS)
129
 
130
  if doc.type == FileType.VISUAL or re.search(
131
  r"\.(ppt|pptx|pages)$", doc.name):
@@ -146,8 +144,6 @@ def update_doc(tenant_id, dataset_id, document_id):
146
  return get_error_data_result(retmsg="Tenant not found!")
147
  ELASTICSEARCH.deleteByQuery(
148
  Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
149
- if "parser_config" in req:
150
- DocumentService.update_parser_config(doc.id, req["parser_config"])
151
 
152
  return get_result()
153
 
@@ -258,6 +254,8 @@ def parse(tenant_id,dataset_id):
258
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
259
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
260
  req = request.json
 
 
261
  for id in req["document_ids"]:
262
  if not DocumentService.query(id=id,kb_id=dataset_id):
263
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
@@ -283,9 +281,14 @@ def stop_parsing(tenant_id,dataset_id):
283
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
284
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
285
  req = request.json
 
 
286
  for id in req["document_ids"]:
287
- if not DocumentService.query(id=id,kb_id=dataset_id):
 
288
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
 
 
289
  info = {"run": "2", "progress": 0}
290
  DocumentService.update_by_id(id, info)
291
  # if str(req["run"]) == TaskStatus.CANCEL.value:
@@ -297,7 +300,7 @@ def stop_parsing(tenant_id,dataset_id):
297
 
298
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
299
  @token_required
300
- def list_chunk(tenant_id,dataset_id,document_id):
301
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
302
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
303
  doc=DocumentService.query(id=document_id, kb_id=dataset_id)
@@ -309,57 +312,58 @@ def list_chunk(tenant_id,dataset_id,document_id):
309
  page = int(req.get("offset", 1))
310
  size = int(req.get("limit", 30))
311
  question = req.get("keywords", "")
312
- try:
313
- query = {
314
- "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
 
 
315
  }
316
- if "available_int" in req:
317
- query["available_int"] = int(req["available_int"])
318
- sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
319
- res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
320
-
321
- origin_chunks = []
322
- for id in sres.ids:
323
- d = {
324
- "chunk_id": id,
325
- "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
326
- id].get(
327
- "content_with_weight", ""),
328
- "doc_id": sres.field[id]["doc_id"],
329
- "docnm_kwd": sres.field[id]["docnm_kwd"],
330
- "important_kwd": sres.field[id].get("important_kwd", []),
331
- "img_id": sres.field[id].get("img_id", ""),
332
- "available_int": sres.field[id].get("available_int", 1),
333
- "positions": sres.field[id].get("position_int", "").split("\t")
334
- }
335
- if len(d["positions"]) % 5 == 0:
336
- poss = []
337
- for i in range(0, len(d["positions"]), 5):
338
- poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
339
- float(d["positions"][i + 3]), float(d["positions"][i + 4])])
340
- d["positions"] = poss
341
-
342
- origin_chunks.append(d)
343
- ##rename keys
344
- for chunk in origin_chunks:
345
- key_mapping = {
346
- "chunk_id": "id",
347
- "content_with_weight": "content",
348
- "doc_id": "document_id",
349
- "important_kwd": "important_keywords",
350
- "img_id": "image_id",
351
- }
352
- renamed_chunk = {}
353
- for key, value in chunk.items():
354
- new_key = key_mapping.get(key, key)
355
- renamed_chunk[new_key] = value
356
- res["chunks"].append(renamed_chunk)
357
- return get_result(data=res)
358
- except Exception as e:
359
- if str(e).find("not_found") > 0:
360
- return get_result(retmsg=f'No chunk found!',
361
- retcode=RetCode.DATA_ERROR)
362
- return server_error_response(e)
363
 
364
 
365
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
@@ -374,6 +378,9 @@ def create(tenant_id,dataset_id,document_id):
374
  req = request.json
375
  if not req.get("content"):
376
  return get_error_data_result(retmsg="`content` is required")
 
 
 
377
  md5 = hashlib.md5()
378
  md5.update((req["content"] + document_id).encode("utf-8"))
379
 
@@ -381,8 +388,8 @@ def create(tenant_id,dataset_id,document_id):
381
  d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
382
  "content_with_weight": req["content"]}
383
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
384
- d["important_kwd"] = req.get("important_kwd", [])
385
- d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
386
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
387
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
388
  d["kb_id"] = [doc.kb_id]
@@ -432,12 +439,12 @@ def rm_chunk(tenant_id,dataset_id,document_id):
432
  req = request.json
433
  if not req.get("chunk_ids"):
434
  return get_error_data_result("`chunk_ids` is required")
 
 
 
435
  for chunk_id in req.get("chunk_ids"):
436
- res = ELASTICSEARCH.get(
437
- chunk_id, search.index_name(
438
- tenant_id))
439
- if not res.get("found"):
440
- return server_error_response(f"Chunk {chunk_id} not found")
441
  if not ELASTICSEARCH.deleteByQuery(
442
  Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
443
  return get_error_data_result(retmsg="Index updating failure")
@@ -451,24 +458,36 @@ def rm_chunk(tenant_id,dataset_id,document_id):
451
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
452
  @token_required
453
  def set(tenant_id,dataset_id,document_id,chunk_id):
454
- res = ELASTICSEARCH.get(
 
455
  chunk_id, search.index_name(
456
  tenant_id))
457
- if not res.get("found"):
458
- return get_error_data_result(f"Chunk {chunk_id} not found")
459
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
460
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
461
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
462
  if not doc:
463
  return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
 
 
 
 
 
 
 
464
  req = request.json
 
465
  d = {
466
  "id": chunk_id,
467
- "content_with_weight": req.get("content",res.get["content_with_weight"])}
468
- d["content_ltks"] = rag_tokenizer.tokenize(req["content"])
469
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
470
- d["important_kwd"] = req.get("important_keywords",[])
471
- d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
 
 
 
472
  if "available" in req:
473
  d["available_int"] = req["available"]
474
  embd_id = DocumentService.get_embd_id(document_id)
@@ -478,7 +497,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
478
  arr = [
479
  t for t in re.split(
480
  r"[\n\t]",
481
- req["content"]) if len(t) > 1]
482
  if len(arr) != 2:
483
  return get_error_data_result(
484
  retmsg="Q&A must be separated by TAB/ENTER key.")
@@ -486,7 +505,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
486
  d = beAdoc(d, arr[0], arr[1], not any(
487
  [rag_tokenizer.is_chinese(t) for t in q + a]))
488
 
489
- v, c = embd_mdl.encode([doc.name, req["content"]])
490
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
491
  d["q_%d_vec" % len(v)] = v.tolist()
492
  ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
@@ -505,7 +524,7 @@ def retrieval_test(tenant_id):
505
  for id in kb_id:
506
  if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
507
  return get_error_data_result(f"You don't own the dataset {id}.")
508
- if "question" not in req_json:
509
  return get_error_data_result("`question` is required.")
510
  page = int(req.get("offset", 1))
511
  size = int(req.get("limit", 30))
 
119
  if informs:
120
  e, file = FileService.get_by_id(informs[0].file_id)
121
  FileService.update_by_id(file.id, {"name": req["name"]})
122
+ if "parser_config" in req:
123
+ DocumentService.update_parser_config(doc.id, req["parser_config"])
124
  if "parser_method" in req:
125
  if doc.parser_id.lower() == req["parser_method"].lower():
126
+ return get_result()
 
 
 
 
127
 
128
  if doc.type == FileType.VISUAL or re.search(
129
  r"\.(ppt|pptx|pages)$", doc.name):
 
144
  return get_error_data_result(retmsg="Tenant not found!")
145
  ELASTICSEARCH.deleteByQuery(
146
  Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
 
 
147
 
148
  return get_result()
149
 
 
254
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
255
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
256
  req = request.json
257
+ if not req.get("document_ids"):
258
+ return get_error_data_result("`document_ids` is required")
259
  for id in req["document_ids"]:
260
  if not DocumentService.query(id=id,kb_id=dataset_id):
261
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
 
281
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
282
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
283
  req = request.json
284
+ if not req.get("document_ids"):
285
+ return get_error_data_result("`document_ids` is required")
286
  for id in req["document_ids"]:
287
+ doc = DocumentService.query(id=id, kb_id=dataset_id)
288
+ if not doc:
289
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
290
+ if doc[0].progress == 100.0 or doc[0].progress == 0.0:
291
+ return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
292
  info = {"run": "2", "progress": 0}
293
  DocumentService.update_by_id(id, info)
294
  # if str(req["run"]) == TaskStatus.CANCEL.value:
 
300
 
301
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
302
  @token_required
303
+ def list_chunks(tenant_id,dataset_id,document_id):
304
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
305
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
306
  doc=DocumentService.query(id=document_id, kb_id=dataset_id)
 
312
  page = int(req.get("offset", 1))
313
  size = int(req.get("limit", 30))
314
  question = req.get("keywords", "")
315
+ query = {
316
+ "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
317
+ }
318
+ sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
319
+ res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
320
+ origin_chunks = []
321
+ sign = 0
322
+ for id in sres.ids:
323
+ d = {
324
+ "chunk_id": id,
325
+ "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
326
+ id].get(
327
+ "content_with_weight", ""),
328
+ "doc_id": sres.field[id]["doc_id"],
329
+ "docnm_kwd": sres.field[id]["docnm_kwd"],
330
+ "important_kwd": sres.field[id].get("important_kwd", []),
331
+ "img_id": sres.field[id].get("img_id", ""),
332
+ "available_int": sres.field[id].get("available_int", 1),
333
+ "positions": sres.field[id].get("position_int", "").split("\t")
334
  }
335
+ if len(d["positions"]) % 5 == 0:
336
+ poss = []
337
+ for i in range(0, len(d["positions"]), 5):
338
+ poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
339
+ float(d["positions"][i + 3]), float(d["positions"][i + 4])])
340
+ d["positions"] = poss
341
+
342
+ origin_chunks.append(d)
343
+ if req.get("id"):
344
+ if req.get("id") == id:
345
+ origin_chunks.clear()
346
+ origin_chunks.append(d)
347
+ sign = 1
348
+ break
349
+ if req.get("id"):
350
+ if sign == 0:
351
+ return get_error_data_result(f"Can't find this chunk {req.get('id')}")
352
+ for chunk in origin_chunks:
353
+ key_mapping = {
354
+ "chunk_id": "id",
355
+ "content_with_weight": "content",
356
+ "doc_id": "document_id",
357
+ "important_kwd": "important_keywords",
358
+ "img_id": "image_id",
359
+ }
360
+ renamed_chunk = {}
361
+ for key, value in chunk.items():
362
+ new_key = key_mapping.get(key, key)
363
+ renamed_chunk[new_key] = value
364
+ res["chunks"].append(renamed_chunk)
365
+ return get_result(data=res)
366
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
 
369
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
 
378
  req = request.json
379
  if not req.get("content"):
380
  return get_error_data_result(retmsg="`content` is required")
381
+ if "important_keywords" in req:
382
+ if type(req["important_keywords"]) != list:
383
+ return get_error_data_result("`important_keywords` is required to be a list")
384
  md5 = hashlib.md5()
385
  md5.update((req["content"] + document_id).encode("utf-8"))
386
 
 
388
  d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
389
  "content_with_weight": req["content"]}
390
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
391
+ d["important_kwd"] = req.get("important_keywords", [])
392
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", [])))
393
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
394
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
395
  d["kb_id"] = [doc.kb_id]
 
439
  req = request.json
440
  if not req.get("chunk_ids"):
441
  return get_error_data_result("`chunk_ids` is required")
442
+ query = {
443
+ "doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
444
+ sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
445
  for chunk_id in req.get("chunk_ids"):
446
+ if chunk_id not in sres.ids:
447
+ return get_error_data_result(f"Chunk {chunk_id} not found")
 
 
 
448
  if not ELASTICSEARCH.deleteByQuery(
449
  Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
450
  return get_error_data_result(retmsg="Index updating failure")
 
458
  @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
459
  @token_required
460
  def set(tenant_id,dataset_id,document_id,chunk_id):
461
+ try:
462
+ res = ELASTICSEARCH.get(
463
  chunk_id, search.index_name(
464
  tenant_id))
465
+ except Exception as e:
466
+ return get_error_data_result(f"Can't find this chunk {chunk_id}")
467
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
468
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
469
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
470
  if not doc:
471
  return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
472
+ doc = doc[0]
473
+ query = {
474
+ "doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True
475
+ }
476
+ sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
477
+ if chunk_id not in sres.ids:
478
+ return get_error_data_result(f"You don't own the chunk {chunk_id}")
479
  req = request.json
480
+ content=res["_source"].get("content_with_weight")
481
  d = {
482
  "id": chunk_id,
483
+ "content_with_weight": req.get("content",content)}
484
+ d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
485
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
486
+ if "important_keywords" in req:
487
+ if type(req["important_keywords"]) != list:
488
+ return get_error_data_result("`important_keywords` is required to be a list")
489
+ d["important_kwd"] = req.get("important_keywords")
490
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
491
  if "available" in req:
492
  d["available_int"] = req["available"]
493
  embd_id = DocumentService.get_embd_id(document_id)
 
497
  arr = [
498
  t for t in re.split(
499
  r"[\n\t]",
500
+ d["content_with_weight"]) if len(t) > 1]
501
  if len(arr) != 2:
502
  return get_error_data_result(
503
  retmsg="Q&A must be separated by TAB/ENTER key.")
 
505
  d = beAdoc(d, arr[0], arr[1], not any(
506
  [rag_tokenizer.is_chinese(t) for t in q + a]))
507
 
508
+ v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
509
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
510
  d["q_%d_vec" % len(v)] = v.tolist()
511
  ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
 
524
  for id in kb_id:
525
  if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
526
  return get_error_data_result(f"You don't own the dataset {id}.")
527
+ if "question" not in req:
528
  return get_error_data_result("`question` is required.")
529
  page = int(req.get("offset", 1))
530
  size = int(req.get("limit", 30))
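For orientation, here is a minimal client-side sketch of the refactored chunk routes defined above. This is editorial, not part of the diff; the base URL, port, token, and IDs are placeholders, and the response shapes follow the documented examples later in this commit.

```python
# Illustrative sketch only: exercises the chunk routes registered above.
# Assumes a RAGFlow server at base_url and a valid API token (placeholders).
import requests

base_url = "http://127.0.0.1:9380/api/v1"          # assumed address/port
headers = {"Authorization": "Bearer YOUR_ACCESS_TOKEN"}
dataset_id, document_id = "DATASET_ID", "DOCUMENT_ID"
chunk_url = f"{base_url}/dataset/{dataset_id}/document/{document_id}/chunk"

# GET: list chunks, optionally filtered by keywords/offset/limit/id.
chunks = requests.get(chunk_url, headers=headers,
                      params={"keywords": "ragflow", "offset": 1, "limit": 30}).json()

# POST: insert a chunk; `content` is required and `important_keywords`,
# if given, must be a list.
created = requests.post(chunk_url, headers=headers,
                        json={"content": "ragflow content",
                              "important_keywords": []}).json()
chunk_id = created["data"]["chunk"]["id"]

# PUT: update a chunk by id; omitted fields keep their current values.
requests.put(f"{chunk_url}/{chunk_id}", headers=headers,
             json={"content": "updated content", "available": 1})

# DELETE: remove chunks; `chunk_ids` is required.
requests.delete(chunk_url, headers=headers, json={"chunk_ids": [chunk_id]})
```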
api/apps/sdk/session.py CHANGED
@@ -24,10 +24,9 @@ from api.utils import get_uuid
24
  from api.utils.api_utils import get_error_data_result
25
  from api.utils.api_utils import get_result, token_required
26
 
27
-
28
  @manager.route('/chat/<chat_id>/session', methods=['POST'])
29
  @token_required
30
- def create(tenant_id, chat_id):
31
  req = request.json
32
  req["dialog_id"] = chat_id
33
  dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value)
@@ -51,14 +50,13 @@ def create(tenant_id, chat_id):
51
  del conv["reference"]
52
  return get_result(data=conv)
53
 
54
-
55
  @manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT'])
56
  @token_required
57
- def update(tenant_id, chat_id, session_id):
58
  req = request.json
59
  req["dialog_id"] = chat_id
60
  conv_id = session_id
61
- conv = ConversationService.query(id=conv_id, dialog_id=chat_id)
62
  if not conv:
63
  return get_error_data_result(retmsg="Session does not exist")
64
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
@@ -74,16 +72,30 @@ def update(tenant_id, chat_id, session_id):
74
  return get_result()
75
 
76
 
77
- @manager.route('/chat/<chat_id>/session/<session_id>/completion', methods=['POST'])
78
  @token_required
79
- def completion(tenant_id, chat_id, session_id):
80
  req = request.json
81
  # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
82
  # {"role": "user", "content": "上海有吗?"}
83
  # ]}
 
 
 
84
  if not req.get("question"):
85
  return get_error_data_result(retmsg="Please input your question.")
86
- conv = ConversationService.query(id=session_id, dialog_id=chat_id)
87
  if not conv:
88
  return get_error_data_result(retmsg="Session does not exist")
89
  conv = conv[0]
@@ -117,17 +129,18 @@ def completion(tenant_id, chat_id, session_id):
117
  conv.message[-1] = {"role": "assistant", "content": ans["answer"],
118
  "id": message_id, "prompt": ans.get("prompt", "")}
119
  ans["id"] = message_id
 
120
 
121
  def stream():
122
  nonlocal dia, msg, req, conv
123
  try:
124
  for ans in chat(dia, msg, **req):
125
  fillin_conv(ans)
126
- yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n"
127
  ConversationService.update_by_id(conv.id, conv.to_dict())
128
  except Exception as e:
129
  yield "data:" + json.dumps({"code": 500, "message": str(e),
130
- "data": {"answer": "**ERROR**: " + str(e), "reference": []}},
131
  ensure_ascii=False) + "\n\n"
132
  yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n"
133
 
@@ -148,15 +161,14 @@ def completion(tenant_id, chat_id, session_id):
148
  break
149
  return get_result(data=answer)
150
 
151
-
152
  @manager.route('/chat/<chat_id>/session', methods=['GET'])
153
  @token_required
154
- def list(chat_id, tenant_id):
155
  if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value):
156
  return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.")
157
  id = request.args.get("id")
158
  name = request.args.get("name")
159
- session = ConversationService.query(id=id, name=name, dialog_id=chat_id)
160
  if not session:
161
  return get_error_data_result(retmsg="The session doesn't exist")
162
  page_number = int(request.args.get("page", 1))
@@ -166,7 +178,7 @@ def list(chat_id, tenant_id):
166
  desc = False
167
  else:
168
  desc = True
169
- convs = ConversationService.get_list(chat_id, page_number, items_per_page, orderby, desc, id, name)
170
  if not convs:
171
  return get_result(data=[])
172
  for conv in convs:
@@ -201,17 +213,16 @@ def list(chat_id, tenant_id):
201
  del conv["reference"]
202
  return get_result(data=convs)
203
 
204
-
205
  @manager.route('/chat/<chat_id>/session', methods=["DELETE"])
206
  @token_required
207
- def delete(tenant_id, chat_id):
208
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
209
  return get_error_data_result(retmsg="You don't own the chat")
210
  ids = request.json.get("ids")
211
  if not ids:
212
  return get_error_data_result(retmsg="`ids` is required in deleting operation")
213
  for id in ids:
214
- conv = ConversationService.query(id=id, dialog_id=chat_id)
215
  if not conv:
216
  return get_error_data_result(retmsg="The chat doesn't own the session")
217
  ConversationService.delete_by_id(id)
 
24
  from api.utils.api_utils import get_error_data_result
25
  from api.utils.api_utils import get_result, token_required
26
 
 
27
  @manager.route('/chat/<chat_id>/session', methods=['POST'])
28
  @token_required
29
+ def create(tenant_id,chat_id):
30
  req = request.json
31
  req["dialog_id"] = chat_id
32
  dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value)
 
50
  del conv["reference"]
51
  return get_result(data=conv)
52
 
 
53
  @manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT'])
54
  @token_required
55
+ def update(tenant_id,chat_id,session_id):
56
  req = request.json
57
  req["dialog_id"] = chat_id
58
  conv_id = session_id
59
+ conv = ConversationService.query(id=conv_id,dialog_id=chat_id)
60
  if not conv:
61
  return get_error_data_result(retmsg="Session does not exist")
62
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
 
72
  return get_result()
73
 
74
 
75
+ @manager.route('/chat/<chat_id>/completion', methods=['POST'])
76
  @token_required
77
+ def completion(tenant_id,chat_id):
78
  req = request.json
79
  # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
80
  # {"role": "user", "content": "上海有吗?"}
81
  # ]}
82
+ if not req.get("session_id"):
83
+ conv = {
84
+ "id": get_uuid(),
85
+ "dialog_id": chat_id,
86
+ "name": req.get("name", "New session"),
87
+ "message": [{"role": "assistant", "content": "Hi! I am your assistant, can I help you?"}]
88
+ }
89
+ if not conv.get("name"):
90
+ return get_error_data_result(retmsg="Name can not be empty.")
91
+ ConversationService.save(**conv)
92
+ e, conv = ConversationService.get_by_id(conv["id"])
93
+ session_id=conv.id
94
+ else:
95
+ session_id = req.get("session_id")
96
  if not req.get("question"):
97
  return get_error_data_result(retmsg="Please input your question.")
98
+ conv = ConversationService.query(id=session_id,dialog_id=chat_id)
99
  if not conv:
100
  return get_error_data_result(retmsg="Session does not exist")
101
  conv = conv[0]
 
129
  conv.message[-1] = {"role": "assistant", "content": ans["answer"],
130
  "id": message_id, "prompt": ans.get("prompt", "")}
131
  ans["id"] = message_id
132
+ ans["session_id"]=session_id
133
 
134
  def stream():
135
  nonlocal dia, msg, req, conv
136
  try:
137
  for ans in chat(dia, msg, **req):
138
  fillin_conv(ans)
139
+ yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n"
140
  ConversationService.update_by_id(conv.id, conv.to_dict())
141
  except Exception as e:
142
  yield "data:" + json.dumps({"code": 500, "message": str(e),
143
+ "data": {"answer": "**ERROR**: " + str(e),"reference": []}},
144
  ensure_ascii=False) + "\n\n"
145
  yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n"
146
 
 
161
  break
162
  return get_result(data=answer)
163
 
 
164
  @manager.route('/chat/<chat_id>/session', methods=['GET'])
165
  @token_required
166
+ def list(chat_id,tenant_id):
167
  if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value):
168
  return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.")
169
  id = request.args.get("id")
170
  name = request.args.get("name")
171
+ session = ConversationService.query(id=id,name=name,dialog_id=chat_id)
172
  if not session:
173
  return get_error_data_result(retmsg="The session doesn't exist")
174
  page_number = int(request.args.get("page", 1))
 
178
  desc = False
179
  else:
180
  desc = True
181
+ convs = ConversationService.get_list(chat_id,page_number,items_per_page,orderby,desc,id,name)
182
  if not convs:
183
  return get_result(data=[])
184
  for conv in convs:
 
213
  del conv["reference"]
214
  return get_result(data=convs)
215
 
 
216
  @manager.route('/chat/<chat_id>/session', methods=["DELETE"])
217
  @token_required
218
+ def delete(tenant_id,chat_id):
219
  if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
220
  return get_error_data_result(retmsg="You don't own the chat")
221
  ids = request.json.get("ids")
222
  if not ids:
223
  return get_error_data_result(retmsg="`ids` is required in deleting operation")
224
  for id in ids:
225
+ conv = ConversationService.query(id=id,dialog_id=chat_id)
226
  if not conv:
227
  return get_error_data_result(retmsg="The chat doesn't own the session")
228
  ConversationService.delete_by_id(id)
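The relocated completion route above makes `session_id` optional: when it is omitted, the server creates a session and returns its id with each streamed answer. A minimal sketch of the client side (editorial, not part of the diff; address, token, and chat id are placeholders):

```python
# Illustrative sketch only: call the new /chat/{chat_id}/completion route and
# read the SSE-style "data:" lines it streams. Placeholders throughout.
import json
import requests

base_url = "http://127.0.0.1:9380/api/v1"          # assumed address/port
headers = {"Authorization": "Bearer YOUR_ACCESS_TOKEN"}

resp = requests.post(f"{base_url}/chat/CHAT_ID/completion", headers=headers,
                     json={"question": "What is RAGFlow?", "stream": True},
                     stream=True)

session_id = None
for line in resp.iter_lines():
    if not line.startswith(b"data:"):
        continue
    payload = json.loads(line[len(b"data:"):])
    # The final sentinel carries data=True; answers carry a dict.
    if payload["code"] == 0 and isinstance(payload["data"], dict):
        session_id = payload["data"].get("session_id")  # reuse on later calls
        print(payload["data"]["answer"])
```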
api/db/services/document_service.py CHANGED
@@ -61,14 +61,13 @@ class DocumentService(CommonService):
61
  docs = docs.where(
62
  fn.LOWER(cls.model.name).contains(keywords.lower())
63
  )
64
- count = docs.count()
65
  if desc:
66
  docs = docs.order_by(cls.model.getter_by(orderby).desc())
67
  else:
68
  docs = docs.order_by(cls.model.getter_by(orderby).asc())
69
 
70
  docs = docs.paginate(page_number, items_per_page)
71
-
72
  return list(docs.dicts()), count
73
 
74
 
 
61
  docs = docs.where(
62
  fn.LOWER(cls.model.name).contains(keywords.lower())
63
  )
 
64
  if desc:
65
  docs = docs.order_by(cls.model.getter_by(orderby).desc())
66
  else:
67
  docs = docs.order_by(cls.model.getter_by(orderby).asc())
68
 
69
  docs = docs.paginate(page_number, items_per_page)
70
+ count = docs.count()
71
  return list(docs.dicts()), count
72
 
73
 
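A behavioral note on this hunk (an editorial observation, not asserted by the commit): in peewee 3, `count()` keeps the query's LIMIT/OFFSET unless called with `clear_limit=True`, so counting after `paginate()` yields the size of the current page rather than the total number of matches. A minimal sketch, assuming peewee 3 semantics:

```python
# Sketch illustrating count-before vs. count-after paginate in peewee 3.
from peewee import CharField, Model, SqliteDatabase

db = SqliteDatabase(":memory:")

class Doc(Model):
    name = CharField()

    class Meta:
        database = db

db.create_tables([Doc])
Doc.insert_many([{"name": f"doc{i}"} for i in range(45)]).execute()

q = Doc.select()
print(q.count())                                   # 45: total rows
print(q.paginate(1, 30).count())                   # 30: LIMIT still applies
print(q.paginate(1, 30).count(clear_limit=True))   # 45: LIMIT cleared
```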
api/http_api.md CHANGED
@@ -432,18 +432,71 @@ The error response includes a JSON object like the following:
432
  }
433
  ```
434
 
 
 
 
435
  ## Download a file from a dataset
436
 
437
  **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}`
438
 
439
- Downloads files from a dataset.
440
 
441
  ### Request
442
 
443
  - Method: GET
444
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}`
445
  - Headers:
446
- - `content-Type: application/json`
447
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
448
  - Output:
449
  - '{FILE_NAME}'
@@ -451,10 +504,9 @@ Downloads files from a dataset.
451
 
452
  ```bash
453
  curl --request GET \
454
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{documents_id} \
455
- --header 'Content-Type: application/json' \
456
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
457
- --output '{FILE_NAME}'
458
  ```
459
 
460
  #### Request parameters
@@ -466,7 +518,7 @@ curl --request GET \
466
 
467
  ### Response
468
 
469
- The successful response includes a JSON object like the following:
470
 
471
  ```text
472
  test_2.
@@ -596,92 +648,39 @@ Update a file in a dataset
596
  - Headers:
597
  - `content-Type: application/json`
598
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
599
-
 
 
 
600
  #### Request example
601
 
602
  ```bash
603
  curl --request PUT \
604
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
605
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
606
  --header 'Content-Type: application/json' \
607
  --data '{
608
  "name": "manual.txt",
609
- "thumbnail": null,
610
- "knowledgebase_id": "779333c0758611ef910f0242ac120004",
611
  "parser_method": "manual",
612
- "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12},
613
- "source_type": "local", "type": "doc",
614
- "created_by": "134408906b6811efbcd20242ac120005",
615
- "size": 0, "token_count": 0, "chunk_count": 0,
616
- "progress": 0.0,
617
- "progress_msg": "",
618
- "process_begin_at": null,
619
- "process_duration": 0.0
620
  }'
621
 
622
  ```
623
 
624
  #### Request parameters
625
 
626
- - `"thumbnail"`: (*Body parameter*)
627
- Thumbnail image of the document.
628
- - `""`
629
-
630
- - `"knowledgebase_id"`: (*Body parameter*)
631
- Knowledge base ID related to the document.
632
- - `""`
633
-
634
  - `"parser_method"`: (*Body parameter*)
635
  Method used to parse the document.
636
- - `""`
637
 
638
  - `"parser_config"`: (*Body parameter*)
639
  Configuration object for the parser.
640
  - If the value is `None`, a dictionary with default values will be generated.
641
 
642
- - `"source_type"`: (*Body parameter*)
643
- Source type of the document.
644
- - `""`
645
-
646
- - `"type"`: (*Body parameter*)
647
- Type or category of the document.
648
- - `""`
649
-
650
- - `"created_by"`: (*Body parameter*)
651
- Creator of the document.
652
- - `""`
653
-
654
  - `"name"`: (*Body parameter*)
655
  Name or title of the document.
656
- - `""`
657
-
658
- - `"size"`: (*Body parameter*)
659
- Size of the document in bytes or some other unit.
660
- - `0`
661
-
662
- - `"token_count"`: (*Body parameter*)
663
- Number of tokens in the document.
664
- - `0`
665
-
666
- - `"chunk_count"`: (*Body parameter*)
667
- Number of chunks the document is split into.
668
- - `0`
669
 
670
- - `"progress"`: (*Body parameter*)
671
- Current processing progress as a percentage.
672
- - `0.0`
673
 
674
- - `"progress_msg"`: (*Body parameter*)
675
- Message indicating current progress status.
676
- - `""`
677
-
678
- - `"process_begin_at"`: (*Body parameter*)
679
- Start time of the document processing.
680
- - `None`
681
-
682
- - `"process_duration"`: (*Body parameter*)
683
- Duration of the processing in seconds or minutes.
684
- - `0.0`
685
 
686
 
687
  ### Response
@@ -712,34 +711,34 @@ Parse files into chunks in a dataset
712
  ### Request
713
 
714
  - Method: POST
715
- - URL: `/api/v1/dataset/{dataset_id}/chunk`
716
  - Headers:
717
  - `content-Type: application/json`
718
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 
 
719
 
720
  #### Request example
721
 
722
- ```shell
723
  curl --request POST \
724
- --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
725
- --header 'Content-Type: application/json' \
726
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
727
- --raw '{
728
- "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
729
- }'
730
  ```
731
 
732
  #### Request parameters
733
 
734
  - `"dataset_id"`: (*Path parameter*)
735
- - `"documents"`: (*Body parameter*)
736
- - Documents to parse
737
 
738
  ### Response
739
 
740
  The successful response includes a JSON object like the following:
741
 
742
- ```shell
743
  {
744
  "code": 0
745
  }
@@ -747,10 +746,10 @@ The successful response includes a JSON object like the following:
747
 
748
  The error response includes a JSON object like the following:
749
 
750
- ```shell
751
  {
752
- "code": 3016,
753
- "message": "Can't connect database"
754
  }
755
  ```
756
 
@@ -762,35 +761,35 @@ Stop file parsing
762
 
763
  ### Request
764
 
765
- - Method: POST
766
- - URL: `/api/v1/dataset/{dataset_id}/chunk`
767
  - Headers:
768
  - `content-Type: application/json`
769
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
770
-
 
771
  #### Request example
772
 
773
- ```shell
774
  curl --request DELETE \
775
- --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
776
- --header 'Content-Type: application/json' \
777
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
778
- --raw '{
779
- "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
780
- }'
781
  ```
782
 
783
  #### Request parameters
784
 
785
  - `"dataset_id"`: (*Path parameter*)
786
- - `"documents"`: (*Body parameter*)
787
- - Documents to stop parsing
 
788
 
789
  ### Response
790
 
791
  The successful response includes a JSON object like the following:
792
 
793
- ```shell
794
  {
795
  "code": 0
796
  }
@@ -798,104 +797,98 @@ The successful response includes a JSON object like the following:
798
 
799
  The error response includes a JSON object like the following:
800
 
801
- ```shell
802
  {
803
- "code": 3016,
804
- "message": "Can't connect database"
805
  }
806
  ```
807
 
808
  ## Get document chunk list
809
 
810
- **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
811
 
812
  Get document chunk list
813
 
814
  ### Request
815
 
816
  - Method: GET
817
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
818
  - Headers:
819
- - `content-Type: application/json`
820
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
821
 
822
  #### Request example
823
 
824
- ```shell
825
  curl --request GET \
826
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
827
- --header 'Content-Type: application/json' \
828
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
829
  ```
830
 
831
  #### Request parameters
832
 
833
  - `"dataset_id"`: (*Path parameter*)
834
  - `"document_id"`: (*Path parameter*)
835
-
 
 
 
 
 
 
 
836
  ### Response
837
 
838
  The successful response includes a JSON object like the following:
839
 
840
- ```shell
841
  {
842
- "code": 0
843
  "data": {
844
- "chunks": [
845
- {
846
- "available_int": 1,
847
- "content": "<em>advantag</em>of ragflow increas accuraci and relev:by incorpor retriev inform , ragflow can gener respons that are more accur",
848
- "document_keyword": "ragflow_test.txt",
849
- "document_id": "77df9ef4759a11ef8bdd0242ac120004",
850
- "id": "4ab8c77cfac1a829c8d5ed022a0808c0",
851
- "image_id": "",
852
- "important_keywords": [],
853
- "positions": [
854
- ""
855
- ]
856
- }
857
- ],
858
  "doc": {
859
- "chunk_count": 5,
860
- "create_date": "Wed, 18 Sep 2024 08:46:16 GMT",
861
- "create_time": 1726649176833,
862
- "created_by": "134408906b6811efbcd20242ac120005",
863
- "id": "77df9ef4759a11ef8bdd0242ac120004",
864
- "knowledgebase_id": "77d9d24e759a11ef880c0242ac120004",
865
- "location": "ragflow_test.txt",
866
- "name": "ragflow_test.txt",
867
  "parser_config": {
868
- "chunk_token_count": 128,
869
- "delimiter": "\n!?。;!?",
870
- "layout_recognize": true,
871
- "task_page_size": 12
 
 
872
  },
873
- "parser_method": "naive",
874
- "process_begin_at": "Wed, 18 Sep 2024 08:46:16 GMT",
875
- "process_duation": 7.3213,
876
- "progress": 1.0,
877
- "progress_msg": "\nTask has been received.\nStart to parse.\nFinish parsing.\nFinished slicing files(5). Start to embedding the content.\nFinished embedding(6.16)! Start to build index!\nDone!",
878
- "run": "3",
879
- "size": 4209,
880
  "source_type": "local",
881
  "status": "1",
882
  "thumbnail": null,
883
- "token_count": 746,
884
  "type": "doc",
885
- "update_date": "Wed, 18 Sep 2024 08:46:23 GMT",
886
- "update_time": 1726649183321
887
  },
888
- "total": 1
889
- },
890
  }
891
  ```
892
 
893
  The error response includes a JSON object like the following:
894
 
895
- ```shell
896
  {
897
- "code": 3016,
898
- "message": "Can't connect database"
899
  }
900
  ```
901
 
@@ -908,55 +901,96 @@ Delete document chunks
908
  ### Request
909
 
910
  - Method: DELETE
911
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
912
  - Headers:
913
  - `content-Type: application/json`
914
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 
 
915
 
916
  #### Request example
917
 
918
- ```shell
919
  curl --request DELETE \
920
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
921
- --header 'Content-Type: application/json' \
922
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
923
- --raw '{
924
- "chunks": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
925
- }'
926
  ```
 
 
 
 
 
927
 
928
  ## Update document chunk
929
 
930
- **PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
931
 
932
  Update document chunk
933
 
934
  ### Request
935
 
936
  - Method: PUT
937
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
938
  - Headers:
939
  - `content-Type: application/json`
940
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
941
-
 
 
 
942
  #### Request example
943
 
944
- ```shell
945
  curl --request PUT \
946
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
947
- --header 'Content-Type: application/json' \
948
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
949
- --raw '{
950
- "chunk_id": "d87fb0b7212c15c18d0831677552d7de",
951
- "knowledgebase_id": null,
952
- "name": "",
953
- "content": "ragflow123",
954
- "important_keywords": [],
955
- "document_id": "e6bbba92759511efaa900242ac120004",
956
- "status": "1"
957
- }'
958
  ```
 
 
 
 
 
 
 
959
 
 
 
 
960
  ## Insert document chunks
961
 
962
  **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
@@ -966,50 +1000,187 @@ Insert document chunks
966
  ### Request
967
 
968
  - Method: POST
969
- - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
970
  - Headers:
971
  - `content-Type: application/json`
972
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
973
-
 
 
974
  #### Request example
975
 
976
- ```shell
977
  curl --request POST \
978
- --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
979
- --header 'Content-Type: application/json' \
980
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
981
- --raw '{
982
- "document_id": "97ad64b6759811ef9fc30242ac120004",
983
- "content": ["ragflow content", "ragflow content"]
984
- }'
985
  ```
 
 
 
 
 
986
 
 
 
 
 
987
  ## Dataset retrieval test
988
 
989
- **GET** `/api/v1/dataset/{dataset_id}/retrieval`
990
 
991
  Retrieval test of a dataset
992
 
993
  ### Request
994
 
995
- - Method: GET
996
- - URL: `/api/v1/dataset/{dataset_id}/retrieval`
997
  - Headers:
998
  - `content-Type: application/json`
999
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1000
-
 
 
 
 
 
 
 
 
 
 
 
1001
  #### Request example
1002
 
1003
- ```shell
1004
- curl --request GET \
1005
- --url http://{address}/api/v1/dataset/{dataset_id}/retrieval \
1006
- --header 'Content-Type: application/json' \
1007
- --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1008
- --raw '{
1009
- "query_text": "This is a cat."
1010
- }'
 
 
 
 
 
 
1011
  ```
1012
 
 
 
 
 
 
 
1013
  ## Create chat
1014
 
1015
  **POST** `/api/v1/chat`
@@ -1708,26 +1879,27 @@ Error
1708
 
1709
  ## Chat with a chat session
1710
 
1711
- **POST** `/api/v1/chat/{chat_id}/session/{session_id}/completion`
1712
 
1713
  Chat with a chat session
1714
 
1715
  ### Request
1716
 
1717
  - Method: POST
1718
- - URL: `http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion`
1719
  - Headers:
1720
  - `content-Type: application/json`
1721
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1722
  - Body:
1723
  - `question`: string
1724
  - `stream`: bool
 
1725
 
1726
 
1727
  #### Request example
1728
  ```bash
1729
  curl --request POST \
1730
- --url http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion \
1731
  --header 'Content-Type: application/json' \
1732
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1733
  --data-binary '{
@@ -1743,6 +1915,8 @@ curl --request POST \
1743
  - `stream`: (*Body Parameter*)
1744
  The approach of streaming text generation.
1745
  `False`
 
 
1746
  ### Response
1747
  Success
1748
  ```json
 
432
  }
433
  ```
434
 
435
+ ## Delete files from a dataset
436
+
437
+ **DELETE** `/api/v1/dataset/{dataset_id}/document`
438
+
439
+ Delete files from a dataset
440
+
441
+ ### Request
442
+
443
+ - Method: DELETE
444
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document`
445
+ - Headers:
446
+ - 'Content-Type: application/json'
447
+ - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
448
+ - Body:
449
+ - `ids`: List[str]
450
+ #### Request example
451
+
452
+ ```bash
453
+ curl --request DELETE \
454
+ --url http://{address}/api/v1/dataset/{dataset_id}/document \
455
+ --header 'Content-Type: application/json' \
456
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
457
+ --data '{
458
+ "ids": ["id_1","id_2"]
459
+ }'
460
+ ```
461
+
462
+ #### Request parameters
463
+
464
+ - `"ids"`: (*Body parameter*)
465
+ The IDs of the documents to be deleted.
466
+ ### Response
467
+
468
+ The successful response includes a JSON object like the following:
469
+
470
+ ```json
471
+ {
472
+ "code": 0
473
+ }
474
+ ```
475
+
476
+ - `"code"`: `integer`
477
+ `0`: The operation succeeds.
478
+
479
+
480
+ The error response includes a JSON object like the following:
481
+
482
+ ```json
483
+ {
484
+ "code": 102,
485
+ "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
486
+ }
487
+ ```
488
+
489
  ## Download a file from a dataset
490
 
491
  **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}`
492
 
493
+ Downloads a file from a dataset.
494
 
495
  ### Request
496
 
497
  - Method: GET
498
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}`
499
  - Headers:
 
500
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
501
  - Output:
502
  - '{FILE_NAME}'
 
504
 
505
  ```bash
506
  curl --request GET \
507
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
508
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
509
+ --output ./ragflow.txt
 
510
  ```
511
 
512
  #### Request parameters
 
518
 
519
  ### Response
520
 
521
+ The successful response includes a text object like the following:
522
 
523
  ```text
524
  test_2.
 
648
  - Headers:
649
  - `content-Type: application/json`
650
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
651
+ - Body:
652
+ - `name`:`string`
653
+ - `parser_method`:`string`
654
+ - `parser_config`:`dict`
655
  #### Request example
656
 
657
  ```bash
658
  curl --request PUT \
659
+ --url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \
660
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
661
  --header 'Content-Type: application/json' \
662
  --data '{
663
  "name": "manual.txt",
 
 
664
  "parser_method": "manual",
665
+ "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}
 
 
 
 
 
 
 
666
  }'
667
 
668
  ```
669
 
670
  #### Request parameters
671
 
 
 
 
 
 
 
 
 
672
  - `"parser_method"`: (*Body parameter*)
673
  Method used to parse the document.
674
+
675
 
676
  - `"parser_config"`: (*Body parameter*)
677
  Configuration object for the parser.
678
  - If the value is `None`, a dictionary with default values will be generated.
679
 
 
 
 
680
  - `"name"`: (*Body parameter*)
681
  Name or title of the document.
 
 
 
 
682
 
 
 
 
683
 
 
 
 
684
 
685
 
686
  ### Response
 
711
  ### Request
712
 
713
  - Method: POST
714
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
715
  - Headers:
716
  - `content-Type: application/json`
717
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
718
+ - Body:
719
+ - `document_ids`: List[str]
720
 
721
  #### Request example
722
 
723
+ ```bash
724
  curl --request POST \
725
+ --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
726
+ --header 'Content-Type: application/json' \
727
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
728
+ --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
 
 
729
  ```
730
 
731
  #### Request parameters
732
 
733
  - `"dataset_id"`: (*Path parameter*)
734
+ - `"document_ids"`: (*Body parameter*)
735
+ The IDs of the documents to be parsed.
736
 
737
  ### Response
738
 
739
  The successful response includes a JSON object like the following:
740
 
741
+ ```json
742
  {
743
  "code": 0
744
  }
 
746
 
747
  The error response includes a JSON object like the following:
748
 
749
+ ```json
750
  {
751
+ "code": 102,
752
+ "message": "`document_ids` is required"
753
  }
754
  ```
755
 
 
761
 
762
  ### Request
763
 
764
+ - Method: DELETE
765
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
766
  - Headers:
767
  - `content-Type: application/json`
768
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
769
+ - Body:
770
+ - `document_ids`: List[str]
771
  #### Request example
772
 
773
+ ```bash
774
  curl --request DELETE \
775
+ --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
776
+ --header 'Content-Type: application/json' \
777
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
778
+ --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
 
 
779
  ```
780
 
781
  #### Request parameters
782
 
783
  - `"dataset_id"`: (*Path parameter*)
784
+ - `"document_ids"`: (*Body parameter*)
785
+ The IDs of the documents for which parsing should be stopped.
786
+
787
 
788
  ### Response
789
 
790
  The successful response includes a JSON object like the following:
791
 
792
+ ```json
793
  {
794
  "code": 0
795
  }
 
797
 
798
  The error response includes a JSON object like the following:
799
 
800
+ ```json
801
  {
802
+ "code": 102,
803
+ "message": "`document_ids` is required"
804
  }
805
  ```
806
 
807
  ## Get document chunk list
808
 
809
+ **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
810
 
811
  Get document chunk list
812
 
813
  ### Request
814
 
815
  - Method: GET
816
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
817
  - Headers:
 
818
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
819
 
820
  #### Request example
821
 
822
+ ```bash
823
  curl --request GET \
824
+ --url 'http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}' \
825
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 
826
  ```
827
 
828
  #### Request parameters
829
 
830
  - `"dataset_id"`: (*Path parameter*)
831
  - `"document_id"`: (*Path parameter*)
832
+ - `"offset"`: (*Filter parameter*)
833
+ The starting record number for paging.
834
+ - `"keywords"`: (*Filter parameter*)
835
+ List chunks whose content contains the given keywords.
836
+ - `"limit"`: (*Filter parameter*)
837
+ The number of records to return.
838
+ - `"id"`: (*Filter parameter*)
839
+ The ID of the chunk to retrieve.
840
  ### Response
841
 
842
  The successful response includes a JSON object like the following:
843
 
844
+ ```json
845
  {
846
+ "code": 0,
847
  "data": {
848
+ "chunks": [],
 
 
 
 
849
  "doc": {
850
+ "chunk_num": 0,
851
+ "create_date": "Sun, 29 Sep 2024 03:47:29 GMT",
852
+ "create_time": 1727581649216,
853
+ "created_by": "69736c5e723611efb51b0242ac120007",
854
+ "id": "8cb781ec7e1511ef98ac0242ac120006",
855
+ "kb_id": "c7ee74067a2c11efb21c0242ac120006",
856
+ "location": "明天的天气是晴天.txt",
857
+ "name": "明天的天气是晴天.txt",
858
  "parser_config": {
859
+ "pages": [
860
+ [
861
+ 1,
862
+ 1000000
863
+ ]
864
+ ]
865
  },
866
+ "parser_id": "naive",
867
+ "process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT",
868
+ "process_duation": 1435.37,
869
+ "progress": 0.0370833,
870
+ "progress_msg": "\nTask has been received.",
871
+ "run": "1",
872
+ "size": 24,
873
  "source_type": "local",
874
  "status": "1",
875
  "thumbnail": null,
876
+ "token_num": 0,
877
  "type": "doc",
878
+ "update_date": "Tue, 15 Oct 2024 10:47:46 GMT",
879
+ "update_time": 1728989266371
880
  },
881
+ "total": 0
882
+ }
883
  }
884
  ```
885
 
886
  The error response includes a JSON object like the following:
887
 
888
+ ```json
889
  {
890
+ "code": 102,
891
+ "message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5."
892
  }
893
  ```
894
 
 
901
  ### Request
902
 
903
  - Method: DELETE
904
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
905
  - Headers:
906
  - `content-Type: application/json`
907
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
908
+ - Body:
909
+ - `chunk_ids`: List[str]
910
 
911
  #### Request example
912
 
913
+ ```bash
914
  curl --request DELETE \
915
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
916
+ --header 'Content-Type: application/json' \
917
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
918
+ --data '{
919
+ "chunk_ids": ["test_1", "test_2"]
920
+ }'
921
  ```
922
+ #### Request parameters
923
+
924
+ - `"chunk_ids"`: (*Body parameter*)
925
+ The IDs of the chunks to be deleted.
926
+
927
+ ### Response
928
+ Success
929
+ ```json
930
+ {
931
+ "code": 0
932
+ }
933
+ ```
934
+ Error
935
+ ```json
936
+ {
937
+ "code": 102,
938
+ "message": "`chunk_ids` is required"
939
+ }
940
+ ```
941
+
942
 
943
  ## Update document chunk
944
 
945
+ **PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
946
 
947
  Update document chunk
948
 
949
  ### Request
950
 
951
  - Method: PUT
952
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
953
  - Headers:
954
  - `content-Type: application/json`
955
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
956
+ - Body:
957
+ - `content`: str
958
+ - `important_keywords`: List[str]
959
+ - `available`: int
960
  #### Request example
961
 
962
+ ```bash
963
  curl --request PUT \
964
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \
965
+ --header 'Content-Type: application/json' \
966
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
967
+ --data '{
968
+ "content": "ragflow123",
969
+ "important_keywords": []
970
+ }'
 
 
 
 
 
971
  ```
972
+ #### Request parameters
973
+ - `"content"`: (*Body parameter*)
974
+ Contains the main text or information of the chunk.
975
+ - `"important_keywords"`: (*Body parameter*)
976
+ Lists the key terms or phrases that are significant or central to the chunk's content.
977
+ - `"available"`: (*Body parameter*)
978
+ Indicates the availability status: `0` means unavailable and `1` means available.
979
 
980
+ ### Response
981
+ Success
982
+ ```json
983
+ {
984
+ "code": 0
985
+ }
986
+ ```
987
+ Error
988
+ ```json
989
+ {
990
+ "code": 102,
991
+ "message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2"
992
+ }
993
+ ```
994
  ## Insert document chunks
995
 
996
  **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
 
1000
  ### Request
1001
 
1002
  - Method: POST
1003
+ - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
1004
  - Headers:
1005
  - `content-Type: application/json`
1006
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1007
+ - Body:
1008
+ - `content`: str
1009
+ - `important_keywords`: List[str]
1010
  #### Request example
1011
 
1012
+ ```bash
1013
  curl --request POST \
1014
+ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
1015
+ --header 'Content-Type: application/json' \
1016
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1017
+ --data '{
1018
+ "content": "ragflow content"
1019
+ }'
 
1020
  ```
1021
+ #### Request parameters
1022
+ - `content`: (*Body parameter*)
1023
+ Contains the main text or information of the chunk.
1024
+ - `important_keywords`: (*Body parameter*)
1025
+ Lists the key terms or phrases that are significant or central to the chunk's content.
1026
 
1027
+ ### Response
1028
+ Success
1029
+ ```json
1030
+ {
1031
+ "code": 0,
1032
+ "data": {
1033
+ "chunk": {
1034
+ "content": "ragflow content",
1035
+ "create_time": "2024-10-16 08:05:04",
1036
+ "create_timestamp": 1729065904.581025,
1037
+ "dataset_id": [
1038
+ "c7ee74067a2c11efb21c0242ac120006"
1039
+ ],
1040
+ "document_id": "5c5999ec7be811ef9cab0242ac120005",
1041
+ "id": "d78435d142bd5cf6704da62c778795c5",
1042
+ "important_keywords": []
1043
+ }
1044
+ }
1045
+ }
1046
+ ```
1047
+
1048
+ Error
1049
+ ```json
1050
+ {
1051
+ "code": 102,
1052
+ "message": "`content` is required"
1053
+ }
1054
+ ```
1055
  ## Dataset retrieval test
1056
 
1057
+ **POST** `/api/v1/retrieval`
1058
 
1059
  Retrieval test of a dataset
1060
 
1061
  ### Request
1062
 
1063
+ - Method: POST
1064
+ - URL: `http://{address}/api/v1/retrieval`
1065
  - Headers:
1066
  - `content-Type: application/json`
1067
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1068
+ - Body:
1069
+ - `question`: str
1070
+ - `datasets`: List[str]
1071
+ - `documents`: List[str]
1072
+ - `offset`: int
1073
+ - `limit`: int
1074
+ - `similarity_threshold`: float
1075
+ - `vector_similarity_weight`: float
1076
+ - `top_k`: int
1077
+ - `rerank_id`: string
1078
+ - `keyword`: bool
1079
+ - `highlight`: bool
1080
  #### Request example
1081
 
1082
+ ```bash
1083
+ curl --request POST \
1084
+ --url http://{address}/api/v1/retrieval \
1085
+ --header 'Content-Type: application/json' \
1086
+ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1087
+ --data '{
1088
+ "question": "What is advantage of ragflow?",
1089
+ "datasets": [
1090
+ "b2a62730759d11ef987d0242ac120004"
1091
+ ],
1092
+ "documents": [
1093
+ "77df9ef4759a11ef8bdd0242ac120004"
1094
+ ]
1095
+ }'
1096
  ```
1097
 
1098
+ #### Request parameters
1099
+ - `"question"`: (*Body parameter*)
1100
+ The user's question or search keywords.
1101
+ `""`
1102
+ - `"datasets"`: (*Body parameter*)
1103
+ The datasets to search within.
1104
+ `None`
1105
+ - `"documents"`: (*Body parameter*)
1106
+ The documents to search within. `None` means no limitation.
1107
+ `None`
1108
+ - `"offset"`: (*Body parameter*)
1109
+ The starting point of the retrieved records.
1110
+ `1`
1111
+
1112
+ - `"limit"`: (*Body parameter*)
1113
+ The maximum number of records to return.
1114
+ `30`
1115
+
1116
+ - `"similarity_threshold"`: (*Body parameter*)
1117
+ The minimum similarity score
1118
+ `0.2`
1119
+
1120
+ - `"vector_similarity_weight"`: (*Body parameter*)
1121
+ The weight of vector cosine similarity, `1 - x` is the term similarity weight
1122
+ `0.3`
1123
+
1124
+ - `"top_k"`: (*Body parameter*)
1125
+ The number of records involved in the vector cosine computation.
1126
+ `1024`
1127
+
1128
+ - `"rerank_id"`: (*Body parameter*)
1129
+ ID of the rerank model
1130
+ `None`
1131
+
1132
+ - `"keyword"`: (*Body parameter*)
1133
+ Whether keyword-based matching is enabled
1134
+ `False`
1135
+
1136
+ - `"highlight"`: (*Body parameter*)
1137
+ Whether to enable highlighting of matched terms in the results
1138
+ `False`
1139
+ ### Response
1140
+ Success
1141
+ ```json
1142
+ {
1143
+ "code": 0,
1144
+ "data": {
1145
+ "chunks": [
1146
+ {
1147
+ "content": "ragflow content",
1148
+ "content_ltks": "ragflow content",
1149
+ "document_id": "5c5999ec7be811ef9cab0242ac120005",
1150
+ "document_keyword": "1.txt",
1151
+ "highlight": "<em>ragflow</em> content",
1152
+ "id": "d78435d142bd5cf6704da62c778795c5",
1153
+ "img_id": "",
1154
+ "important_keywords": [
1155
+ ""
1156
+ ],
1157
+ "kb_id": "c7ee74067a2c11efb21c0242ac120006",
1158
+ "positions": [
1159
+ ""
1160
+ ],
1161
+ "similarity": 0.9669436601210759,
1162
+ "term_similarity": 1.0,
1163
+ "vector_similarity": 0.8898122004035864
1164
+ }
1165
+ ],
1166
+ "doc_aggs": [
1167
+ {
1168
+ "count": 1,
1169
+ "doc_id": "5c5999ec7be811ef9cab0242ac120005",
1170
+ "doc_name": "1.txt"
1171
+ }
1172
+ ],
1173
+ "total": 1
1174
+ }
1175
+ }
1176
+ ```
1177
+ Error
1178
+ ```json
1179
+ {
1180
+ "code": 102,
1181
+ "message": "`datasets` is required."
1182
+ }
1183
+ ```
1184
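+ The same request as a minimal Python sketch using `requests` (not part of the original docs; the address and token are placeholders):
+ ```python
+ import requests
+
+ # Placeholder values; substitute your own.
+ address = "ragflow.example.com:9380"
+
+ res = requests.post(
+     f"http://{address}/api/v1/retrieval",
+     headers={
+         "Content-Type": "application/json",
+         "Authorization": "Bearer YOUR_ACCESS_TOKEN",
+     },
+     json={
+         "question": "What is advantage of ragflow?",
+         "datasets": ["b2a62730759d11ef987d0242ac120004"],
+         # Optional knobs; the two values below are the documented defaults.
+         "similarity_threshold": 0.2,
+         "vector_similarity_weight": 0.3,
+         "highlight": True,  # wrap matched terms in <em> tags
+     },
+ )
+ body = res.json()
+ if body.get("code") != 0:
+     raise Exception(body.get("message"))
+ for chunk in body["data"]["chunks"]:
+     print(chunk["similarity"], chunk["content"])
+ ```
+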
  ## Create chat
1185
 
1186
  **POST** `/api/v1/chat`
 
1879
 
1880
  ## Chat with a chat session
1881
 
1882
+ **POST** `/api/v1/chat/{chat_id}/completion`
1883
 
1884
  Chat with a chat session
1885
 
1886
  ### Request
1887
 
1888
  - Method: POST
1889
+ - URL: `http://{address}/api/v1/chat/{chat_id}/completion`
1890
  - Headers:
1891
  - `content-Type: application/json`
1892
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
1893
  - Body:
1894
  - `question`: string
1895
  - `stream`: bool
1896
+ - `session_id`: str
1897
 
1898
 
1899
  #### Request example
1900
  ```bash
1901
  curl --request POST \
1902
+ --url http://{address}/api/v1/chat/{chat_id}/completion \
1903
  --header 'Content-Type: application/json' \
1904
  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
1905
  --data-binary '{
 
1915
  - `stream`: (*Body Parameter*)
1916
  The approach of streaming text generation.
1917
  `False`
1918
+ - `session_id`: (*Body Parameter*)
1919
+ The ID of the session. If not provided, a new session will be created.
1920
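+ A minimal Python sketch of a streaming call (not part of the original docs; placeholders throughout, and the line-by-line JSON handling assumes the same response framing that the SDK's `Session.ask` loop parses):
+ ```python
+ import json
+ import requests
+
+ # Placeholder values; substitute your own.
+ address = "ragflow.example.com:9380"
+ chat_id = "YOUR_CHAT_ID"
+
+ res = requests.post(
+     f"http://{address}/api/v1/chat/{chat_id}/completion",
+     headers={
+         "Content-Type": "application/json",
+         "Authorization": "Bearer YOUR_ACCESS_TOKEN",
+     },
+     # Omit session_id on the first call; a new session is created for you.
+     json={"question": "What is RAGFlow?", "stream": True},
+     stream=True,
+ )
+ for raw in res.iter_lines():
+     line = raw.decode("utf-8")
+     if not line.startswith("{"):
+         continue  # skip empty/non-JSON keep-alive lines
+     payload = json.loads(line)
+     if payload.get("code") != 0:
+         raise Exception(payload.get("message"))
+     data = payload.get("data")
+     if isinstance(data, dict):  # the final frame may not carry an answer dict
+         print(data.get("answer", ""))
+ ```
+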
  ### Response
1921
  Success
1922
  ```json
api/python_api_reference.md CHANGED
@@ -244,42 +244,117 @@ File management inside knowledge base
244
  ## Upload document
245
 
246
  ```python
247
- RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool
248
  ```
249
 
250
  ### Parameters
251
 
252
- #### name
 
253
 
254
- #### blob
255
 
 
 
 
 
 
 
 
256
 
 
 
 
 
 
257
 
258
  ### Returns
259
 
 
260
 
261
  ### Examples
262
 
 
 
263
  ---
264
 
265
- ## Retrieve document
266
 
267
  ```python
268
- RAGFlow.get_document(id:str=None,name:str=None) -> Document
269
  ```
270
 
271
  ### Parameters
272
 
273
- #### id: `str`, *Required*
274
 
275
- ID of the document to retrieve.
276
 
277
- #### name: `str`
 
 
 
 
 
 
278
 
279
- Name or title of the document.
 
 
280
 
 
 
 
 
 
281
  ### Returns
282
 
 
 
283
  A document object containing the following attributes:
284
 
285
  #### id: `str`
@@ -352,98 +427,14 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`.
352
  ```python
353
  from ragflow import RAGFlow
354
 
355
- rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
356
- doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt')
357
- print(doc)
358
- ```
359
-
360
- ---
361
-
362
- ## Save document settings
363
-
364
- ```python
365
- Document.save() -> bool
366
- ```
367
-
368
- ### Returns
369
-
370
- bool
371
-
372
- ### Examples
373
-
374
- ```python
375
- from ragflow import RAGFlow
376
-
377
- rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
378
- doc = rag.get_document(id="wdfxb5t547d")
379
- doc.parser_method= "manual"
380
- doc.save()
381
- ```
382
-
383
- ---
384
-
385
- ## Download document
386
-
387
- ```python
388
- Document.download() -> bytes
389
- ```
390
-
391
- ### Returns
392
-
393
- bytes of the document.
394
-
395
- ### Examples
396
-
397
- ```python
398
- from ragflow import RAGFlow
399
-
400
- rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
401
- doc = rag.get_document(id="wdfxb5t547d")
402
- open("~/ragflow.txt", "w+").write(doc.download())
403
- print(doc)
404
- ```
405
-
406
- ---
407
-
408
- ## List documents
409
-
410
- ```python
411
- Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> List[Document]
412
- ```
413
-
414
- ### Parameters
415
-
416
- #### keywords: `str`
417
-
418
- List documents whose name has the given keywords. Defaults to `None`.
419
-
420
- #### offset: `int`
421
-
422
- The beginning number of records for paging. Defaults to `0`.
423
-
424
- #### limit: `int`
425
-
426
- Records number to return, -1 means all of them. Records number to return, -1 means all of them.
427
-
428
- ### Returns
429
-
430
- List[Document]
431
-
432
- ### Examples
433
-
434
- ```python
435
- from ragflow import RAGFlow
436
-
437
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
438
  ds = rag.create_dataset(name="kb_1")
439
 
440
  filename1 = "~/ragflow.txt"
441
- rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read())
442
-
443
- filename2 = "~/infinity.txt"
444
- rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read())
445
-
446
- for d in ds.list_docs(keywords="rag", offset=0, limit=12):
447
  print(d)
448
  ```
449
 
@@ -452,12 +443,11 @@ for d in ds.list_docs(keywords="rag", offset=0, limit=12):
452
  ## Delete documents
453
 
454
  ```python
455
- Document.delete() -> bool
456
  ```
457
  ### Returns
458
 
459
- bool
460
- description: delete success or not
461
 
462
  ### Examples
463
 
@@ -465,119 +455,87 @@ description: delete success or not
465
  from ragflow import RAGFlow
466
 
467
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
468
- ds = rag.create_dataset(name="kb_1")
469
-
470
- filename1 = "~/ragflow.txt"
471
- rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read())
472
-
473
- filename2 = "~/infinity.txt"
474
- rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read())
475
- for d in ds.list_docs(keywords="rag", offset=0, limit=12):
476
- d.delete()
477
  ```
478
 
479
  ---
480
 
481
- ## Parse document
482
 
483
  ```python
484
- Document.async_parse() -> None
485
- RAGFLOW.async_parse_documents() -> None
486
  ```
487
 
488
  ### Parameters
489
 
 
 
490
  ????????????????????????????????????????????????????
491
 
492
  ### Returns
493
-
494
  ????????????????????????????????????????????????????
495
 
496
  ### Examples
497
 
498
- ```python
499
- #document parse and cancel
500
- rag = RAGFlow(API_KEY, HOST_ADDRESS)
501
- ds = rag.create_dataset(name="dataset_name")
502
- name3 = 'ai.pdf'
503
- path = 'test_data/ai.pdf'
504
- rag.create_document(ds, name=name3, blob=open(path, "rb").read())
505
- doc = rag.get_document(name="ai.pdf")
506
- doc.async_parse()
507
- print("Async parsing initiated")
508
- ```
509
-
510
- ---
511
-
512
- ## Cancel document parsing
513
-
514
- ```python
515
- rag.async_cancel_parse_documents(ids)
516
- RAGFLOW.async_cancel_parse_documents()-> None
517
- ```
518
-
519
- ### Parameters
520
-
521
- #### ids, `list[]`
522
-
523
- ### Returns
524
-
525
- ?????????????????????????????????????????????????
526
-
527
- ### Examples
528
-
529
  ```python
530
  #documents parse and cancel
531
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
532
  ds = rag.create_dataset(name="God5")
533
  documents = [
534
- {'name': 'test1.txt', 'path': 'test_data/test1.txt'},
535
- {'name': 'test2.txt', 'path': 'test_data/test2.txt'},
536
- {'name': 'test3.txt', 'path': 'test_data/test3.txt'}
537
  ]
538
-
539
- # Create documents in bulk
540
- for doc_info in documents:
541
- with open(doc_info['path'], "rb") as file:
542
- created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read())
543
- docs = [rag.get_document(name=doc_info['name']) for doc_info in documents]
544
- ids = [doc.id for doc in docs]
545
-
546
- rag.async_parse_documents(ids)
547
  print("Async bulk parsing initiated")
548
-
549
- for doc in docs:
550
- for progress, msg in doc.join(interval=5, timeout=10):
551
- print(f"{doc.name}: Progress: {progress}, Message: {msg}")
552
-
553
- cancel_result = rag.async_cancel_parse_documents(ids)
554
  print("Async bulk parsing cancelled")
555
  ```
556
 
557
- ---
558
-
559
- ## Join document
560
-
561
- ??????????????????
562
-
563
  ```python
564
- Document.join(interval=15, timeout=3600) -> iteral[Tuple[float, str]]
565
  ```
566
-
567
  ### Parameters
568
 
569
- #### interval: `int`
 
 
570
 
571
- Time interval in seconds for progress report. Defaults to `15`.
 
 
572
 
573
- #### timeout: `int`
574
-
575
- Timeout in seconds. Defaults to `3600`.
576
 
 
 
 
577
  ### Returns
 
578
 
579
- iteral[Tuple[float, str]]
 
 
580
 
 
 
 
581
  ## Add chunk
582
 
583
  ```python
@@ -587,6 +545,9 @@ Document.add_chunk(content:str) -> Chunk
587
  ### Parameters
588
 
589
  #### content: `str`, *Required*
 
 
 
590
 
591
  ### Returns
592
 
@@ -598,7 +559,10 @@ chunk
598
  from ragflow import RAGFlow
599
 
600
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
601
- doc = rag.get_document(id="wdfxb5t547d")
 
 
 
602
  chunk = doc.add_chunk(content="xxxxxxx")
603
  ```
604
 
@@ -607,12 +571,15 @@ chunk = doc.add_chunk(content="xxxxxxx")
607
  ## Delete chunk
608
 
609
  ```python
610
- Chunk.delete() -> bool
611
  ```
 
 
 
612
 
613
  ### Returns
614
 
615
- bool
616
 
617
  ### Examples
618
 
@@ -620,22 +587,34 @@ bool
620
  from ragflow import RAGFlow
621
 
622
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
623
- doc = rag.get_document(id="wdfxb5t547d")
 
 
 
624
  chunk = doc.add_chunk(content="xxxxxxx")
625
- chunk.delete()
626
  ```
627
 
628
  ---
629
 
630
- ## Save chunk contents
631
 
632
  ```python
633
- Chunk.save() -> bool
634
  ```
 
 
 
 
635
 
636
  ### Returns
637
 
638
- bool
639
 
640
  ### Examples
641
 
@@ -643,10 +622,12 @@ bool
643
  from ragflow import RAGFlow
644
 
645
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
646
- doc = rag.get_document(id="wdfxb5t547d")
 
 
 
647
  chunk = doc.add_chunk(content="xxxxxxx")
648
- chunk.content = "sdfx"
649
- chunk.save()
650
  ```
651
 
652
  ---
@@ -654,7 +635,7 @@ chunk.save()
654
  ## Retrieval
655
 
656
  ```python
657
- RAGFlow.retrieval(question:str, datasets:List[Dataset], document=List[Document]=None, offset:int=0, limit:int=6, similarity_threshold:float=0.1, vector_similarity_weight:float=0.3, top_k:int=1024) -> List[Chunk]
658
  ```
659
 
660
  ### Parameters
@@ -691,6 +672,15 @@ The weight of vector cosine similarity, 1 - x is the term similarity weight. Def
691
 
692
 Number of records engaged in vector cosine computation. Defaults to `1024`.
693
 
 
 
 
694
  ### Returns
695
 
696
  List[Chunk]
@@ -701,18 +691,17 @@ List[Chunk]
701
  from ragflow import RAGFlow
702
 
703
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
704
- ds = rag.get_dataset(name="ragflow")
 
705
  name = 'ragflow_test.txt'
706
- path = 'test_data/ragflow_test.txt'
707
  rag.create_document(ds, name=name, blob=open(path, "rb").read())
708
- doc = rag.get_document(name=name)
709
- doc.async_parse()
710
- # Wait for parsing to complete
711
- for progress, msg in doc.join(interval=5, timeout=30):
712
- print(progress, msg)
713
- for c in rag.retrieval(question="What's ragflow?",
714
- datasets=[ds], documents=[doc],
715
- offset=0, limit=6, similarity_threshold=0.1,
716
  vector_similarity_weight=0.3,
717
  top_k=1024
718
  ):
 
244
  ## Upload document
245
 
246
  ```python
247
+ DataSet.upload_documents(document_list: List[dict])
248
  ```
249
 
250
  ### Parameters
251
 
252
+ #### document_list: `List[dict]`
253
+ A list of dicts, each containing a `name` and a `blob`.
254
 
 
255
 
256
+ ### Returns
257
+ No return value.
258
+
259
+ ### Examples
260
+ ```python
261
+ from ragflow import RAGFlow
262
+
263
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
264
+ ds = rag.create_dataset(name="kb_1")
265
+ ds.upload_documents([{name="1.txt", blob="123"}, ...] }
266
+ ```
267
+ ---
268
+
269
+ ## Update document
270
+
271
+ ```python
272
+ Document.update(update_message:dict)
273
+ ```
274
+
275
+ ### Parameters
276
+
277
+ #### update_message:`dict`
278
+ Only `name`, `parser_config`, and `parser_method` can be changed.
279
+
280
+ ### Returns
281
+
282
+ No return value.
283
+
284
+ ### Examples
285
+
286
+ ```python
287
+ from ragflow import RAGFlow
288
+
289
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
290
+ ds=rag.list_datasets(id='id')
291
+ ds=ds[0]
292
+ doc = ds.list_documents(id="wdfxb5t547d")
293
+ doc = doc[0]
294
+ doc.update([{"parser_method": "manual"...}])
295
+ ```
296
+
297
+ ---
298
 
299
+ ## Download document
300
+
301
+ ```python
302
+ Document.download() -> bytes
303
+ ```
304
 
305
  ### Returns
306
 
307
+ bytes of the document.
308
 
309
  ### Examples
310
 
311
+ ```python
312
+ from ragflow import RAGFlow
313
+
314
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
315
+ ds=rag.list_datasets(id="id")
316
+ ds=ds[0]
317
+ doc = ds.list_documents(id="wdfxb5t547d")
318
+ doc = doc[0]
319
+ open("~/ragflow.txt", "wb+").write(doc.download())
320
+ print(doc)
321
+ ```
322
+
323
  ---
324
 
325
+ ## List documents
326
 
327
  ```python
328
+ Dataset.list_documents(id: str = None, keywords: str = None, offset: int = 0, limit: int = 1024, orderby: str = "create_time", desc: bool = True) -> List[Document]
329
  ```
330
 
331
  ### Parameters
332
 
333
+ #### id: `str`
334
 
335
+ The ID of the document to retrieve. Defaults to `None`.
336
 
337
+ #### keywords: `str`
338
+
339
+ List documents whose name has the given keywords. Defaults to `None`.
340
+
341
+ #### offset: `int`
342
+
343
+ The beginning number of records for paging. Defaults to `0`.
344
 
345
+ #### limit: `int`
346
+
347
+ Number of records to return; `-1` means all of them. Defaults to `1024`.
348
 
349
+ #### orderby: `str`
350
+ The field by which the records are sorted. Defaults to `"create_time"`.
351
+
352
+ #### desc: `bool`
353
+ Whether the records are sorted in descending order. Defaults to `True`.
354
  ### Returns
355
 
356
+ List[Document]
357
+
358
  A document object containing the following attributes:
359
 
360
  #### id: `str`
 
427
  ```python
428
  from ragflow import RAGFlow
429
 
 
 
 
 
430
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
431
  ds = rag.create_dataset(name="kb_1")
432
 
433
  filename1 = "~/ragflow.txt"
434
+ blob=open(filename1 , "rb").read()
435
+ list_files=[{"name":filename1,"blob":blob}]
436
+ ds.upload_documents(list_files)
437
+ for d in ds.list_documents(keywords="rag", offset=0, limit=12):
 
 
438
  print(d)
439
  ```
440
 
 
443
  ## Delete documents
444
 
445
  ```python
446
+ DataSet.delete_documents(ids: List[str] = None)
447
  ```
448
  ### Returns
449
 
450
+ No return value.
 
451
 
452
  ### Examples
453
 
 
455
  from ragflow import RAGFlow
456
 
457
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
458
+ ds = rag.list_datasets(name="kb_1")
459
+ ds = ds[0]
460
+ ds.delete_documents(ids=["id_1","id_2"])
 
 
 
 
 
 
461
  ```
462
 
463
  ---
464
 
465
+ ## Parse and stop parsing documents
466
 
467
  ```python
468
+ DataSet.async_parse_documents(document_ids: List[str]) -> None
469
+ DataSet.async_cancel_parse_documents(document_ids: List[str]) -> None
470
  ```
471
 
472
  ### Parameters
473
 
474
+ #### document_ids: `List[str]`
475
+ The IDs of the documents to be parsed.
476
477
 
478
  ### Returns
479
+ No return value.
480
481
 
482
  ### Examples
483
 
 
 
 
 
484
  ```python
485
  #documents parse and cancel
486
  rag = RAGFlow(API_KEY, HOST_ADDRESS)
487
  ds = rag.create_dataset(name="God5")
488
  documents = [
489
+ {'name': 'test1.txt', 'blob': open('./test_data/test1.txt',"rb").read()},
490
+ {'name': 'test2.txt', 'blob': open('./test_data/test2.txt',"rb").read()},
491
+ {'name': 'test3.txt', 'blob': open('./test_data/test3.txt',"rb").read()}
492
  ]
493
+ ds.upload_documents(documents)
494
+ documents=ds.list_documents(keywords="test")
495
+ ids=[]
496
+ for document in documents:
497
+ ids.append(document.id)
498
+ ds.async_parse_documents(ids)
 
 
 
499
  print("Async bulk parsing initiated")
500
+ ds.async_cancel_parse_documents(ids)
 
 
 
 
 
501
  print("Async bulk parsing cancelled")
502
  ```
503
 
504
+ ## List chunks
 
 
 
 
 
505
  ```python
506
+ Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = 30, id: str = None) -> List[Chunk]
507
  ```
 
508
  ### Parameters
509
 
510
+ - `keywords`: `str`
511
+ List chunks whose content contains the given keywords.
512
+ Defaults to `None`.
513

514
+ - `offset`: `int`
515
+ The beginning number of records for paging.
516
+ Defaults to `0`.
517

518
+ - `limit`: `int`
519
+ Number of records to return.
520
+ Defaults to `30`.
521

522
+ - `id`: `str`
523
+ The ID of the chunk to retrieve.
524
+ Defaults to `None`.
525
  ### Returns
526
+ List[Chunk]
527
 
528
+ ### Examples
529
+ ```python
530
+ from ragflow import RAGFlow
531
 
532
+ rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
533
+ ds = rag.list_datasets("123")
534
+ ds = ds[0]
535
+ ds.async_parse_documents(["wdfxb5t547d"])
536
+ for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
537
+ print(c)
538
+ ```
539
  ## Add chunk
540
 
541
  ```python
 
545
  ### Parameters
546
 
547
  #### content: `str`, *Required*
548
+ Contains the main text or information of the chunk.
549
+ #### important_keywords: `List[str]`
550
+ List the key terms or phrases that are significant or central to the chunk's content.
551
 
552
  ### Returns
553
 
 
559
  from ragflow import RAGFlow
560
 
561
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
562
+ ds = rag.list_datasets(id="123")
563
+ ds = ds[0]
564
+ doc = ds.list_documents(id="wdfxb5t547d")
565
+ doc = doc[0]
566
  chunk = doc.add_chunk(content="xxxxxxx")
567
  ```
568
 
 
571
  ## Delete chunk
572
 
573
  ```python
574
+ Document.delete_chunks(chunk_ids: List[str])
575
  ```
576
+ ### Parameters
577
+ #### chunk_ids: `List[str]`
578
+ A list of IDs of the chunks to delete.
579
 
580
  ### Returns
581
 
582
+ No return value.
583
 
584
  ### Examples
585
 
 
587
  from ragflow import RAGFlow
588
 
589
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
590
+ ds = rag.list_datasets(id="123")
591
+ ds = ds[0]
592
+ doc = ds.list_documents(id="wdfxb5t547d")
593
+ doc = doc[0]
594
  chunk = doc.add_chunk(content="xxxxxxx")
595
+ doc.delete_chunks(["id_1","id_2"])
596
  ```
597
 
598
  ---
599
 
600
+ ## Update chunk
601
 
602
  ```python
603
+ Chunk.update(update_message: dict)
604
  ```
605
+ ### Parameters
606
+ - `content`: `str`
606
+ Contains the main text or information of the chunk.
607
+
608
+ - `important_keywords`: `List[str]`
609
+ List the key terms or phrases that are significant or central to the chunk's content.
610
+
611
+ - `available`: `int`
612
+ The availability status: `0` means unavailable and `1` means available.
614
 
615
  ### Returns
616
 
617
+ No return value.
618
 
619
  ### Examples
620
 
 
622
  from ragflow import RAGFlow
623
 
624
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
625
+ ds = rag.list_datasets(id="123")
626
+ ds = ds[0]
627
+ doc = ds.list_documents(id="wdfxb5t547d")
628
+ doc = doc[0]
629
  chunk = doc.add_chunk(content="xxxxxxx")
630
+ chunk.update({"content":"sdfx...})
 
631
  ```
632
 
633
  ---
 
635
  ## Retrieval
636
 
637
  ```python
638
+ RAGFlow.retrieve(question:str="", datasets:List[str]=None, document=List[str]=None, offset:int=1, limit:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,higlight:bool=False) -> List[Chunk]
639
  ```
640
 
641
  ### Parameters
 
672
 
673
 Number of records engaged in vector cosine computation. Defaults to `1024`.
674
 
675
+ #### rerank_id: `str`
676
+ ID of the rerank model. Defaults to `None`.
677
+
678
+ #### keyword: `bool`
679
+ Whether keyword-based matching is enabled (`True`) or disabled (`False`). Defaults to `False`.
680
+
681
+ #### highlight: `bool`
682
+
683
+ Whether to enable highlighting of matched terms in the results (`True`) or not (`False`). Defaults to `False`.
684
  ### Returns
685
 
686
  List[Chunk]
 
691
  from ragflow import RAGFlow
692
 
693
  rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
694
+ ds = rag.list_datasets(name="ragflow")
695
+ ds = ds[0]
696
  name = 'ragflow_test.txt'
697
+ path = './test_data/ragflow_test.txt'
698
  ds.upload_documents([{"name": name, "blob": open(path, "rb").read()}])
699
+ doc = ds.list_documents(name=name)
700
+ doc = doc[0]
701
+ ds.async_parse_documents([doc.id])
702
+ for c in rag.retrieve(question="What's ragflow?",
703
+ datasets=[ds.id], documents=[doc.id],
704
+ offset=1, limit=30, similarity_threshold=0.2,
 
 
705
  vector_similarity_weight=0.3,
706
  top_k=1024
707
  ):
sdk/python/ragflow/modules/chunk.py CHANGED
@@ -17,32 +17,11 @@ class Chunk(Base):
17
  res_dict.pop(k)
18
  super().__init__(rag, res_dict)
19
 
20
- def delete(self) -> bool:
21
- """
22
- Delete the chunk in the document.
23
- """
24
- res = self.post('/doc/chunk/rm',
25
- {"document_id": self.document_id, 'chunk_ids': [self.id]})
26
- res = res.json()
27
- if res.get("retmsg") == "success":
28
- return True
29
- raise Exception(res["retmsg"])
30
 
31
- def save(self) -> bool:
32
- """
33
- Save the document details to the server.
34
- """
35
- res = self.post('/doc/chunk/set',
36
- {"chunk_id": self.id,
37
- "knowledgebase_id": self.knowledgebase_id,
38
- "name": self.document_name,
39
- "content": self.content,
40
- "important_keywords": self.important_keywords,
41
- "document_id": self.document_id,
42
- "available": self.available,
43
- })
44
  res = res.json()
45
- if res.get("retmsg") == "success":
46
- return True
47
- raise Exception(res["retmsg"])
48
 
 
17
  res_dict.pop(k)
18
  super().__init__(rag, res_dict)
19
 
 
 
20
 
21
+ def update(self, update_message: dict):
22
+ res = self.put(f"/dataset/{self.knowledgebase_id}/document/{self.document_id}/chunk/{self.id}", update_message)
 
 
 
 
23
  res = res.json()
24
+ if res.get("code") != 0 :
25
+ raise Exception(res["message"])
26
+
27
 
sdk/python/ragflow/modules/dataset.py CHANGED
@@ -65,3 +65,14 @@ class DataSet(Base):
65
  if res.get("code") != 0:
66
  raise Exception(res["message"])
67
 
 
 
 
65
  if res.get("code") != 0:
66
  raise Exception(res["message"])
67
 
68
+ def async_parse_documents(self, document_ids):
69
+ res = self.post(f"/dataset/{self.id}/chunk", {"document_ids": document_ids})
70
+ res = res.json()
71
+ if res.get("code") != 0:
72
+ raise Exception(res.get("message"))
73
+
74
+ def async_cancel_parse_documents(self, document_ids):
75
+ res = self.rm(f"/dataset/{self.id}/chunk", {"document_ids": document_ids})
76
+ res = res.json()
77
+ if res.get("code") != 0:
78
+ raise Exception(res.get("message"))
sdk/python/ragflow/modules/document.py CHANGED
@@ -1,7 +1,10 @@
1
  import time
2
 
 
 
3
  from .base import Base
4
  from .chunk import Chunk
 
5
 
6
 
7
  class Document(Base):
@@ -29,160 +32,28 @@ class Document(Base):
29
  res_dict.pop(k)
30
  super().__init__(rag, res_dict)
31
 
32
- def update(self,update_message:dict) -> bool:
33
- """
34
- Save the document details to the server.
35
- """
36
- res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message)
37
- res = res.json()
38
- if res.get("code") != 0:
39
- raise Exception(res["message"])
40
-
41
- def delete(self) -> bool:
42
- """
43
- Delete the document from the server.
44
- """
45
- res = self.rm('/doc/delete',
46
- {"document_id": self.id})
47
  res = res.json()
48
- if res.get("retmsg") == "success":
49
- return True
50
- raise Exception(res["retmsg"])
51
-
52
- def download(self) -> bytes:
53
- """
54
- Download the document content from the server using the Flask API.
55
-
56
- :return: The downloaded document content in bytes.
57
- """
58
- # Construct the URL for the API request using the document ID and knowledge base ID
59
- res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}")
60
-
61
- # Check the response status code to ensure the request was successful
62
- if res.status_code == 200:
63
- # Return the document content as bytes
64
- return res.content
65
- else:
66
- # Handle the error and raise an exception
67
- raise Exception(
68
- f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
69
- )
70
-
71
- def async_parse(self):
72
- """
73
- Initiate document parsing asynchronously without waiting for completion.
74
- """
75
- try:
76
- # Construct request data including document ID and run status (assuming 1 means to run)
77
- data = {"document_ids": [self.id], "run": 1}
78
-
79
- # Send a POST request to the specified parsing status endpoint to start parsing
80
- res = self.post(f'/doc/run', data)
81
-
82
- # Check the server response status code
83
- if res.status_code != 200:
84
- raise Exception(f"Failed to start async parsing: {res.text}")
85
-
86
- print("Async parsing started successfully.")
87
-
88
- except Exception as e:
89
- # Catch and handle exceptions
90
- print(f"Error occurred during async parsing: {str(e)}")
91
- raise
92
-
93
- import time
94
-
95
- def join(self, interval=5, timeout=3600):
96
- """
97
- Wait for the asynchronous parsing to complete and yield parsing progress periodically.
98
-
99
- :param interval: The time interval (in seconds) for progress reports.
100
- :param timeout: The timeout (in seconds) for the parsing operation.
101
- :return: An iterator yielding parsing progress and messages.
102
- """
103
- start_time = time.time()
104
- while time.time() - start_time < timeout:
105
- # Check the parsing status
106
- res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]})
107
- res_data = res.json()
108
- data = res_data.get("data", [])
109
-
110
- # Retrieve progress and status message
111
- progress = data.get("progress", 0)
112
- progress_msg = data.get("status", "")
113
 
114
- yield progress, progress_msg # Yield progress and message
115
-
116
- if progress == 100: # Parsing completed
117
- break
118
-
119
- time.sleep(interval)
120
-
121
- def cancel(self):
122
- """
123
- Cancel the parsing task for the document.
124
- """
125
- try:
126
- # Construct request data, including document ID and action to cancel (assuming 2 means cancel)
127
- data = {"document_ids": [self.id], "run": 2}
128
-
129
- # Send a POST request to the specified parsing status endpoint to cancel parsing
130
- res = self.post(f'/doc/run', data)
131
-
132
- # Check the server response status code
133
- if res.status_code != 200:
134
- print("Failed to cancel parsing. Server response:", res.text)
135
- else:
136
- print("Parsing cancelled successfully.")
137
-
138
- except Exception as e:
139
- print(f"Error occurred during async parsing cancellation: {str(e)}")
140
- raise
141
-
142
- def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available_int=None):
143
- """
144
- List all chunks associated with this document by calling the external API.
145
-
146
- Args:
147
- page (int): The page number to retrieve (default 1).
148
- size (int): The number of chunks per page (default 30).
149
- keywords (str): Keywords for searching specific chunks (default "").
150
- available_int (int): Filter for available chunks (optional).
151
-
152
- Returns:
153
- list: A list of chunks returned from the API.
154
- """
155
- data = {
156
- "document_id": self.id,
157
- "page": page,
158
- "size": size,
159
- "keywords": keywords,
160
- "offset":offset,
161
- "limit":limit
162
- }
163
-
164
- if available_int is not None:
165
- data["available_int"] = available_int
166
-
167
- res = self.post(f'/doc/chunk/list', data)
168
- if res.status_code == 200:
169
- res_data = res.json()
170
- if res_data.get("retmsg") == "success":
171
- chunks=[]
172
- for chunk_data in res_data["data"].get("chunks", []):
173
- chunk=Chunk(self.rag,chunk_data)
174
- chunks.append(chunk)
175
- return chunks
176
- else:
177
- raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
178
- else:
179
- raise Exception(f"API request failed with status code {res.status_code}")
180
 
181
  def add_chunk(self, content: str):
182
- res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content})
183
- if res.status_code == 200:
184
- res_data = res.json().get("data")
185
- chunk_data = res_data.get("chunk")
186
- return Chunk(self.rag,chunk_data)
187
- else:
188
- raise Exception(f"Failed to add chunk: {res.status_code} {res.text}")
 
 
 
 
 
1
  import time
2
 
3
5
  from .base import Base
6
  from .chunk import Chunk
7
+ from typing import List
8
 
9
 
10
  class Document(Base):
 
32
  res_dict.pop(k)
33
  super().__init__(rag, res_dict)
34
 
35
+ def list_chunks(self, offset=0, limit=30, keywords="", id: str = None):
36
+ data = {"document_id": self.id, "keywords": keywords, "offset": offset, "limit": limit, "id": id}
37
+ res = self.get(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', data)
 
 
 
 
38
  res = res.json()
39
+ if res.get("code") == 0:
40
+ chunks = []
41
+ for chunk_data in res["data"].get("chunks"):
42
+ chunk = Chunk(self.rag, chunk_data)
43
+ chunks.append(chunk)
44
+ return chunks
45
+ raise Exception(res.get("message"))
 
 
 
 
46
 
 
 
 
47
 
48
  def add_chunk(self, content: str):
49
+ res = self.post(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', {"content":content})
50
+ res = res.json()
51
+ if res.get("code") == 0:
52
+ return Chunk(self.rag,res["data"].get("chunk"))
53
+ raise Exception(res.get("message"))
54
+
55
+ def delete_chunks(self, ids: List[str]):
56
+ res = self.rm(f"/dataset/{self.knowledgebase_id}/document/{self.id}/chunk", {"ids": ids})
57
+ res = res.json()
58
+ if res.get("code")!=0:
59
+ raise Exception(res.get("message"))
sdk/python/ragflow/modules/session.py CHANGED
@@ -15,8 +15,8 @@ class Session(Base):
15
  for message in self.messages:
16
  if "reference" in message:
17
  message.pop("reference")
18
- res = self.post(f"/chat/{self.chat_id}/session/{self.id}/completion",
19
- {"question": question, "stream": True}, stream=stream)
20
  for line in res.iter_lines():
21
  line = line.decode("utf-8")
22
  if line.startswith("{"):
@@ -82,3 +82,4 @@ class Chunk(Base):
82
  self.term_similarity = None
83
  self.positions = None
84
  super().__init__(rag, res_dict)
 
 
15
  for message in self.messages:
16
  if "reference" in message:
17
  message.pop("reference")
18
+ res = self.post(f"/chat/{self.chat_id}/completion",
19
+ {"question": question, "stream": True,"session_id":self.id}, stream=stream)
20
  for line in res.iter_lines():
21
  line = line.decode("utf-8")
22
  if line.startswith("{"):
 
82
  self.term_similarity = None
83
  self.positions = None
84
  super().__init__(rag, res_dict)
85
+
sdk/python/ragflow/ragflow.py CHANGED
@@ -158,105 +158,30 @@ class RAGFlow:
158
  raise Exception(res["message"])
159
 
160
 
161
-
162
- def async_parse_documents(self, doc_ids):
163
- """
164
- Asynchronously start parsing multiple documents without waiting for completion.
165
-
166
- :param doc_ids: A list containing multiple document IDs.
167
- """
168
- try:
169
- if not doc_ids or not isinstance(doc_ids, list):
170
- raise ValueError("doc_ids must be a non-empty list of document IDs")
171
-
172
- data = {"document_ids": doc_ids, "run": 1}
173
-
174
- res = self.post(f'/doc/run', data)
175
-
176
- if res.status_code != 200:
177
- raise Exception(f"Failed to start async parsing for documents: {res.text}")
178
-
179
- print(f"Async parsing started successfully for documents: {doc_ids}")
180
-
181
- except Exception as e:
182
- print(f"Error occurred during async parsing for documents: {str(e)}")
183
- raise
184
-
185
- def async_cancel_parse_documents(self, doc_ids):
186
- """
187
- Cancel the asynchronous parsing of multiple documents.
188
-
189
- :param doc_ids: A list containing multiple document IDs.
190
- """
191
- try:
192
- if not doc_ids or not isinstance(doc_ids, list):
193
- raise ValueError("doc_ids must be a non-empty list of document IDs")
194
- data = {"document_ids": doc_ids, "run": 2}
195
- res = self.post(f'/doc/run', data)
196
-
197
- if res.status_code != 200:
198
- raise Exception(f"Failed to cancel async parsing for documents: {res.text}")
199
-
200
- print(f"Async parsing canceled successfully for documents: {doc_ids}")
201
-
202
- except Exception as e:
203
- print(f"Error occurred during canceling parsing for documents: {str(e)}")
204
- raise
205
-
206
- def retrieval(self,
207
- question,
208
- datasets=None,
209
- documents=None,
210
- offset=0,
211
- limit=6,
212
- similarity_threshold=0.1,
213
- vector_similarity_weight=0.3,
214
- top_k=1024):
215
- """
216
- Perform document retrieval based on the given parameters.
217
-
218
- :param question: The query question.
219
- :param datasets: A list of datasets (optional, as documents may be provided directly).
220
- :param documents: A list of documents (if specific documents are provided).
221
- :param offset: Offset for the retrieval results.
222
- :param limit: Maximum number of retrieval results.
223
- :param similarity_threshold: Similarity threshold.
224
- :param vector_similarity_weight: Weight of vector similarity.
225
- :param top_k: Number of top most similar documents to consider (for pre-filtering or ranking).
226
-
227
- Note: This is a hypothetical implementation and may need adjustments based on the actual backend service API.
228
- """
229
- try:
230
- data = {
231
- "question": question,
232
- "datasets": datasets if datasets is not None else [],
233
- "documents": [doc.id if hasattr(doc, 'id') else doc for doc in
234
- documents] if documents is not None else [],
235
  "offset": offset,
236
  "limit": limit,
237
  "similarity_threshold": similarity_threshold,
238
  "vector_similarity_weight": vector_similarity_weight,
239
  "top_k": top_k,
240
  "knowledgebase_id": datasets,
 
 
 
 
 
 
 
241
  }
242
 
243
  # Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
244
- res = self.post(f'/doc/retrieval_test', data)
245
-
246
- # Check the response status code
247
- if res.status_code == 200:
248
- res_data = res.json()
249
- if res_data.get("retmsg") == "success":
250
- chunks = []
251
- for chunk_data in res_data["data"].get("chunks", []):
252
- chunk = Chunk(self, chunk_data)
253
- chunks.append(chunk)
254
- return chunks
255
- else:
256
- raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
257
- else:
258
- raise Exception(f"API request failed with status code {res.status_code}")
259
-
260
- except Exception as e:
261
- print(f"An error occurred during retrieval: {e}")
262
- raise
 
158
  raise Exception(res["message"])
159
 
160
 
161
+ def retrieve(self, question="",datasets=None,documents=None, offset=1, limit=30, similarity_threshold=0.2,vector_similarity_weight=0.3,top_k=1024,rerank_id:str=None,keyword:bool=False,):
162
+ data_params = {
 
 
 
 
163
  "offset": offset,
164
  "limit": limit,
165
  "similarity_threshold": similarity_threshold,
166
  "vector_similarity_weight": vector_similarity_weight,
167
  "top_k": top_k,
168
  "knowledgebase_id": datasets,
169
+ "rerank_id":rerank_id,
170
+ "keyword":keyword
171
+ }
172
+ data_json = {
173
+ "question": question,
174
+ "datasets": datasets,
175
+ "documents": documents
176
  }
177
 
178
  # Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
179
+ res = self.get('/retrieval', data_params, data_json)
180
+ res = res.json()
181
+ if res.get("code") ==0:
182
+ chunks=[]
183
+ for chunk_data in res["data"].get("chunks"):
184
+ chunk=Chunk(self,chunk_data)
185
+ chunks.append(chunk)
186
+ return chunks
187
+ raise Exception(res.get("message"))
 
 
 
 
sdk/python/test/t_document.py CHANGED
@@ -63,17 +63,13 @@ class TestDocument(TestSdk):
63
  # Check if the retrieved document is of type Document
64
  if isinstance(doc, Document):
65
  # Download the document content and save it to a file
66
- try:
67
- with open("ragflow.txt", "wb+") as file:
68
- file.write(doc.download())
69
- # Print the document object for debugging
70
- print(doc)
71
-
72
- # Assert that the download was successful
73
- assert True, "Document downloaded successfully."
74
- except Exception as e:
75
- # If an error occurs, raise an assertion error
76
- assert False, f"Failed to download document, error: {str(e)}"
77
  else:
78
  # If the document retrieval fails, assert failure
79
  assert False, f"Failed to get document, error: {doc}"
@@ -100,7 +96,7 @@ class TestDocument(TestSdk):
100
  blob2 = b"Sample document content for ingestion test222."
101
  list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
102
  ds.upload_documents(list_1)
103
- for d in ds.list_docs(keywords="test", offset=0, limit=12):
104
  assert isinstance(d, Document), "Failed to upload documents"
105
 
106
  def test_delete_documents_in_dataset_with_success(self):
@@ -123,16 +119,11 @@ class TestDocument(TestSdk):
123
  blob1 = b"Sample document content for ingestion test333."
124
  name2 = "Test Document444.txt"
125
  blob2 = b"Sample document content for ingestion test444."
126
- name3 = 'test.txt'
127
- path = 'test_data/test.txt'
128
- rag.create_document(ds, name=name3, blob=open(path, "rb").read())
129
- rag.create_document(ds, name=name1, blob=blob1)
130
- rag.create_document(ds, name=name2, blob=blob2)
131
- for d in ds.list_docs(keywords="document", offset=0, limit=12):
132
  assert isinstance(d, Document)
133
- d.delete()
134
- print(d)
135
- remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
136
  assert len(remaining_docs) == 0, "Documents were not properly deleted."
137
 
138
  def test_parse_and_cancel_document(self):
@@ -144,16 +135,15 @@ class TestDocument(TestSdk):
144
 
145
  # Define the document name and path
146
  name3 = 'westworld.pdf'
147
- path = 'test_data/westworld.pdf'
148
 
149
  # Create a document in the dataset using the file path
150
- rag.create_document(ds, name=name3, blob=open(path, "rb").read())
151
 
152
  # Retrieve the document by name
153
- doc = rag.get_document(name="westworld.pdf")
154
-
155
- # Initiate asynchronous parsing
156
- doc.async_parse()
157
 
158
  # Print message to confirm asynchronous parsing has been initiated
159
  print("Async parsing initiated")
 
63
  # Check if the retrieved document is of type Document
64
  if isinstance(doc, Document):
65
  # Download the document content and save it to a file
66
+ with open("./ragflow.txt", "wb+") as file:
67
+ file.write(doc.download())
68
+ # Print the document object for debugging
69
+ print(doc)
70
+
71
+ # Assert that the download was successful
72
+ assert True, f"Failed to download document, error: {doc}"
 
 
 
 
73
  else:
74
  # If the document retrieval fails, assert failure
75
  assert False, f"Failed to get document, error: {doc}"
 
96
  blob2 = b"Sample document content for ingestion test222."
97
  list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
98
  ds.upload_documents(list_1)
99
+ for d in ds.list_documents(keywords="test", offset=0, limit=12):
100
  assert isinstance(d, Document), "Failed to upload documents"
101
 
102
  def test_delete_documents_in_dataset_with_success(self):
 
119
  blob1 = b"Sample document content for ingestion test333."
120
  name2 = "Test Document444.txt"
121
  blob2 = b"Sample document content for ingestion test444."
122
+ ds.upload_documents([{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}])
123
+ for d in ds.list_documents(keywords="document", offset=0, limit=12):
 
 
 
 
124
  assert isinstance(d, Document)
125
+ ds.delete_documents([d.id])
126
+ remaining_docs = ds.list_documents(keywords="rag", offset=0, limit=12)
 
127
  assert len(remaining_docs) == 0, "Documents were not properly deleted."
128
 
129
  def test_parse_and_cancel_document(self):
 
135
 
136
  # Define the document name and path
137
  name3 = 'westworld.pdf'
138
+ path = './test_data/westworld.pdf'
139
 
140
  # Create a document in the dataset using the file path
141
+ ds.upload_documents({"name":name3, "blob":open(path, "rb").read()})
142
 
143
  # Retrieve the document by name
144
+ doc = rag.list_documents(name="westworld.pdf")
145
+ doc = doc[0]
146
+ ds.async_parse_documents(document_ids=[])
 
147
 
148
  # Print message to confirm asynchronous parsing has been initiated
149
  print("Async parsing initiated")