JobSmithManipulation
Kevin Hu
committed on
Commit
·
ce45214
1
Parent(s):
5e72d47
update sdk document (#2374)
Browse files### What problem does this PR solve?
_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: Kevin Hu <[email protected]>
- api/apps/sdk/doc.py +180 -0
- sdk/python/ragflow/__init__.py +2 -1
- sdk/python/ragflow/modules/dataset.py +40 -0
- sdk/python/ragflow/modules/document.py +75 -0
- sdk/python/ragflow/ragflow.py +30 -1
- sdk/python/test/ragflow.txt +1 -0
- sdk/python/test/t_document.py +144 -0
api/apps/sdk/doc.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from io import BytesIO
|
2 |
+
|
3 |
+
from flask import request,send_file
|
4 |
+
from api.utils.api_utils import get_json_result, construct_json_result, server_error_response
|
5 |
+
from api.utils.api_utils import get_json_result, token_required, get_data_error_result
|
6 |
+
from api.db import FileType, ParserType, FileSource, TaskStatus
|
7 |
+
from api.db.db_models import File
|
8 |
+
from api.db.services.document_service import DocumentService
|
9 |
+
from api.db.services.file2document_service import File2DocumentService
|
10 |
+
from api.db.services.file_service import FileService
|
11 |
+
from api.db.services.knowledgebase_service import KnowledgebaseService
|
12 |
+
from api.db.services.user_service import TenantService, UserTenantService
|
13 |
+
from api.settings import RetCode
|
14 |
+
from api.utils.api_utils import construct_json_result, construct_error_response
|
15 |
+
from rag.utils.storage_factory import STORAGE_IMPL
|
16 |
+
|
17 |
+
|
18 |
+
@manager.route('/dataset/<dataset_id>/documents/upload', methods=['POST'])
@token_required
def upload(dataset_id, tenant_id):
    """
    Upload one or more files into the dataset (knowledge base) `dataset_id`.

    Expects a multipart/form-data request with one or more 'file' parts.
    Returns a JSON result: data=True on success, otherwise a structured
    error with the appropriate retcode.
    """
    if 'file' not in request.files:
        return get_json_result(
            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
    file_objs = request.files.getlist('file')
    # Reject the whole batch if any part arrived without a filename.
    for file_obj in file_objs:
        if file_obj.filename == '':
            return get_json_result(
                data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
    e, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not e:
        # Bug fix: previously raised a bare LookupError, which surfaced to the
        # API client as an unformatted 500 instead of the structured JSON
        # error every sibling endpoint returns.
        return get_json_result(
            data=False,
            retmsg=f"Can't find the knowledgebase with ID {dataset_id}!",
            retcode=RetCode.DATA_ERROR)
    err, _ = FileService.upload_document(kb, file_objs, tenant_id)
    if err:
        # upload_document returns a list of per-file error strings.
        return get_json_result(
            data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
    return get_json_result(data=True)
|
37 |
+
|
38 |
+
|
39 |
+
@manager.route('/infos', methods=['GET'])
@token_required
def docinfos(tenant_id):
    """
    Fetch a single document's metadata, identified by the 'id' or 'name'
    query argument ('id' takes precedence when both are present).
    """
    req = request.args
    if "id" in req:
        doc_id = req["id"]
        e, doc = DocumentService.get_by_id(doc_id)
        # Bug fix: the success flag was previously ignored; a failed lookup
        # would crash on doc.to_json().
        if not e:
            return get_data_error_result(retmsg="Document not found!")
        return get_json_result(data=doc.to_json())
    if "name" in req:
        doc_name = req["name"]
        doc_id = DocumentService.get_doc_id_by_doc_name(doc_name)
        e, doc = DocumentService.get_by_id(doc_id)
        if not e:
            return get_data_error_result(retmsg="Document not found!")
        return get_json_result(data=doc.to_json())
    # Bug fix: previously fell through and implicitly returned None when
    # neither 'id' nor 'name' was supplied, which Flask rejects at runtime.
    return get_json_result(
        data=False, retmsg="Either 'id' or 'name' must be provided!",
        retcode=RetCode.ARGUMENT_ERROR)
|
52 |
+
|
53 |
+
|
54 |
+
@manager.route('/save', methods=['POST'])
@token_required
def save_doc(tenant_id):
    """
    Update a document's fields from the JSON request body.

    The target document is identified by 'id' (preferred) or 'name'.
    Returns the number of updated rows on success, 404-style JSON when the
    document does not exist, and an error message on failure.
    """
    req = request.json  # Expecting JSON input
    doc_id = None
    # Bug fix: 'id' now takes precedence. Previously a present 'name' key
    # overwrote the explicit id via a name lookup — which breaks renames,
    # since the SDK sends both keys and the *new* name is not yet in the DB.
    if "id" in req:
        doc_id = req["id"]
    elif "name" in req:
        doc_id = DocumentService.get_doc_id_by_doc_name(req["name"])
    # Bug fix: doc_id used to be referenced while unbound (NameError) when
    # the request carried neither 'id' nor 'name'.
    if not doc_id:
        return get_json_result(
            data=False, retmsg="Either 'id' or 'name' is required!",
            retcode=RetCode.ARGUMENT_ERROR)
    data = request.json
    # Call the update method with the provided id and data
    try:
        num = DocumentService.update_by_id(doc_id, data)
        if num > 0:
            return get_json_result(retmsg="success", data={"updated_count": num})
        else:
            return get_json_result(retcode=404, retmsg="Document not found")
    except Exception as e:
        return get_json_result(retmsg=f"Error occurred: {str(e)}")
|
73 |
+
|
74 |
+
|
75 |
+
@manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
@token_required
def download_document(dataset_id, document_id, tenant_id):
    """
    Stream a document's raw content back to the client as a file attachment.

    Bug fix: @token_required invokes the wrapped handler with a `tenant_id`
    keyword argument; the original signature omitted it, so every call
    failed with TypeError("download_document() got an unexpected keyword
    argument 'tenant_id'") — exactly the error captured in
    sdk/python/test/ragflow.txt.
    """
    try:
        # Check whether there is this dataset
        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
        if not exist:
            return construct_json_result(code=RetCode.DATA_ERROR,
                                         message=f"This dataset '{dataset_id}' cannot be found!")

        # Check whether there is this document
        exist, document = DocumentService.get_by_id(document_id)
        if not exist:
            return construct_json_result(message=f"This document '{document_id}' cannot be found!",
                                         code=RetCode.ARGUMENT_ERROR)

        # Resolve the storage (minio) address, then fetch the raw bytes.
        doc_id, doc_location = File2DocumentService.get_minio_address(doc_id=document_id)
        file_stream = STORAGE_IMPL.get(doc_id, doc_location)
        if not file_stream:
            return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)

        file = BytesIO(file_stream)

        # Use send_file with the original filename and a generic MIME type.
        return send_file(
            file,
            as_attachment=True,
            download_name=document.name,
            mimetype='application/octet-stream'  # Set a default MIME type
        )

    # Error
    except Exception as e:
        return construct_error_response(e)
|
110 |
+
|
111 |
+
@manager.route('/dataset/<dataset_id>/documents', methods=['GET'])
@token_required
def list_docs(dataset_id, tenant_id):
    """
    List documents of a knowledge base with keyword filtering and pagination.

    Bug fix: the knowledge-base id is already in the URL path, but the
    handler only honoured a redundant 'kb_id' query argument and rejected
    requests without it. 'kb_id' now defaults to `dataset_id`, keeping the
    old query-arg behaviour intact.
    """
    kb_id = request.args.get("kb_id", dataset_id)
    if not kb_id:
        return get_json_result(
            data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
    # The caller must belong to a tenant that owns this knowledge base.
    tenants = UserTenantService.query(user_id=tenant_id)
    for tenant in tenants:
        if KnowledgebaseService.query(
                tenant_id=tenant.tenant_id, id=kb_id):
            break
    else:
        return get_json_result(
            data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
            retcode=RetCode.OPERATING_ERROR)
    keywords = request.args.get("keywords", "")

    page_number = int(request.args.get("page", 1))
    items_per_page = int(request.args.get("page_size", 15))
    orderby = request.args.get("orderby", "create_time")
    # NOTE(review): a client-supplied ?desc=... arrives as a string, which is
    # always truthy (even "false") — confirm whether DocumentService expects
    # a real bool here before changing it.
    desc = request.args.get("desc", True)
    try:
        docs, tol = DocumentService.get_by_kb_id(
            kb_id, page_number, items_per_page, orderby, desc, keywords)
        return get_json_result(data={"total": tol, "docs": docs})
    except Exception as e:
        return server_error_response(e)
|
139 |
+
|
140 |
+
|
141 |
+
@manager.route('/delete', methods=['DELETE'])
@token_required
def rm(tenant_id):
    """
    Delete one or more documents (query arg 'doc_id'; a single id or a list).

    For each document this removes the DB record, the knowledge-base file
    link, and the stored blob. Per-document failures are collected and
    reported together instead of aborting the whole batch.
    """
    req = request.args
    if "doc_id" not in req:
        return get_data_error_result(
            retmsg="doc_id is required")
    doc_ids = req["doc_id"]
    if isinstance(doc_ids, str):
        doc_ids = [doc_ids]
    root_folder = FileService.get_root_folder(tenant_id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, tenant_id)
    errors = []
    for doc_id in doc_ids:
        try:
            e, doc = DocumentService.get_by_id(doc_id)
            if not e:
                return get_data_error_result(retmsg="Document not found!")
            # Bug fix: a distinct name — the original reassigned the
            # `tenant_id` parameter inside the loop, silently changing it
            # for every subsequent iteration.
            doc_tenant_id = DocumentService.get_tenant_id(doc_id)
            if not doc_tenant_id:
                return get_data_error_result(retmsg="Tenant not found!")

            # Resolve the storage address before the DB rows disappear.
            b, n = File2DocumentService.get_minio_address(doc_id=doc_id)

            if not DocumentService.remove_document(doc, doc_tenant_id):
                return get_data_error_result(
                    retmsg="Database error (Document removal)!")

            f2d = File2DocumentService.get_by_document_id(doc_id)
            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
            File2DocumentService.delete_by_document_id(doc_id)

            # Remove the stored blob last, after all DB bookkeeping succeeded.
            STORAGE_IMPL.rm(b, n)
        except Exception as e:
            errors.append(str(e))

    if errors:
        # One error per line — consistent with the upload endpoint's
        # "\n".join(err) aggregation (previously concatenated with no
        # separator).
        return get_json_result(data=False, retmsg="\n".join(errors),
                               retcode=RetCode.SERVER_ERROR)

    return get_json_result(data=True, retmsg="success")
|
sdk/python/ragflow/__init__.py
CHANGED
@@ -5,4 +5,5 @@ __version__ = importlib.metadata.version("ragflow")
|
|
5 |
from .ragflow import RAGFlow
|
6 |
from .modules.dataset import DataSet
|
7 |
from .modules.assistant import Assistant
|
8 |
-
from .modules.session import Session
|
|
|
|
5 |
from .ragflow import RAGFlow
|
6 |
from .modules.dataset import DataSet
|
7 |
from .modules.assistant import Assistant
|
8 |
+
from .modules.session import Session
|
9 |
+
from .modules.document import Document
|
sdk/python/ragflow/modules/dataset.py
CHANGED
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
1 |
from .base import Base
|
2 |
|
3 |
|
@@ -46,3 +50,39 @@ class DataSet(Base):
|
|
46 |
res = res.json()
|
47 |
if res.get("retmsg") == "success": return True
|
48 |
raise Exception(res["retmsg"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, List
|
2 |
+
|
3 |
+
from .document import Document
|
4 |
+
|
5 |
from .base import Base
|
6 |
|
7 |
|
|
|
50 |
res = res.json()
|
51 |
if res.get("retmsg") == "success": return True
|
52 |
raise Exception(res["retmsg"])
|
53 |
+
|
54 |
+
def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
    """
    List the documents in the dataset, optionally filtering by keywords, with pagination support.

    Args:
        keywords (Optional[str]): A string of keywords to filter the documents. Defaults to None.
        offset (int): The starting point for pagination. Defaults to 0.
        limit (int): The maximum number of documents to return. Defaults to -1 (no limit).

    Returns:
        List[Document]: A list of Document objects.

    Raises:
        Exception: If the server's retmsg is anything other than "success".
    """
    # Construct the request payload for listing documents.
    # NOTE(review): the server-side handler reads "page"/"page_size" query
    # args, not "offset"/"limit" — as written these two parameters appear
    # to be ignored by the server; confirm the intended mapping.
    # "kb_id" is also redundant with the dataset id already in the URL.
    payload = {
        "kb_id": self.id,
        "keywords": keywords,
        "offset": offset,
        "limit": limit
    }

    # Send the request to the server to list documents
    res = self.get(f'/doc/dataset/{self.id}/documents', payload)
    res_json = res.json()

    # Handle response and error checking
    if res_json.get("retmsg") != "success":
        raise Exception(res_json.get("retmsg"))

    # Parse the document data from the response into SDK Document objects.
    documents = []
    for doc_data in res_json["data"].get("docs", []):
        doc = Document(self.rag, doc_data)
        documents.append(doc)

    return documents
|
sdk/python/ragflow/modules/document.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from .base import Base
|
3 |
+
|
4 |
+
|
5 |
+
|
6 |
+
class Document(Base):
    """
    Client-side representation of a server document record.

    Known attributes are initialised to defaults, then overwritten by the
    matching keys of the server response dict; unknown server keys are
    dropped before being handed to Base.
    """

    def __init__(self, rag, res_dict):
        # Defaults for every field the SDK models; anything else the server
        # returns is discarded below.
        self.id = ""
        self.name = ""
        self.thumbnail = None
        self.kb_id = None
        self.parser_method = ""
        self.parser_config = {"pages": [[1, 1000000]]}
        self.source_type = "local"
        self.type = ""
        self.created_by = ""
        self.size = 0
        self.token_num = 0
        self.chunk_num = 0
        self.progress = 0.0
        self.progress_msg = ""
        self.process_begin_at = None
        self.process_duration = 0.0
        # Drop keys the SDK does not model so Base only sees known fields.
        for k in list(res_dict.keys()):
            if k not in self.__dict__:
                res_dict.pop(k)
        super().__init__(rag, res_dict)

    def save(self) -> bool:
        """
        Push the document's current field values to the server.

        Returns:
            bool: True on success.

        Raises:
            Exception: If the server reports a failure.
        """
        # Bug fix: when parser_config still holds the plain-dict default set
        # in __init__ (i.e. the server response had no parser_config), it
        # has no .to_json() and the original call raised AttributeError.
        # Only call .to_json() when the value was wrapped by Base.
        parser_config = self.parser_config
        if hasattr(parser_config, "to_json"):
            parser_config = parser_config.to_json()
        res = self.post('/doc/save',
                        {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "kb_id": self.kb_id,
                         "parser_id": self.parser_method, "parser_config": parser_config,
                         "source_type": self.source_type, "type": self.type, "created_by": self.created_by,
                         "size": self.size, "token_num": self.token_num, "chunk_num": self.chunk_num,
                         "progress": self.progress, "progress_msg": self.progress_msg,
                         # "process_duation" (sic) — presumably matches the
                         # server-side field name; confirm before renaming.
                         "process_begin_at": self.process_begin_at, "process_duation": self.process_duration
                         })
        res = res.json()
        if res.get("retmsg") == "success":
            return True
        raise Exception(res["retmsg"])

    def delete(self) -> bool:
        """
        Delete this document on the server.

        Returns:
            bool: True on success.

        Raises:
            Exception: If the server reports a failure.
        """
        res = self.rm('/doc/delete',
                      {"doc_id": self.id})
        res = res.json()
        if res.get("retmsg") == "success":
            return True
        raise Exception(res["retmsg"])

    def download(self) -> bytes:
        """
        Download this document's raw content from the server.

        Returns:
            bytes: The downloaded document content.

        Raises:
            Exception: If the server responds with a non-200 status.
        """
        # NOTE(review): the second argument mixes an auth header with query
        # parameters in one dict — confirm how Base.get forwards it (the
        # "headers"/"stream" keys look like they are sent as params).
        res = self.get(f"/doc/{self.kb_id}/documents/{self.id}",
                       {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})

        # Check the response status code to ensure the request was successful
        if res.status_code == 200:
            return res.content
        raise Exception(
            f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
        )
|
sdk/python/ragflow/ragflow.py
CHANGED
@@ -19,7 +19,7 @@ import requests
|
|
19 |
|
20 |
from .modules.assistant import Assistant
|
21 |
from .modules.dataset import DataSet
|
22 |
-
|
23 |
|
24 |
class RAGFlow:
|
25 |
def __init__(self, user_key, base_url, version='v1'):
|
@@ -142,3 +142,32 @@ class RAGFlow:
|
|
142 |
result_list.append(Assistant(self, data))
|
143 |
return result_list
|
144 |
raise Exception(res["retmsg"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
from .modules.assistant import Assistant
|
21 |
from .modules.dataset import DataSet
|
22 |
+
from .modules.document import Document
|
23 |
|
24 |
class RAGFlow:
|
25 |
def __init__(self, user_key, base_url, version='v1'):
|
|
|
142 |
result_list.append(Assistant(self, data))
|
143 |
return result_list
|
144 |
raise Exception(res["retmsg"])
|
145 |
+
|
146 |
+
def create_document(self, ds: DataSet, name: str, blob: bytes) -> bool:
    """
    Upload a single in-memory file into dataset `ds`.

    Args:
        ds (DataSet): The target dataset.
        name (str): Filename to register the blob under.
        blob (bytes): Raw file content.

    Returns:
        bool: True on success.

    Raises:
        Exception: If the server reports a failure.
    """
    url = f"/doc/dataset/{ds.id}/documents/upload"
    files = {
        'file': (name, blob)
    }
    data = {
        'kb_id': ds.id
    }
    # Authenticate with the key of the client that owns the dataset.
    headers = {
        'Authorization': f"Bearer {ds.rag.user_key}"
    }

    response = requests.post(self.api_url + url, data=data, files=files,
                             headers=headers)

    # NOTE(review): response.json() raises on a non-JSON body (e.g. a proxy
    # error page) — confirm whether that failure mode is acceptable here.
    if response.status_code == 200 and response.json().get('retmsg') == 'success':
        return True
    # Bug fix: the original had an unreachable `return False` after this
    # unconditional raise; the dead statement has been removed.
    raise Exception(f"Upload failed: {response.json().get('retmsg')}")
|
167 |
+
def get_document(self, id: str = None, name: str = None) -> Document:
    """
    Fetch a single document's metadata from the server by id or by name.

    Raises:
        Exception: With the server's retmsg when the lookup fails.
    """
    response = self.get("/doc/infos", {"id": id, "name": name})
    body = response.json()
    if body.get("retmsg") != "success":
        raise Exception(body["retmsg"])
    return Document(self, body['data'])
|
173 |
+
|
sdk/python/test/ragflow.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data":null,"retcode":100,"retmsg":"TypeError(\"download_document() got an unexpected keyword argument 'tenant_id'\")"}
|
sdk/python/test/t_document.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ragflow import RAGFlow, DataSet, Document
|
2 |
+
|
3 |
+
from common import API_KEY, HOST_ADDRESS
|
4 |
+
from test_sdkbase import TestSdk
|
5 |
+
|
6 |
+
|
7 |
+
class TestDocument(TestSdk):
    """End-to-end SDK tests for document upload, fetch, update, download,
    listing and deletion against a running RAGFlow server."""

    def test_upload_document_with_success(self):
        """
        Test ingesting a document into a dataset with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Step 1: Create a new dataset
        ds = rag.create_dataset(name="God")

        # Ensure dataset creation was successful
        assert isinstance(ds, DataSet), f"Failed to create dataset, error: {ds}"
        assert ds.name == "God", "Dataset name does not match."

        # Step 2: Create a new document
        # The blob is the actual file content or a placeholder in this case
        name = "TestDocument.txt"
        blob = b"Sample document content for ingestion test."

        res = rag.create_document(ds, name=name, blob=blob)

        # Ensure document ingestion was successful
        assert res is True, f"Failed to create document, error: {res}"

    def test_get_detail_document_with_success(self):
        """
        Test getting a document's detail with success.

        Depends on test_upload_document_with_success having created
        "TestDocument.txt" first.
        """
        rag = RAGFlow(API_KEY, HOST_ADDRESS)
        doc = rag.get_document(name="TestDocument.txt")
        assert isinstance(doc, Document), f"Failed to get dataset, error: {doc}."
        assert doc.name == "TestDocument.txt", "Name does not match"

    def test_update_document_with_success(self):
        """
        Test updating a document with success.
        """
        rag = RAGFlow(API_KEY, HOST_ADDRESS)
        doc = rag.get_document(name="TestDocument.txt")
        if isinstance(doc, Document):
            # Change the parser method locally, then persist it.
            doc.parser_method = "manual"
            res = doc.save()
            assert res is True, f"Failed to update document, error: {res}"
        else:
            assert False, f"Failed to get document, error: {doc}"

    def test_download_document_with_success(self):
        """
        Test downloading a document with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Retrieve a document
        doc = rag.get_document(name="TestDocument.txt")

        # Check if the retrieved document is of type Document
        if isinstance(doc, Document):
            # Download the document content and save it to a file
            try:
                with open("ragflow.txt", "wb+") as file:
                    file.write(doc.download())
                # Print the document object for debugging
                print(doc)

                # NOTE(review): `assert True` can never fail — the real
                # success signal here is simply that download() did not raise.
                assert True, "Document downloaded successfully."
            except Exception as e:
                # If an error occurs, raise an assertion error
                assert False, f"Failed to download document, error: {str(e)}"
        else:
            # If the document retrieval fails, assert failure
            assert False, f"Failed to get document, error: {doc}"

    def test_list_all_documents_in_dataset_with_success(self):
        """
        Test list all documents into a dataset with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Step 1: Create a new dataset
        ds = rag.create_dataset(name="God2")

        # Ensure dataset creation was successful
        assert isinstance(ds, DataSet), f"Failed to create dataset, error: {ds}"
        assert ds.name == "God2", "Dataset name does not match."

        # Step 2: Create a new document
        # The blob is the actual file content or a placeholder in this case
        name1 = "Test Document111.txt"
        blob1 = b"Sample document content for ingestion test111."
        name2 = "Test Document222.txt"
        blob2 = b"Sample document content for ingestion test222."

        rag.create_document(ds, name=name1, blob=blob1)
        rag.create_document(ds, name=name2, blob=blob2)
        # Every listed item should be a Document instance.
        for d in ds.list_docs(keywords="test", offset=0, limit=12):
            assert isinstance(d, Document)
            print(d)

    def test_delete_documents_in_dataset_with_success(self):
        """
        Test deleting documents from a dataset with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Step 1: Create a new dataset
        ds = rag.create_dataset(name="God3")

        # Ensure dataset creation was successful
        assert isinstance(ds, DataSet), f"Failed to create dataset, error: {ds}"
        assert ds.name == "God3", "Dataset name does not match."

        # Step 2: Create a new document
        # The blob is the actual file content or a placeholder in this case
        name1 = "Test Document333.txt"
        blob1 = b"Sample document content for ingestion test333."
        name2 = "Test Document444.txt"
        blob2 = b"Sample document content for ingestion test444."
        name3='test.txt'
        path='test_data/test.txt'
        # NOTE(review): this file handle is never closed — consider
        # `with open(path, "rb") as f:` in a follow-up.
        rag.create_document(ds, name=name3, blob=open(path, "rb").read())
        rag.create_document(ds, name=name1, blob=blob1)
        rag.create_document(ds, name=name2, blob=blob2)
        for d in ds.list_docs(keywords="document", offset=0, limit=12):
            assert isinstance(d, Document)
            d.delete()
            print(d)
        # NOTE(review): documents matching "document" were deleted above, but
        # the emptiness check filters by "rag" — confirm the keyword is
        # intentional, otherwise this assertion may pass vacuously.
        remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
        assert len(remaining_docs) == 0, "Documents were not properly deleted."
|
140 |
+
|
141 |
+
|
142 |
+
|
143 |
+
|
144 |
+
|