JobSmithManipulation Kevin Hu committed on
Commit
ce45214
·
1 Parent(s): 5e72d47

update sdk document (#2374)

Browse files

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <[email protected]>

api/apps/sdk/doc.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+
3
+ from flask import request,send_file
4
+ from api.utils.api_utils import get_json_result, construct_json_result, server_error_response
5
+ from api.utils.api_utils import get_json_result, token_required, get_data_error_result
6
+ from api.db import FileType, ParserType, FileSource, TaskStatus
7
+ from api.db.db_models import File
8
+ from api.db.services.document_service import DocumentService
9
+ from api.db.services.file2document_service import File2DocumentService
10
+ from api.db.services.file_service import FileService
11
+ from api.db.services.knowledgebase_service import KnowledgebaseService
12
+ from api.db.services.user_service import TenantService, UserTenantService
13
+ from api.settings import RetCode
14
+ from api.utils.api_utils import construct_json_result, construct_error_response
15
+ from rag.utils.storage_factory import STORAGE_IMPL
16
+
17
+
18
@manager.route('/dataset/<dataset_id>/documents/upload', methods=['POST'])
@token_required
def upload(dataset_id, tenant_id):
    """Upload one or more files into the dataset ``dataset_id``.

    The files are read from the multipart field ``file``.

    Args:
        dataset_id: Dataset (knowledgebase) ID taken from the URL.
        tenant_id: Tenant on whose behalf the upload runs; presumably
            supplied by ``token_required`` -- TODO confirm.

    Returns:
        A JSON result with ``data=True`` on success, or ``data=False`` plus
        an error message and code when validation or the upload fails.
    """
    if 'file' not in request.files:
        return get_json_result(
            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)

    file_objs = request.files.getlist('file')
    # Reject the whole request if any part was submitted without a filename.
    if any(file_obj.filename == '' for file_obj in file_objs):
        return get_json_result(
            data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)

    e, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not e:
        # Answer with a structured error, like the other views in this
        # module, instead of raising out of the request handler.
        return get_json_result(
            data=False,
            retmsg=f"Can't find the knowledgebase with ID {dataset_id}!",
            retcode=RetCode.DATA_ERROR)

    err, _ = FileService.upload_document(kb, file_objs, tenant_id)
    if err:
        return get_json_result(
            data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
    return get_json_result(data=True)
37
+
38
+
39
@manager.route('/infos', methods=['GET'])
@token_required
def docinfos(tenant_id):
    """Return the JSON description of one document.

    The document is selected by the ``id`` query argument or, when ``id`` is
    absent, by the ``name`` query argument.

    Args:
        tenant_id: Accepted from ``token_required``; not used by this view.

    Returns:
        A JSON result holding ``doc.to_json()``, or a data-error result when
        no selector is given or the document does not exist.
    """
    req = request.args
    if "id" in req:
        doc_id = req["id"]
    elif "name" in req:
        doc_id = DocumentService.get_doc_id_by_doc_name(req["name"])
    else:
        # Without a selector the view used to fall through and return None.
        return get_data_error_result(retmsg="Either 'id' or 'name' is required!")

    e, doc = DocumentService.get_by_id(doc_id)
    if not e:
        # Guard the lookup result before dereferencing ``doc``.
        return get_data_error_result(retmsg="Document not found!")
    return get_json_result(data=doc.to_json())
52
+
53
+
54
@manager.route('/save', methods=['POST'])
@token_required
def save_doc(tenant_id):
    """Update a document with the fields of the JSON request body.

    The target document is selected by ``id``; ``name`` is used only when no
    ``id`` is supplied, so a request that changes the name still reaches the
    intended document.

    Args:
        tenant_id: Accepted from ``token_required``; not used by this view.

    Returns:
        A JSON result with ``retmsg="success"`` and the updated row count, a
        404 result when nothing was updated, or an error result when no
        selector is given or the update raises.
    """
    req = request.json or {}  # Expecting JSON input
    if "id" in req:
        doc_id = req["id"]
    elif "name" in req:
        doc_id = DocumentService.get_doc_id_by_doc_name(req["name"])
    else:
        # Previously ``doc_id`` was left unbound here, causing a NameError.
        return get_json_result(
            data=False, retmsg="Either 'id' or 'name' is required!",
            retcode=RetCode.ARGUMENT_ERROR)

    # Call the update method with the provided id and data
    try:
        num = DocumentService.update_by_id(doc_id, req)
        if num > 0:
            return get_json_result(retmsg="success", data={"updated_count": num})
        return get_json_result(retcode=404, retmsg="Document not found")
    except Exception as e:
        # Carry an error code so clients cannot mistake a failure for success.
        return get_json_result(
            data=False, retmsg=f"Error occurred: {str(e)}",
            retcode=RetCode.SERVER_ERROR)
73
+
74
+
75
@manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
@token_required
def download_document(dataset_id, document_id, tenant_id=None):
    """Send the stored file behind ``document_id`` as an attachment.

    Args:
        dataset_id: Dataset ID from the URL; must refer to an existing dataset.
        document_id: Document ID from the URL.
        tenant_id: Passed as a keyword argument by ``token_required``. The
            view does not use it, but it must be accepted: without it the
            call fails with ``TypeError: download_document() got an
            unexpected keyword argument 'tenant_id'``.

    Returns:
        The file as an ``application/octet-stream`` attachment named after
        the document, or a JSON error result when the dataset or document is
        missing, the stored file is empty, or any exception is raised.
    """
    try:
        # Check whether there is this dataset
        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
        if not exist:
            return construct_json_result(code=RetCode.DATA_ERROR,
                                         message=f"This dataset '{dataset_id}' cannot be found!")

        # Check whether there is this document
        exist, document = DocumentService.get_by_id(document_id)
        if not exist:
            return construct_json_result(message=f"This document '{document_id}' cannot be found!",
                                         code=RetCode.ARGUMENT_ERROR)

        # The process of downloading
        doc_id, doc_location = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
        file_stream = STORAGE_IMPL.get(doc_id, doc_location)
        if not file_stream:
            return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)

        file = BytesIO(file_stream)

        # Use send_file with a proper filename and MIME type
        return send_file(
            file,
            as_attachment=True,
            download_name=document.name,
            mimetype='application/octet-stream'  # Set a default MIME type
        )

    # Error
    except Exception as e:
        return construct_error_response(e)
110
+
111
@manager.route('/dataset/<dataset_id>/documents', methods=['GET'])
@token_required
def list_docs(dataset_id, tenant_id):
    """List the documents of a dataset, with keyword filter and paging.

    Query arguments: ``kb_id`` (optional; defaults to the ``dataset_id`` from
    the URL), ``keywords``, ``page`` (default 1), ``page_size`` (default 15),
    ``orderby`` (default ``create_time``) and ``desc`` (default true).

    Args:
        dataset_id: Dataset ID from the URL.
        tenant_id: Identifier used to look up the tenants that may own the
            dataset; presumably supplied by ``token_required`` -- TODO confirm.

    Returns:
        A JSON result ``{"total": ..., "docs": ...}``, or an error result for
        bad arguments, a dataset the caller does not own, or a server failure.
    """
    # Fall back to the URL's dataset ID so callers need not repeat it.
    kb_id = request.args.get("kb_id") or dataset_id
    if not kb_id:
        return get_json_result(
            data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)

    tenants = UserTenantService.query(user_id=tenant_id)
    for tenant in tenants:
        if KnowledgebaseService.query(
                tenant_id=tenant.tenant_id, id=kb_id):
            break
    else:
        # No tenant of this user owns the dataset.
        return get_json_result(
            data=False, retmsg='Only owner of knowledgebase authorized for this operation.',
            retcode=RetCode.OPERATING_ERROR)

    keywords = request.args.get("keywords", "")
    try:
        page_number = int(request.args.get("page", 1))
        items_per_page = int(request.args.get("page_size", 15))
    except ValueError:
        return get_json_result(
            data=False, retmsg='"page" and "page_size" must be integers.',
            retcode=RetCode.ARGUMENT_ERROR)
    orderby = request.args.get("orderby", "create_time")

    # Query-string values are strings, so "false" would otherwise be truthy.
    desc_arg = request.args.get("desc")
    desc = True if desc_arg is None else desc_arg.strip().lower() not in ("", "false", "0", "no")

    try:
        docs, tol = DocumentService.get_by_kb_id(
            kb_id, page_number, items_per_page, orderby, desc, keywords)
        return get_json_result(data={"total": tol, "docs": docs})
    except Exception as e:
        return server_error_response(e)
139
+
140
+
141
@manager.route('/delete', methods=['DELETE'])
@token_required
def rm(tenant_id):
    """Delete one or more documents together with their stored files.

    The documents are named by the ``doc_id`` query argument, which may be
    repeated to delete several documents in one request.

    Args:
        tenant_id: Tenant whose root folder is initialised before deletion;
            presumably supplied by ``token_required`` -- TODO confirm.

    Returns:
        A JSON result with ``data=True`` and ``retmsg="success"`` when every
        deletion succeeded. A missing document, a missing tenant or a failed
        database removal ends the request immediately with a data-error
        result; exceptions raised per document are collected and reported
        together with ``RetCode.SERVER_ERROR``.
    """
    req = request.args
    if "doc_id" not in req:
        return get_data_error_result(
            retmsg="doc_id is required")
    # ``req["doc_id"]`` only ever yields the first value as a string;
    # ``getlist`` returns every occurrence of the argument.
    doc_ids = req.getlist("doc_id")

    root_folder = FileService.get_root_folder(tenant_id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, tenant_id)

    errors = []
    for doc_id in doc_ids:
        try:
            e, doc = DocumentService.get_by_id(doc_id)
            if not e:
                return get_data_error_result(retmsg="Document not found!")
            # Use the document's own tenant for removal, without shadowing
            # the caller's ``tenant_id``.
            doc_tenant_id = DocumentService.get_tenant_id(doc_id)
            if not doc_tenant_id:
                return get_data_error_result(retmsg="Tenant not found!")

            # Resolve the storage address first: the file-to-document link
            # it relies on is deleted further down.
            b, n = File2DocumentService.get_minio_address(doc_id=doc_id)

            if not DocumentService.remove_document(doc, doc_tenant_id):
                return get_data_error_result(
                    retmsg="Database error (Document removal)!")

            f2d = File2DocumentService.get_by_document_id(doc_id)
            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
            File2DocumentService.delete_by_document_id(doc_id)

            STORAGE_IMPL.rm(b, n)
        except Exception as e:
            errors.append(str(e))

    if errors:
        return get_json_result(data=False, retmsg="\n".join(errors), retcode=RetCode.SERVER_ERROR)

    return get_json_result(data=True, retmsg="success")
sdk/python/ragflow/__init__.py CHANGED
@@ -5,4 +5,5 @@ __version__ = importlib.metadata.version("ragflow")
5
  from .ragflow import RAGFlow
6
  from .modules.dataset import DataSet
7
  from .modules.assistant import Assistant
8
- from .modules.session import Session
 
 
5
  from .ragflow import RAGFlow
6
  from .modules.dataset import DataSet
7
  from .modules.assistant import Assistant
8
+ from .modules.session import Session
9
+ from .modules.document import Document
sdk/python/ragflow/modules/dataset.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  from .base import Base
2
 
3
 
@@ -46,3 +50,39 @@ class DataSet(Base):
46
  res = res.json()
47
  if res.get("retmsg") == "success": return True
48
  raise Exception(res["retmsg"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List
2
+
3
+ from .document import Document
4
+
5
  from .base import Base
6
 
7
 
 
50
  res = res.json()
51
  if res.get("retmsg") == "success": return True
52
  raise Exception(res["retmsg"])
53
+
54
def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
    """
    Fetch the documents of this dataset from the server.

    Args:
        keywords (Optional[str]): Keyword filter sent to the server. Defaults to None.
        offset (int): Pagination start sent to the server. Defaults to 0.
        limit (int): Maximum number of documents requested. Defaults to -1.

    Returns:
        List[Document]: One Document per entry of the response's "docs" list.

    Raises:
        Exception: If the response's "retmsg" is not "success"; the exception
            carries that message.
    """
    query = dict(kb_id=self.id, keywords=keywords, offset=offset, limit=limit)
    body = self.get(f'/doc/dataset/{self.id}/documents', query).json()

    status = body.get("retmsg")
    if status != "success":
        raise Exception(status)

    # Wrap every returned record in a Document bound to the same client.
    return [Document(self.rag, item) for item in body["data"].get("docs", [])]
sdk/python/ragflow/modules/document.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from .base import Base
3
+
4
+
5
+
6
class Document(Base):
    """Client-side view of a document stored in a dataset.

    Attributes are initialised to the defaults below and then overwritten
    from the server's record by ``Base.__init__``.
    """

    def __init__(self, rag, res_dict):
        """Build a document from a server record.

        Args:
            rag: Client object passed through to ``Base.__init__``.
            res_dict: Mapping of field names to values. Keys that do not
                match one of the attributes declared here are ignored. The
                mapping itself is left unmodified.
        """
        self.id = ""
        self.name = ""
        self.thumbnail = None
        self.kb_id = None
        self.parser_method = ""
        self.parser_config = {"pages": [[1, 1000000]]}
        self.source_type = "local"
        self.type = ""
        self.created_by = ""
        self.size = 0
        self.token_num = 0
        self.chunk_num = 0
        self.progress = 0.0
        self.progress_msg = ""
        self.process_begin_at = None
        self.process_duration = 0.0
        # Filter into a copy instead of popping keys from the caller's dict.
        known = {k: v for k, v in res_dict.items() if k in self.__dict__}
        super().__init__(rag, known)

    def save(self) -> bool:
        """
        Save the document details to the server.

        Returns:
            True when the server answers with ``retmsg == "success"``.

        Raises:
            Exception: With the server's ``retmsg`` otherwise.
        """
        # The default parser_config is a plain dict with no ``to_json``;
        # only convert when the attribute provides that method.
        parser_config = self.parser_config
        if hasattr(parser_config, "to_json"):
            parser_config = parser_config.to_json()

        res = self.post('/doc/save',
                        {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "kb_id": self.kb_id,
                         "parser_id": self.parser_method, "parser_config": parser_config,
                         "source_type": self.source_type, "type": self.type, "created_by": self.created_by,
                         "size": self.size, "token_num": self.token_num, "chunk_num": self.chunk_num,
                         "progress": self.progress, "progress_msg": self.progress_msg,
                         "process_begin_at": self.process_begin_at, "process_duation": self.process_duration
                         })
        res = res.json()
        if res.get("retmsg") == "success":
            return True
        raise Exception(res["retmsg"])

    def delete(self) -> bool:
        """
        Delete the document from the server.

        Returns:
            True when the server answers with ``retmsg == "success"``.

        Raises:
            Exception: With the server's ``retmsg`` otherwise.
        """
        res = self.rm('/doc/delete',
                      {"doc_id": self.id})
        res = res.json()
        if res.get("retmsg") == "success":
            return True
        raise Exception(res["retmsg"])

    def download(self) -> bytes:
        """
        Download the document content from the server using the Flask API.

        :return: The downloaded document content in bytes.
        :raises Exception: If the status code is not 200, or the server
            answered with a JSON payload instead of the file.
        """
        # NOTE(review): "headers" and "stream" are passed in the same mapping
        # as "id" and "name"; confirm how Base.get forwards this mapping.
        res = self.get(f"/doc/{self.kb_id}/documents/{self.id}",
                       {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})

        # The server sends files as application/octet-stream and can report
        # errors as JSON with status 200, so a JSON body is not file content.
        content_type = res.headers.get("Content-Type", "")
        if res.status_code == 200 and not content_type.startswith("application/json"):
            # Return the document content as bytes
            return res.content

        # Handle the error and raise an exception
        raise Exception(
            f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
        )
sdk/python/ragflow/ragflow.py CHANGED
@@ -19,7 +19,7 @@ import requests
19
 
20
  from .modules.assistant import Assistant
21
  from .modules.dataset import DataSet
22
-
23
 
24
  class RAGFlow:
25
  def __init__(self, user_key, base_url, version='v1'):
@@ -142,3 +142,32 @@ class RAGFlow:
142
  result_list.append(Assistant(self, data))
143
  return result_list
144
  raise Exception(res["retmsg"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  from .modules.assistant import Assistant
21
  from .modules.dataset import DataSet
22
+ from .modules.document import Document
23
 
24
  class RAGFlow:
25
  def __init__(self, user_key, base_url, version='v1'):
 
142
  result_list.append(Assistant(self, data))
143
  return result_list
144
  raise Exception(res["retmsg"])
145
+
146
def create_document(self, ds: DataSet, name: str, blob: bytes) -> bool:
    """Upload ``blob`` as a new document called ``name`` into dataset ``ds``.

    Args:
        ds: Target dataset; its ``id`` selects the upload URL and its
            ``rag.user_key`` is sent as the bearer token.
        name: File name reported to the server.
        blob: Raw file content.

    Returns:
        True when the server answers 200 with ``retmsg == "success"``.

    Raises:
        Exception: With the server's ``retmsg`` (or the raw response text
            when the body is not JSON) in every other case.
    """
    url = f"/doc/dataset/{ds.id}/documents/upload"
    files = {
        'file': (name, blob)
    }
    data = {
        'kb_id': ds.id
    }
    headers = {
        'Authorization': f"Bearer {ds.rag.user_key}"
    }

    response = requests.post(self.api_url + url, data=data, files=files,
                             headers=headers)

    # Decode once; a non-JSON body (e.g. a proxy error page) must not hide
    # the failure behind a decoding error.
    try:
        retmsg = response.json().get('retmsg')
    except ValueError:
        retmsg = response.text

    if response.status_code == 200 and retmsg == 'success':
        return True
    raise Exception(f"Upload failed: {retmsg}")
167
def get_document(self, id: str = None, name: str = None) -> Document:
    """Fetch a single document's details from ``/doc/infos``.

    Args:
        id: Document ID to look up.
        name: Document name to look up.

    Returns:
        A Document built from the response's ``data`` field.

    Raises:
        Exception: With the server's ``retmsg`` when it is not "success".
    """
    payload = self.get("/doc/infos", {"id": id, "name": name}).json()
    if payload.get("retmsg") != "success":
        raise Exception(payload["retmsg"])
    return Document(self, payload['data'])
173
+
sdk/python/test/ragflow.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":null,"retcode":100,"retmsg":"TypeError(\"download_document() got an unexpected keyword argument 'tenant_id'\")"}
sdk/python/test/t_document.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ragflow import RAGFlow, DataSet, Document
2
+
3
+ from common import API_KEY, HOST_ADDRESS
4
+ from test_sdkbase import TestSdk
5
+
6
+
7
class TestDocument(TestSdk):
    """End-to-end tests of the document API against a running server.

    Several tests look up "TestDocument.txt", which is created by
    ``test_upload_document_with_success``.
    """

    def test_upload_document_with_success(self):
        """
        Test ingesting a document into a dataset with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Step 1: Create a new dataset
        ds = rag.create_dataset(name="God")

        # Ensure dataset creation was successful
        assert isinstance(ds, DataSet), f"Failed to create dataset, error: {ds}"
        assert ds.name == "God", "Dataset name does not match."

        # Step 2: Create a new document
        # The blob is the actual file content or a placeholder in this case
        name = "TestDocument.txt"
        blob = b"Sample document content for ingestion test."

        res = rag.create_document(ds, name=name, blob=blob)

        # Ensure document ingestion was successful
        assert res is True, f"Failed to create document, error: {res}"

    def test_get_detail_document_with_success(self):
        """
        Test getting a document's detail with success
        """
        rag = RAGFlow(API_KEY, HOST_ADDRESS)
        doc = rag.get_document(name="TestDocument.txt")
        assert isinstance(doc, Document), f"Failed to get document, error: {doc}."
        assert doc.name == "TestDocument.txt", "Name does not match"

    def test_update_document_with_success(self):
        """
        Test updating a document with success.
        """
        rag = RAGFlow(API_KEY, HOST_ADDRESS)
        doc = rag.get_document(name="TestDocument.txt")
        assert isinstance(doc, Document), f"Failed to get document, error: {doc}"

        doc.parser_method = "manual"
        res = doc.save()
        assert res is True, f"Failed to update document, error: {res}"

    def test_download_document_with_success(self):
        """
        Test downloading a document with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Retrieve a document
        doc = rag.get_document(name="TestDocument.txt")
        assert isinstance(doc, Document), f"Failed to get document, error: {doc}"

        # Let a download failure propagate so the test runner reports the
        # real exception and traceback.
        content = doc.download()
        assert isinstance(content, bytes), "Downloaded content is not bytes."

        # Save the downloaded content to a file
        with open("ragflow.txt", "wb+") as file:
            file.write(content)
        # Print the document object for debugging
        print(doc)

    def test_list_all_documents_in_dataset_with_success(self):
        """
        Test list all documents into a dataset with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Step 1: Create a new dataset
        ds = rag.create_dataset(name="God2")

        # Ensure dataset creation was successful
        assert isinstance(ds, DataSet), f"Failed to create dataset, error: {ds}"
        assert ds.name == "God2", "Dataset name does not match."

        # Step 2: Create a new document
        # The blob is the actual file content or a placeholder in this case
        name1 = "Test Document111.txt"
        blob1 = b"Sample document content for ingestion test111."
        name2 = "Test Document222.txt"
        blob2 = b"Sample document content for ingestion test222."

        rag.create_document(ds, name=name1, blob=blob1)
        rag.create_document(ds, name=name2, blob=blob2)
        for d in ds.list_docs(keywords="test", offset=0, limit=12):
            assert isinstance(d, Document)
            print(d)

    def test_delete_documents_in_dataset_with_success(self):
        """
        Test deleting documents from a dataset with success.
        """
        # Initialize RAGFlow instance
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Step 1: Create a new dataset
        ds = rag.create_dataset(name="God3")

        # Ensure dataset creation was successful
        assert isinstance(ds, DataSet), f"Failed to create dataset, error: {ds}"
        assert ds.name == "God3", "Dataset name does not match."

        # Step 2: Create a new document
        # The blob is the actual file content or a placeholder in this case
        name1 = "Test Document333.txt"
        blob1 = b"Sample document content for ingestion test333."
        name2 = "Test Document444.txt"
        blob2 = b"Sample document content for ingestion test444."
        name3 = 'test.txt'
        path = 'test_data/test.txt'
        # Close the fixture file deterministically.
        with open(path, "rb") as fixture:
            rag.create_document(ds, name=name3, blob=fixture.read())
        rag.create_document(ds, name=name1, blob=blob1)
        rag.create_document(ds, name=name2, blob=blob2)
        for d in ds.list_docs(keywords="document", offset=0, limit=12):
            assert isinstance(d, Document)
            d.delete()
            print(d)
        # Query with the same keyword used for deletion; a keyword that
        # matches nothing would pass even if the deletes had failed.
        remaining_docs = ds.list_docs(keywords="document", offset=0, limit=12)
        assert len(remaining_docs) == 0, "Documents were not properly deleted."
140
+
141
+
142
+
143
+
144
+