cecilia-uu committed
Commit 6674a75 · 1 Parent(s): 970e973

API: start parsing (#1377)

### What problem does this PR solve?

Make documents start parsing: this PR adds server endpoints and Python SDK methods that trigger parsing for a single document or for multiple (or all) documents in a dataset. A usage sketch follows below.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

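For context, here is a minimal end-to-end sketch of the new feature, using the two SDK methods this PR adds (`start_parsing_document` and `start_parsing_documents`). `API_KEY` and `HOST_ADDRESS` are placeholders, and `create_dataset`/`upload_local_file` are pre-existing SDK calls used the same way in the tests below:

```python
from ragflow import RAGFlow  # import path assumed to match the SDK tests

API_KEY = "<your-api-key>"          # placeholder credential
HOST_ADDRESS = "http://127.0.0.1"   # placeholder server address

ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
created = ragflow.create_dataset("parsing_demo")
dataset_id = created["data"]["dataset_id"]

uploaded = ragflow.upload_local_file(dataset_id, ["test_data/lol.txt"])
doc_id = uploaded["data"][0]["id"]

# new in this PR: parse a single document ...
res = ragflow.start_parsing_document(dataset_id, doc_id)
print(res["code"], res["message"])

# ... or several documents; doc_ids=None parses every document in the dataset
res = ragflow.start_parsing_documents(dataset_id, doc_ids=[doc_id])
print(res["code"], res["message"])
```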
api/apps/dataset_api.py CHANGED
@@ -18,30 +18,35 @@ import re
 import warnings
 from io import BytesIO
 
+from elasticsearch_dsl import Q
 from flask import request, send_file
 from flask_login import login_required, current_user
 from httpx import HTTPError
-from minio import S3Error
 
 from api.contants import NAME_LENGTH_LIMIT
-from api.db import FileType, ParserType, FileSource
+from api.db import FileType, ParserType, FileSource, TaskStatus
 from api.db import StatusEnum
-from api.db.db_models import File
+from api.db.db_models import File, Task
 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.db.services.task_service import TaskService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
+from rag.nlp import search
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 
 MAXIMUM_OF_UPLOADING_FILES = 256
 
+
 # ------------------------------ create a dataset ---------------------------------------
 
 @manager.route("/", methods=["POST"])
@@ -116,6 +121,7 @@ def create_dataset():
     except Exception as e:
         return construct_error_response(e)
 
+
 # -----------------------------list datasets-------------------------------------------------------
 
 @manager.route("/", methods=["GET"])
@@ -135,6 +141,7 @@ def list_datasets():
     except HTTPError as http_err:
         return construct_json_result(http_err)
 
+
 # ---------------------------------delete a dataset ----------------------------
 
 @manager.route("/<dataset_id>", methods=["DELETE"])
@@ -162,13 +169,15 @@ def remove_dataset(dataset_id):
 
         # delete the dataset
         if not KnowledgebaseService.delete_by_id(dataset_id):
-            return construct_json_result(code=RetCode.DATA_ERROR, message="There was an error during the dataset removal process. "
-                                         "Please check the status of the RAGFlow server and try the removal again.")
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message="There was an error during the dataset removal process. "
+                                                 "Please check the status of the RAGFlow server and try the removal again.")
         # success
         return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully")
     except Exception as e:
         return construct_error_response(e)
 
+
 # ------------------------------ get details of a dataset ----------------------------------------
 
 @manager.route("/<dataset_id>", methods=["GET"])
@@ -182,6 +191,7 @@ def get_dataset(dataset_id):
     except Exception as e:
         return construct_json_result(e)
 
+
 # ------------------------------ update a dataset --------------------------------------------
 
 @manager.route("/<dataset_id>", methods=["PUT"])
@@ -209,8 +219,9 @@ def update_dataset(dataset_id):
         if name.lower() != dataset.name.lower() \
                 and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id,
                                                    status=StatusEnum.VALID.value)) > 1:
-            return construct_json_result(code=RetCode.DATA_ERROR, message=f"The name: {name.lower()} is already used by other "
-                                         f"datasets. Please choose a different name.")
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"The name: {name.lower()} is already used by other "
+                                                 f"datasets. Please choose a different name.")
 
         dataset_updating_data = {}
         chunk_num = req.get("chunk_num")
@@ -222,17 +233,21 @@ def update_dataset(dataset_id):
             if chunk_num == 0:
                 dataset_updating_data["embd_id"] = req["embedding_model_id"]
             else:
-                construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this "
+                return construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this "
                                                                        "dataset, so you cannot change the embedding "
                                                                        "model.")
         # only if chunk_num is 0, the user can update the chunk_method
-        if req.get("chunk_method"):
-            if chunk_num == 0:
-                dataset_updating_data['parser_id'] = req["chunk_method"]
-            else:
+        if "chunk_method" in req:
+            type_value = req["chunk_method"]
+            if is_illegal_value_for_enum(type_value, ParserType):
+                return construct_json_result(message=f"Illegal value {type_value} for 'chunk_method' field.",
+                                             code=RetCode.DATA_ERROR)
+            if chunk_num != 0:
                 construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document "
                                                                        "in this dataset, so you cannot "
                                                                        "change the chunk method.")
+            dataset_updating_data["parser_id"] = req["template_type"]
+
         # convert the photo parameter to avatar
         if req.get("photo"):
             dataset_updating_data["avatar"] = req["photo"]
@@ -265,6 +280,7 @@ def update_dataset(dataset_id):
     except Exception as e:
         return construct_error_response(e)
 
+
 # --------------------------------content management ----------------------------------------------
 
 # ----------------------------upload files-----------------------------------------------------
@@ -339,9 +355,10 @@ def upload_documents(dataset_id):
                     location += "_"
 
                 blob = file.read()
+
                 # the content is empty, raising a warning
                 if blob == b'':
-                    warnings.warn(f"[WARNING]: The file {filename} is empty.")
+                    warnings.warn(f"[WARNING]: The content of the file {filename} is empty.")
 
                 MINIO.put(dataset_id, location, blob)
 
@@ -453,6 +470,7 @@ def list_documents(dataset_id):
     except Exception as e:
         return construct_error_response(e)
 
+
 # ----------------------------update: enable rename-----------------------------------------------------
 @manager.route("/<dataset_id>/documents/<document_id>", methods=["PUT"])
 @login_required
@@ -555,6 +573,7 @@ def update_document(dataset_id, document_id):
 def is_illegal_value_for_enum(value, enum_class):
     return value not in enum_class.__members__.values()
 
+
 # ----------------------------download a file-----------------------------------------------------
 @manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
 @login_required
@@ -563,7 +582,8 @@ def download_document(dataset_id, document_id):
         # Check whether there is this dataset
         exist, _ = KnowledgebaseService.get_by_id(dataset_id)
         if not exist:
-            return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!")
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"This dataset '{dataset_id}' cannot be found!")
 
         # Check whether there is this document
        exist, document = DocumentService.get_by_id(document_id)
@@ -591,8 +611,142 @@ def download_document(dataset_id, document_id):
     except Exception as e:
         return construct_error_response(e)
 
-# ----------------------------start parsing-----------------------------------------------------
 
+# ----------------------------start parsing a document-----------------------------------------------------
+# helper method for parsing
+def dummy(prog=None, msg=""):
+    pass
+
+
+def doc_parse(binary, doc_name, parser_name, tenant_id):
+    match parser_name:
+        case "book":
+            book.chunk(doc_name, binary=binary, callback=dummy)
+        case "laws":
+            laws.chunk(doc_name, binary=binary, callback=dummy)
+        case "manual":
+            manual.chunk(doc_name, binary=binary, callback=dummy)
+        case "naive":
+            # the default mode, displayed as "general" in the front-end
+            naive.chunk(doc_name, binary=binary, callback=dummy)
+        case "one":
+            one.chunk(doc_name, binary=binary, callback=dummy)
+        case "paper":
+            paper.chunk(doc_name, binary=binary, callback=dummy)
+        case "picture":
+            picture.chunk(doc_name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy)
+        case "presentation":
+            presentation.chunk(doc_name, binary=binary, callback=dummy)
+        case "qa":
+            qa.chunk(doc_name, binary=binary, callback=dummy)
+        case "resume":
+            resume.chunk(doc_name, binary=binary, callback=dummy)
+        case "table":
+            table.chunk(doc_name, binary=binary, callback=dummy)
+        case _:
+            return False
+
+    return True
+
+
+@manager.route("/<dataset_id>/documents/<document_id>/status", methods=["POST"])
+@login_required
+def parse_document(dataset_id, document_id):
+    try:
+        # validate the dataset
+        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
+        if not exist:
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"This dataset '{dataset_id}' cannot be found!")
+        message = ""
+        res = get_message_during_parsing_document(document_id, message)
+        if isinstance(res, str):
+            message += res
+            return construct_json_result(code=RetCode.SUCCESS, message=message)
+        else:
+            return res
+
+    except Exception as e:
+        return construct_error_response(e)
+
+
+# ----------------------------start parsing documents-----------------------------------------------------
+@manager.route("/<dataset_id>/documents/status", methods=["POST"])
+@login_required
+def parse_documents(dataset_id):
+    doc_ids = request.json["doc_ids"]
+    try:
+        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
+        if not exist:
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"This dataset '{dataset_id}' cannot be found!")
+
+        def process(doc_ids):
+            message = ""
+            # accumulate a status message for each document
+            for id in doc_ids:
+                res = get_message_during_parsing_document(id, message)
+                if isinstance(res, str):
+                    message += res
+                else:
+                    return res
+            return construct_json_result(data=True, code=RetCode.SUCCESS, message=message)
+
+        # parse the specified documents, or all documents in the dataset
+        if doc_ids:
+            return process(doc_ids)
+        else:
+            # documents inside the dataset
+            docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time",
+                                                                    True, "")
+            doc_ids = [doc["id"] for doc in docs]
+            return process(doc_ids)
+
+    except Exception as e:
+        return construct_error_response(e)
+
+
+# helper method: returns a status message string, or an error response when parsing fails early
+def get_message_during_parsing_document(id, message):
+    try:
+        # Check whether there is this document
+        exist, document = DocumentService.get_by_id(id)
+        if not exist:
+            return construct_json_result(message=f"This document '{id}' cannot be found!",
+                                         code=RetCode.ARGUMENT_ERROR)
+
+        tenant_id = DocumentService.get_tenant_id(id)
+        if not tenant_id:
+            return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR)
+
+        info = {"run": "1", "progress": 0}
+        info["progress_msg"] = ""
+        info["chunk_num"] = 0
+        info["token_num"] = 0
+
+        DocumentService.update_by_id(id, info)
+
+        ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+        _, doc_attributes = DocumentService.get_by_id(id)
+        doc_attributes = doc_attributes.to_dict()
+        doc_id = doc_attributes["id"]
+
+        bucket, doc_name = File2DocumentService.get_minio_address(doc_id=doc_id)
+        binary = MINIO.get(bucket, doc_name)
+        parser_name = doc_attributes["parser_id"]
+        if binary:
+            res = doc_parse(binary, doc_name, parser_name, tenant_id)
+            if res is False:
+                message += f"The parser id: {parser_name} of the document {doc_id} is not supported; "
+        else:
+            message += f"Empty data in the document: {doc_name}; "
+        # failed in parsing
+        if doc_attributes["status"] == TaskStatus.FAIL.value:
+            message += f"Failed in parsing the document: {doc_id}; "
+        return message
+    except Exception as e:
+        return construct_error_response(e)
 # ----------------------------stop parsing-----------------------------------------------------
 
 # ----------------------------show the status of the file-----------------------------------------------------
@@ -610,6 +764,3 @@ def download_document(dataset_id, document_id):
 # ----------------------------get a specific chunk-----------------------------------------------------
 
 # ----------------------------retrieval test-----------------------------------------------------
-
-
-
 
 
 
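The server side above adds two routes: `POST /<dataset_id>/documents/<document_id>/status` starts parsing one document, and `POST /<dataset_id>/documents/status` starts parsing the documents listed in `doc_ids` (falling back to every document in the dataset when the list is empty). Both reset the document's progress fields, clear its chunks from Elasticsearch, fetch the binary from MinIO, and dispatch to a parser via `doc_parse`. A hedged sketch of calling these routes directly over HTTP follows; the URL prefix and auth header are assumptions, since the blueprint mounting and auth scheme are not shown in this diff:

```python
import requests

# Assumed values: the real prefix and auth scheme depend on how the
# dataset blueprint is mounted and on RAGFlow's auth setup.
BASE = "http://127.0.0.1:9380/api/v1/dataset"     # hypothetical mount point
HEADERS = {"Authorization": "Bearer <API_KEY>"}   # hypothetical auth header

dataset_id = "<dataset_id>"
document_id = "<document_id>"

# start parsing one document
r = requests.post(f"{BASE}/{dataset_id}/documents/{document_id}/status", headers=HEADERS)
print(r.json())

# start parsing selected documents; {"doc_ids": None} parses the whole dataset
r = requests.post(f"{BASE}/{dataset_id}/documents/status",
                  headers=HEADERS, json={"doc_ids": [document_id]})
print(r.json())
```

Note the return convention of `get_message_during_parsing_document`: it returns a plain string on the happy path and a constructed error response otherwise, which is why callers branch on `isinstance(res, str)`. Per-document problems (unsupported parser, empty file, failed task) are appended to the message rather than failing the whole request.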
sdk/python/ragflow/ragflow.py CHANGED
@@ -142,7 +142,19 @@ class RAGFlow:
         with open(file_path, "wb") as file:
             file.write(content)
         return {"code": RetCode.SUCCESS, "data": content}
+
     # ----------------------------start parsing-----------------------------------------------------
+    def start_parsing_document(self, dataset_id, document_id):
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status"
+        res = requests.post(endpoint, headers=self.authorization_header)
+
+        return res.json()
+
+    def start_parsing_documents(self, dataset_id, doc_ids=None):
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/status"
+        res = requests.post(endpoint, headers=self.authorization_header, json={"doc_ids": doc_ids})
+
+        return res.json()
 
     # ----------------------------stop parsing-----------------------------------------------------
 
 
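One design note on these wrappers, inferred from the server code above and exercised by the tests below: the response's `message` aggregates per-document status fragments, so a `SUCCESS` code with a non-empty message still signals soft failures such as empty files. A short sketch of handling that, continuing the placeholder client from the earlier example:

```python
# res["code"] equals RetCode.SUCCESS on the happy path (see the tests below);
# a non-empty message still carries per-document warnings.
res = ragflow.start_parsing_documents(dataset_id)
if res["message"]:
    print("Parsed with warnings:", res["message"])  # e.g. "Empty data in the document: empty.txt; "
```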
sdk/python/test/test_data/lol.txt ADDED
@@ -0,0 +1,3 @@
+llll
+ooooo
+llll
sdk/python/test/test_document.py CHANGED
@@ -695,7 +695,261 @@ class TestFile(TestSdk):
         assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty."
 
     # ----------------------------start parsing-----------------------------------------------------
+    def test_start_parsing_document_with_success(self):
+        """
+        Test the parsing of a document with success.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_start_parsing_document_with_success")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/lol.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        # parse file
+        res = ragflow.start_parsing_document(created_res_id, doc_id)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_parsing_nonexistent_document(self):
+        """
+        Test parsing a document which does not exist.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        res = ragflow.start_parsing_document(created_res_id, "imagination")
+        assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == "This document 'imagination' cannot be found!"
+
+    def test_start_parsing_document_in_nonexistent_dataset(self):
+        """
+        Test parsing a document whose dataset is nonexistent.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        # parse
+        res = ragflow.start_parsing_document("imagination", doc_id)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!"
+
+    def test_start_parsing_an_empty_document(self):
+        """
+        Test the parsing of an empty document.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/empty.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        res = ragflow.start_parsing_document(created_res_id, doc_id)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; "
+
+    # ------------------------parsing multiple documents----------------------------
+    def test_start_parsing_documents_in_nonexistent_dataset(self):
+        """
+        Test parsing documents whose dataset is nonexistent.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # parse
+        res = ragflow.start_parsing_documents("imagination")
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!"
+
+    def test_start_parsing_multiple_documents(self):
+        """
+        Test parsing documents with success.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        res = ragflow.start_parsing_documents(created_res_id)
+        assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == ""
+
+    def test_start_parsing_multiple_documents_with_one_empty_file(self):
+        """
+        Test parsing documents, one of which is empty.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        res = ragflow.start_parsing_documents(created_res_id)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; "
+
+    def test_start_parsing_multiple_specific_documents(self):
+        """
+        Test parsing documents whose document ids are specified.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_re_parsing_multiple_specific_documents(self):
+        """
+        Test re-parsing documents.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_re_parsing_multiple_specific_documents_with_changing_parser_id(self):
+        """
+        Test re-parsing documents after changing the parser id.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+        # general -> laws
+        params = {
+            "template_type": "laws"
+        }
+        ragflow.update_file(created_res_id, doc_ids[0], **params)
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_re_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self):
+        """
+        Test re-parsing documents after changing to an illegal parser id.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+        # general -> illegal
+        params = {
+            "template_type": "illegal"
+        }
+        res = ragflow.update_file(created_res_id, doc_ids[0], **params)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field."
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self):
+        """
+        Test parsing documents after changing to an illegal parser id.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        # general -> illegal
+        params = {
+            "template_type": "illegal"
+        }
+        res = ragflow.update_file(created_res_id, doc_ids[0], **params)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field."
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
 
+    def test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal(self):
+        """
+        Test parsing documents whose dataset's parser id is illegal.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal")
+        created_res_id = created_res["data"]["dataset_id"]
+        # update the parser id
+        params = {
+            "chunk_method": "illegal"
+        }
+        res = ragflow.update_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal", **params)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'chunk_method' field."
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        # parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
     # ----------------------------stop parsing-----------------------------------------------------
 
     # ----------------------------show the status of the file-----------------------------------------------------