zhichyu · Commit 47ea26c · Parent(s): 01d7cd1

Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks when a document is re-parsed. Each task now records a digest of its chunking configuration plus page range, along with the IDs of the chunks it produced; on re-parse, `queue_tasks` matches new tasks against completed previous tasks and reuses their chunks when the digests are identical. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
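
For reference, the heart of the change is a per-task digest. Below is a minimal sketch of the digest computation that `queue_tasks` now performs (field names mirror the diff; the config and task values are hypothetical):

```python
import xxhash

# Hypothetical chunking config, shaped like DocumentService.get_chunking_config()'s result.
chunking_config = {"id": "doc-1", "kb_id": "kb-1", "parser_id": "naive", "language": "English"}
task = {"doc_id": "doc-1", "from_page": 0, "to_page": 12}

hasher = xxhash.xxh64()
for field in sorted(chunking_config.keys()):      # stable field order
    hasher.update(str(chunking_config[field]).encode("utf-8"))
for field in ["doc_id", "from_page", "to_page"]:  # the task's identity and page range
    hasher.update(str(task.get(field, "")).encode("utf-8"))
task["digest"] = hasher.hexdigest()               # compared against prior tasks' digests
```

Any change to the chunking configuration or to the task's page range yields a different digest, which forces a genuine re-chunk instead of reuse.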

api/apps/document_app.py CHANGED
@@ -21,19 +21,25 @@ import flask
 from flask import request
 from flask_login import login_required, current_user
 
-from api.db.db_models import Task, File
+from deepdoc.parser.html_parser import RAGFlowHtmlParser
+from rag.nlp import search
+
+from api.db import FileType, TaskStatus, ParserType, FileSource
+from api.db.db_models import File, Task
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
-from api.db.services.task_service import TaskService, queue_tasks
+from api.db.services.task_service import queue_tasks
 from api.db.services.user_service import UserTenantService
-from deepdoc.parser.html_parser import RAGFlowHtmlParser
-from rag.nlp import search
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
-from api.utils import get_uuid
-from api.db import FileType, TaskStatus, ParserType, FileSource
+from api.db.services.task_service import TaskService
 from api.db.services.document_service import DocumentService, doc_upload_and_parse
+from api.utils.api_utils import (
+    server_error_response,
+    get_data_error_result,
+    validate_request,
+)
+from api.utils import get_uuid
 from api import settings
 from api.utils.api_utils import get_json_result
 from rag.utils.storage_factory import STORAGE_IMPL

@@ -316,6 +322,7 @@ def rm():
 
             b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
 
+            TaskService.filter_delete([Task.doc_id == doc_id])
             if not DocumentService.remove_document(doc, tenant_id):
                 return get_data_error_result(
                     message="Database error (Document removal)!")

@@ -361,11 +368,12 @@ def run():
             e, doc = DocumentService.get_by_id(id)
             if not e:
                 return get_data_error_result(message="Document not found!")
-            if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
-                settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
+            if req.get("delete", False):
+                TaskService.filter_delete([Task.doc_id == id])
+                if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
+                    settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
 
             if str(req["run"]) == TaskStatus.RUNNING.value:
-                TaskService.filter_delete([Task.doc_id == id])
                 e, doc = DocumentService.get_by_id(id)
                 doc = doc.to_dict()
                 doc["tenant_id"] = tenant_id
api/db/db_models.py CHANGED
@@ -855,6 +855,8 @@ class Task(DataBaseModel):
                               help_text="process message",
                               default="")
    retry_count = IntegerField(default=0)
+    digest = TextField(null=True, help_text="task digest", default="")
+    chunk_ids = LongTextField(null=True, help_text="chunk ids", default="")
 
 
 class Dialog(DataBaseModel):

@@ -1090,4 +1092,16 @@ def migrate_db():
         )
     except Exception:
         pass
+    try:
+        migrate(
+            migrator.add_column("task", "digest", TextField(null=True, help_text="task digest", default=""))
+        )
+    except Exception:
+        pass
 
+    try:
+        migrate(
+            migrator.add_column("task", "chunk_ids", LongTextField(null=True, help_text="chunk ids", default=""))
+        )
+    except Exception:
+        pass
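
Both new columns are plain text: `digest` stores the hex digest, and `chunk_ids` stores the produced chunk IDs as one space-separated string. A tiny sketch of the round-trip implied by the diff (`" ".join(...)` in the task executor, `.split()` in `queue_tasks`):

```python
# Store: the executor joins chunk IDs into one string (TaskService.update_chunk_ids).
chunk_ids = ["chunk-a", "chunk-b", "chunk-c"]  # hypothetical IDs
stored = " ".join(chunk_ids)

# Load: queue_tasks splits the string back to collect chunks to reuse or delete.
assert stored.split() == chunk_ids
```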
api/db/services/document_service.py CHANGED
@@ -282,6 +282,31 @@ class DocumentService(CommonService):
             return
         return docs[0]["embd_id"]
 
+    @classmethod
+    @DB.connection_context()
+    def get_chunking_config(cls, doc_id):
+        configs = (
+            cls.model.select(
+                cls.model.id,
+                cls.model.kb_id,
+                cls.model.parser_id,
+                cls.model.parser_config,
+                Knowledgebase.language,
+                Knowledgebase.embd_id,
+                Tenant.id.alias("tenant_id"),
+                Tenant.img2txt_id,
+                Tenant.asr_id,
+                Tenant.llm_id,
+            )
+            .join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id))
+            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
+            .where(cls.model.id == doc_id)
+        )
+        configs = configs.dicts()
+        if not configs:
+            return None
+        return configs[0]
+
     @classmethod
     @DB.connection_context()
     def get_doc_id_by_doc_name(cls, doc_name):
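
`get_chunking_config` joins the document to its knowledge base and tenant so that everything that can influence chunking lands in a single dict, which `queue_tasks` then hashes. An illustrative result (keys follow the selected columns; all values are made up):

```python
chunking_config = {
    "id": "doc-1",                              # document ID
    "kb_id": "kb-1",                            # knowledge base ID
    "parser_id": "naive",                       # chunking method
    "parser_config": {"chunk_token_num": 128},  # parser settings
    "language": "English",
    "embd_id": "embedding-model-id",
    "tenant_id": "tenant-1",
    "img2txt_id": "img2txt-model-id",
    "asr_id": "asr-model-id",
    "llm_id": "chat-model-id",
}
```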
api/db/services/task_service.py CHANGED
@@ -15,6 +15,8 @@
 #
 import os
 import random
+import xxhash
+import bisect
 
 from api.db.db_utils import bulk_insert_into_db
 from deepdoc.parser import PdfParser

@@ -29,7 +31,21 @@ from deepdoc.parser.excel_parser import RAGFlowExcelParser
 from rag.settings import SVR_QUEUE_NAME
 from rag.utils.storage_factory import STORAGE_IMPL
 from rag.utils.redis_conn import REDIS_CONN
+from api import settings
+from rag.nlp import search
 
+def trim_header_by_lines(text: str, max_length) -> str:
+    if len(text) <= max_length:
+        return text
+    lines = text.split("\n")
+    total = 0
+    idx = len(lines) - 1
+    for i in range(len(lines)-1, -1, -1):
+        if total + len(lines[i]) > max_length:
+            break
+        idx = i
+    text2 = "\n".join(lines[idx:])
+    return text2
 
 class TaskService(CommonService):
     model = Task

@@ -87,6 +103,30 @@ class TaskService(CommonService):
 
         return docs[0]
 
+    @classmethod
+    @DB.connection_context()
+    def get_tasks(cls, doc_id: str):
+        fields = [
+            cls.model.id,
+            cls.model.from_page,
+            cls.model.progress,
+            cls.model.digest,
+            cls.model.chunk_ids,
+        ]
+        tasks = (
+            cls.model.select(*fields).order_by(cls.model.from_page.asc(), cls.model.create_time.desc())
+            .where(cls.model.doc_id == doc_id)
+        )
+        tasks = list(tasks.dicts())
+        if not tasks:
+            return None
+        return tasks
+
+    @classmethod
+    @DB.connection_context()
+    def update_chunk_ids(cls, id: str, chunk_ids: str):
+        cls.model.update(chunk_ids=chunk_ids).where(cls.model.id == id).execute()
+
     @classmethod
     @DB.connection_context()
     def get_ongoing_doc_name(cls):

@@ -133,22 +173,18 @@ class TaskService(CommonService):
     @classmethod
     @DB.connection_context()
     def do_cancel(cls, id):
-        try:
-            task = cls.model.get_by_id(id)
-            _, doc = DocumentService.get_by_id(task.doc_id)
-            return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
-        except Exception:
-            pass
-        return False
+        task = cls.model.get_by_id(id)
+        _, doc = DocumentService.get_by_id(task.doc_id)
+        return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
 
     @classmethod
     @DB.connection_context()
     def update_progress(cls, id, info):
         if os.environ.get("MACOS"):
             if info["progress_msg"]:
-                cls.model.update(
-                    progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]
-                ).where(cls.model.id == id).execute()
+                task = cls.model.get_by_id(id)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
                     cls.model.id == id

@@ -157,9 +193,9 @@ class TaskService(CommonService):
 
         with DB.lock("update_progress", -1):
             if info["progress_msg"]:
-                cls.model.update(
-                    progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]
-                ).where(cls.model.id == id).execute()
+                task = cls.model.get_by_id(id)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
                     cls.model.id == id

@@ -168,7 +204,7 @@ class TaskService(CommonService):
 
 def queue_tasks(doc: dict, bucket: str, name: str):
     def new_task():
-        return {"id": get_uuid(), "doc_id": doc["id"]}
+        return {"id": get_uuid(), "doc_id": doc["id"], "progress": 0.0}
 
     tsks = []

@@ -203,10 +239,46 @@ def queue_tasks(doc: dict, bucket: str, name: str):
     else:
         tsks.append(new_task())
 
+    chunking_config = DocumentService.get_chunking_config(doc["id"])
+    for task in tsks:
+        hasher = xxhash.xxh64()
+        for field in sorted(chunking_config.keys()):
+            hasher.update(str(chunking_config[field]).encode("utf-8"))
+        for field in ["doc_id", "from_page", "to_page"]:
+            hasher.update(str(task.get(field, "")).encode("utf-8"))
+        task_digest = hasher.hexdigest()
+        task["digest"] = task_digest
+        task["progress"] = 0.0
+
+    prev_tasks = TaskService.get_tasks(doc["id"])
+    if prev_tasks:
+        for task in tsks:
+            reuse_prev_task_chunks(task, prev_tasks, chunking_config)
+        TaskService.filter_delete([Task.doc_id == doc["id"]])
+        chunk_ids = []
+        for task in prev_tasks:
+            if task["chunk_ids"]:
+                chunk_ids.extend(task["chunk_ids"].split())
+        if chunk_ids:
+            settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
+
     bulk_insert_into_db(Task, tsks, True)
     DocumentService.begin2parse(doc["id"])
 
+    tsks = [task for task in tsks if task["progress"] < 1.0]
     for t in tsks:
         assert REDIS_CONN.queue_product(
             SVR_QUEUE_NAME, message=t
         ), "Can't access Redis. Please check the Redis' status."
+
+def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
+    idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
+    if idx >= len(prev_tasks):
+        return
+    prev_task = prev_tasks[idx]
+    if prev_task["progress"] < 1.0 or prev_task["digest"] != task["digest"] or not prev_task["chunk_ids"]:
+        return
+    task["chunk_ids"] = prev_task["chunk_ids"]
+    task["progress"] = 1.0
+    task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"
+    prev_task["chunk_ids"] = ""
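
Putting the pieces together: `reuse_prev_task_chunks` binary-searches the previous tasks (which `get_tasks` returns ordered by `from_page`) and transfers chunks only when the matching task finished with an identical digest. A self-contained illustration with synthetic task dicts (requires Python 3.10+, where `bisect` gained its `key=` parameter):

```python
import bisect

# Previous tasks, sorted by from_page as TaskService.get_tasks returns them.
prev_tasks = [
    {"from_page": 0, "progress": 1.0, "digest": "abc", "chunk_ids": "c1 c2"},
    {"from_page": 12, "progress": 0.5, "digest": "def", "chunk_ids": ""},
]
task = {"from_page": 0, "to_page": 12, "digest": "abc"}  # new task, unchanged digest

idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
prev = prev_tasks[idx]
if prev["progress"] >= 1.0 and prev["digest"] == task["digest"] and prev["chunk_ids"]:
    task["chunk_ids"], task["progress"] = prev["chunk_ids"], 1.0
    prev["chunk_ids"] = ""  # consumed: queue_tasks must not delete these chunks

print(task)  # marked done, so queue_tasks skips queueing it to Redis
```

Clearing `prev_task["chunk_ids"]` after a successful match is what keeps the reused chunks alive: `queue_tasks` later deletes every chunk ID still attached to a previous task.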
poetry.lock CHANGED
@@ -145,7 +145,7 @@ name = "aiolimiter"
 version = "1.2.0"
 description = "asyncio rate limiter, a leaky bucket implementation"
 optional = false
-python-versions = "<4.0,>=3.8"
+python-versions = ">=3.8,<4.0"
 files = [
     {file = "aiolimiter-1.2.0-py3-none-any.whl", hash = "sha256:e3fc486a4506248cfdd1f3976920459945944518bbb1d1e6b2be1060232829e2"},
     {file = "aiolimiter-1.2.0.tar.gz", hash = "sha256:761455d26df0d7a393f78bd39b022579e02ca5a65beb303a67bed2ded2f740ac"},

@@ -416,7 +416,7 @@ name = "aspose-slides"
 version = "24.12.0"
 description = "Aspose.Slides for Python via .NET is a presentation file formats processing library for working with Microsoft PowerPoint files without using Microsoft PowerPoint."
 optional = false
-python-versions = "<3.14,>=3.5"
+python-versions = ">=3.5,<3.14"
 files = [
     {file = "Aspose.Slides-24.12.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ccfaa61a863ed28cd37b221e31a0edf4a83802599d76fb50861c25149ac5e5e3"},
     {file = "Aspose.Slides-24.12.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b050659129c5ca92e52fbcd7d5091caa244db731adb68fbea1fd0a8b9fd62a5a"},

@@ -568,7 +568,7 @@ name = "bce-python-sdk"
 version = "0.9.23"
 description = "BCE SDK for python"
 optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,<4,>=2.7"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4"
 files = [
     {file = "bce_python_sdk-0.9.23-py3-none-any.whl", hash = "sha256:8debe21a040e00060f6044877d594765ed7b18bc765c6bf16b878bca864140a3"},
     {file = "bce_python_sdk-0.9.23.tar.gz", hash = "sha256:19739fed5cd0725356fc5ffa2acbdd8fb23f2a81edb91db21a03174551d0cf41"},

@@ -1502,7 +1502,7 @@ name = "cryptography"
 version = "44.0.0"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
-python-versions = "!=3.9.0,!=3.9.1,>=3.7"
+python-versions = ">=3.7, !=3.9.0, !=3.9.1"
 files = [
     {file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"},
     {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"},

@@ -1711,7 +1711,7 @@ name = "deprecated"
 version = "1.2.15"
 description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
 optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 files = [
     {file = "Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320"},
     {file = "deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d"},

@@ -2042,7 +2042,7 @@ name = "fastembed"
 version = "0.3.6"
 description = "Fast, light, accurate library built for retrieval embedding generation"
 optional = false
-python-versions = "<3.13,>=3.8.0"
+python-versions = ">=3.8.0,<3.13"
 files = [
     {file = "fastembed-0.3.6-py3-none-any.whl", hash = "sha256:2bf70edae28bb4ccd9e01617098c2075b0ba35b88025a3d22b0e1e85b2c488ce"},
     {file = "fastembed-0.3.6.tar.gz", hash = "sha256:c93c8ec99b8c008c2d192d6297866b8d70ec7ac8f5696b34eb5ea91f85efd15f"},

@@ -2624,12 +2624,12 @@ files = [
 google-auth = ">=2.14.1,<3.0.dev0"
 googleapis-common-protos = ">=1.56.2,<2.0.dev0"
 grpcio = [
-    {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
     {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
+    {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
 ]
 grpcio-status = [
-    {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
     {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
+    {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
 ]
 proto-plus = ">=1.22.3,<2.0.0dev"
 protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"

@@ -2965,7 +2965,7 @@ name = "graspologic"
 version = "3.4.1"
 description = "A set of Python modules for graph statistics"
 optional = false
-python-versions = "<3.13,>=3.9"
+python-versions = ">=3.9,<3.13"
 files = [
     {file = "graspologic-3.4.1-py3-none-any.whl", hash = "sha256:c6563e087eda599bad1de831d4b7321c0daa7a82f4e85a7d7737ff67e07cdda2"},
     {file = "graspologic-3.4.1.tar.gz", hash = "sha256:7561f0b852a2bccd351bff77e8db07d9892f9dfa35a420fdec01690e4fdc8075"},

@@ -3650,7 +3650,7 @@ name = "infinity-emb"
 version = "0.0.66"
 description = "Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip."
 optional = false
-python-versions = "<4,>=3.9"
+python-versions = ">=3.9,<4"
 files = [
     {file = "infinity_emb-0.0.66-py3-none-any.whl", hash = "sha256:1dc6ed9fa48e6cbe83650a7583dbbb4bc393900c39c326bb0aff2ddc090ac018"},
     {file = "infinity_emb-0.0.66.tar.gz", hash = "sha256:9c9a361ccebf8e8f626c1f685286518d03d0c35e7d14179ae7c2500b4fc68b98"},

@@ -4098,7 +4098,7 @@ name = "litellm"
 version = "1.48.0"
 description = "Library to easily interface with LLM API providers"
 optional = false
-python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
+python-versions = ">=3.8, !=2.7.*, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*, !=3.7.*"
 files = [
     {file = "litellm-1.48.0-py3-none-any.whl", hash = "sha256:7765e8a92069778f5fc66aacfabd0e2f8ec8d74fb117f5e475567d89b0d376b9"},
     {file = "litellm-1.48.0.tar.gz", hash = "sha256:31a9b8a25a9daf44c24ddc08bf74298da920f2c5cea44135e5061278d0aa6fc9"},

@@ -5416,9 +5416,9 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
     {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
     {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]

@@ -5440,9 +5440,9 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
     {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
     {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]

@@ -5657,8 +5657,8 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.22.4", markers = "python_version < \"3.11\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 python-dateutil = ">=2.8.2"

@@ -6276,7 +6276,7 @@ name = "psutil"
 version = "6.1.0"
 description = "Cross-platform lib for process and system monitoring in Python."
 optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
 files = [
     {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"},
     {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"},

@@ -6298,8 +6298,8 @@ files = [
 ]
 
 [package.extras]
-dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"]
-test = ["pytest", "pytest-xdist", "setuptools"]
+dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx-rtd-theme", "toml-sort", "twine", "virtualenv", "wheel"]
+test = ["enum34", "futures", "ipaddress", "mock (==1.0.1)", "pytest (==4.6.11)", "pytest-xdist", "setuptools", "unittest2"]
 
 [[package]]
 name = "psycopg2-binary"

@@ -7803,40 +7803,30 @@ files = [
     {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
     {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
     {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
-    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
-    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
     {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
     {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
     {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
     {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
     {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
     {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
-    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
-    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
     {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
     {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
     {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
     {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
     {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
     {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
-    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
-    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
     {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
     {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
     {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
     {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
     {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
     {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
-    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
-    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
     {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
     {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
     {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
     {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
     {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
     {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
-    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
-    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
     {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
     {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
     {file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},

@@ -7847,7 +7837,7 @@ name = "s3transfer"
 version = "0.10.4"
 description = "An Amazon S3 Transfer Manager"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">= 3.8"
 files = [
     {file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"},
     {file = "s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"},

@@ -8309,7 +8299,7 @@ name = "smart-open"
 version = "7.0.5"
 description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
 optional = false
-python-versions = "<4.0,>=3.7"
+python-versions = ">=3.7,<4.0"
 files = [
     {file = "smart_open-7.0.5-py3-none-any.whl", hash = "sha256:8523ed805c12dff3eaa50e9c903a6cb0ae78800626631c5fe7ea073439847b89"},
     {file = "smart_open-7.0.5.tar.gz", hash = "sha256:d3672003b1dbc85e2013e4983b88eb9a5ccfd389b0d4e5015f39a9ee5620ec18"},

@@ -10119,4 +10109,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "6e401212ce8c6499bc8c687355a6cfe7d0fc2beecf21e7895546f82a2b4332cd"
+content-hash = "69fbe11a30c649544196546b9384ca5972bfd17a923b7dc8ff340f790984b5df"
pyproject.toml CHANGED
@@ -121,6 +121,7 @@ pyicu = "^2.13.1"
 flasgger = "^0.9.7.1"
 polars = { version = "^1.9.0", markers = "platform_machine == 'x86_64'" }
 polars-lts-cpu = { version = "^1.9.0", markers = "platform_machine == 'arm64'" }
+xxhash = "^3.5.0"
 
 
 [tool.poetry.group.full]
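
`xxhash` is the only new dependency; it backs the per-task digest in `queue_tasks`. A fast non-cryptographic hash is sufficient here, since the digest only needs to detect configuration and page-range changes between runs, not resist adversarial collisions.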
rag/svr/task_executor.py CHANGED
@@ -39,8 +39,9 @@ from timeit import default_timer as timer
 import tracemalloc
 
 import numpy as np
+from peewee import DoesNotExist
 
-from api.db import LLMType, ParserType
+from api.db import LLMType, ParserType, TaskStatus
 from api.db.services.dialog_service import keyword_extraction, question_proposal
 from api.db.services.document_service import DocumentService
 from api.db.services.llm_service import LLMBundle

@@ -89,12 +90,23 @@ DONE_TASKS = 0
 FAILED_TASKS = 0
 CURRENT_TASK = None
 
+class TaskCanceledException(Exception):
+    def __init__(self, msg):
+        self.msg = msg
 
 def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
     global PAYLOAD
     if prog is not None and prog < 0:
         msg = "[ERROR]" + msg
-    cancel = TaskService.do_cancel(task_id)
+    try:
+        cancel = TaskService.do_cancel(task_id)
+    except DoesNotExist:
+        logging.warning(f"set_progress task {task_id} is unknown")
+        if PAYLOAD:
+            PAYLOAD.ack()
+            PAYLOAD = None
+        return
+
     if cancel:
         msg += " [Canceled]"
         prog = -1

@@ -105,18 +117,22 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
     d = {"progress_msg": msg}
     if prog is not None:
         d["progress"] = prog
+
+    logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
     try:
-        logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
         TaskService.update_progress(task_id, d)
-    except Exception:
-        logging.exception(f"set_progress({task_id}) got exception")
-
-    close_connection()
-    if cancel:
+    except DoesNotExist:
+        logging.warning(f"set_progress task {task_id} is unknown")
         if PAYLOAD:
             PAYLOAD.ack()
             PAYLOAD = None
-        os._exit(0)
+        return
+
+    close_connection()
+    if cancel and PAYLOAD:
+        PAYLOAD.ack()
+        PAYLOAD = None
+        raise TaskCanceledException(msg)
 
 
 def collect():

@@ -136,16 +152,22 @@ def collect():
     if not msg:
         return None
 
-    if TaskService.do_cancel(msg["id"]):
-        with mt_lock:
-            DONE_TASKS += 1
-        logging.info("Task {} has been canceled.".format(msg["id"]))
-        return None
-    task = TaskService.get_task(msg["id"])
-    if not task:
+    task = None
+    canceled = False
+    try:
+        task = TaskService.get_task(msg["id"])
+        if task:
+            _, doc = DocumentService.get_by_id(task["doc_id"])
+            canceled = doc.run == TaskStatus.CANCEL.value or doc.progress < 0
+    except DoesNotExist:
+        pass
+    except Exception:
+        logging.exception("collect get_task exception")
+    if not task or canceled:
+        state = "is unknown" if not task else "has been cancelled"
         with mt_lock:
             DONE_TASKS += 1
-        logging.warning("{} empty task!".format(msg["id"]))
+        logging.info(f"collect task {msg['id']} {state}")
        return None
 
     if msg.get("type", "") == "raptor":

@@ -186,6 +208,8 @@ def build_chunks(task, progress_callback):
                             to_page=task["to_page"], lang=task["language"], callback=progress_callback,
                             kb_id=task["kb_id"], parser_config=task["parser_config"], tenant_id=task["tenant_id"])
         logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
+    except TaskCanceledException:
+        raise
     except Exception as e:
         progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
         logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))

@@ -358,6 +382,8 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
     return res, tk_count, vector_size
 
 
+
+
 def do_handle_task(task):
     task_id = task["id"]
     task_from_page = task["from_page"]

@@ -373,6 +399,16 @@ def do_handle_task(task):
 
     # prepare the progress callback function
     progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
+
+    try:
+        task_canceled = TaskService.do_cancel(task_id)
+    except DoesNotExist:
+        logging.warning(f"task {task_id} is unknown")
+        return
+    if task_canceled:
+        progress_callback(-1, msg="Task has been canceled.")
+        return
+
     try:
         # bind embedding model
         embedding_model = LLMBundle(task_tenant_id, LLMType.EMBEDDING, llm_name=task_embedding_id, lang=task_language)

@@ -390,6 +426,8 @@ def do_handle_task(task):
 
         # run RAPTOR
         chunks, token_count, vector_size = run_raptor(task, chat_model, embedding_model, progress_callback)
+    except TaskCanceledException:
+        raise
     except Exception as e:
         error_message = f'Fail to bind LLM used by RAPTOR: {str(e)}'
         progress_callback(-1, msg=error_message)

@@ -420,6 +458,7 @@ def do_handle_task(task):
     progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
     logging.info(progress_message)
     progress_callback(msg=progress_message)
+
     # logging.info(f"task_executor init_kb index {search.index_name(task_tenant_id)} embedding_model {embedding_model.llm_name} vector length {vector_size}")
     init_kb(task, vector_size)
     chunk_count = len(set([chunk["id"] for chunk in chunks]))

@@ -430,23 +469,25 @@ def do_handle_task(task):
         doc_store_result = settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)
         if b % 128 == 0:
             progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
-    logging.info("Indexing {} elapsed: {:.2f}".format(task_document_name, timer() - start_ts))
-    if doc_store_result:
-        error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
-        progress_callback(-1, msg=error_message)
-        settings.docStoreConn.delete({"doc_id": task_doc_id}, search.index_name(task_tenant_id), task_dataset_id)
-        logging.error(error_message)
-        raise Exception(error_message)
-
-    if TaskService.do_cancel(task_id):
-        settings.docStoreConn.delete({"doc_id": task_doc_id}, search.index_name(task_tenant_id), task_dataset_id)
-        return
+        if doc_store_result:
+            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
+            progress_callback(-1, msg=error_message)
+            raise Exception(error_message)
+        chunk_ids = [chunk["id"] for chunk in chunks[:b + es_bulk_size]]
+        chunk_ids_str = " ".join(chunk_ids)
+        try:
+            TaskService.update_chunk_ids(task["id"], chunk_ids_str)
+        except DoesNotExist:
+            logging.warning(f"do_handle_task update_chunk_ids failed since task {task['id']} is unknown.")
+            doc_store_result = settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id)
+            return
+    logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), timer() - start_ts))
 
     DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
 
     time_cost = timer() - start_ts
     progress_callback(prog=1.0, msg="Done ({:.2f}s)".format(time_cost))
+    logging.info("Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), token_count, time_cost))
 
 
 def handle_task():

@@ -462,6 +503,12 @@ def handle_task():
             DONE_TASKS += 1
             CURRENT_TASK = None
         logging.info(f"handle_task done for task {json.dumps(task)}")
+    except TaskCanceledException:
+        with mt_lock:
+            DONE_TASKS += 1
+            CURRENT_TASK = None
+        logging.info(f"handle_task got TaskCanceledException for task {json.dumps(task)}")
+        logging.debug("handle_task got TaskCanceledException", exc_info=True)
     except Exception:
         with mt_lock:
             FAILED_TASKS += 1
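
On the executor side, cancellation no longer calls `os._exit(0)`: `set_progress` raises `TaskCanceledException`, which unwinds to `handle_task`, so a canceled task is counted as done and the worker stays alive. A minimal sketch of that control flow (stub functions, not the executor's real call chain):

```python
class TaskCanceledException(Exception):
    def __init__(self, msg):
        self.msg = msg

def set_progress_stub(canceled: bool):
    # Mirrors set_progress: on cancel, raise instead of exiting the process.
    if canceled:
        raise TaskCanceledException("Task has been canceled.")

def handle_task_stub():
    try:
        set_progress_stub(canceled=True)
    except TaskCanceledException as e:
        print(f"task canceled: {e.msg}")  # the worker process keeps running

handle_task_stub()
```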