Try to reuse existing chunks (#3983)
### What problem does this PR solve?
Try to reuse existing chunks. Close #3793
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/document_app.py +18 -10
- api/db/db_models.py +14 -0
- api/db/services/document_service.py +25 -0
- api/db/services/task_service.py +86 -14
- poetry.lock +20 -30
- pyproject.toml +1 -0
- rag/svr/task_executor.py +76 -29
api/apps/document_app.py
CHANGED
@@ -21,19 +21,25 @@ import flask
|
|
21 |
from flask import request
|
22 |
from flask_login import login_required, current_user
|
23 |
|
24 |
-
from
|
|
|
|
|
|
|
|
|
25 |
from api.db.services.file2document_service import File2DocumentService
|
26 |
from api.db.services.file_service import FileService
|
27 |
-
from api.db.services.task_service import
|
28 |
from api.db.services.user_service import UserTenantService
|
29 |
-
from deepdoc.parser.html_parser import RAGFlowHtmlParser
|
30 |
-
from rag.nlp import search
|
31 |
from api.db.services import duplicate_name
|
32 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
33 |
-
from api.
|
34 |
-
from api.utils import get_uuid
|
35 |
-
from api.db import FileType, TaskStatus, ParserType, FileSource
|
36 |
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
from api import settings
|
38 |
from api.utils.api_utils import get_json_result
|
39 |
from rag.utils.storage_factory import STORAGE_IMPL
|
@@ -316,6 +322,7 @@ def rm():
|
|
316 |
|
317 |
b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
|
318 |
|
|
|
319 |
if not DocumentService.remove_document(doc, tenant_id):
|
320 |
return get_data_error_result(
|
321 |
message="Database error (Document removal)!")
|
@@ -361,11 +368,12 @@ def run():
|
|
361 |
e, doc = DocumentService.get_by_id(id)
|
362 |
if not e:
|
363 |
return get_data_error_result(message="Document not found!")
|
364 |
-
if
|
365 |
-
|
|
|
|
|
366 |
|
367 |
if str(req["run"]) == TaskStatus.RUNNING.value:
|
368 |
-
TaskService.filter_delete([Task.doc_id == id])
|
369 |
e, doc = DocumentService.get_by_id(id)
|
370 |
doc = doc.to_dict()
|
371 |
doc["tenant_id"] = tenant_id
|
|
|
21 |
from flask import request
|
22 |
from flask_login import login_required, current_user
|
23 |
|
24 |
+
from deepdoc.parser.html_parser import RAGFlowHtmlParser
|
25 |
+
from rag.nlp import search
|
26 |
+
|
27 |
+
from api.db import FileType, TaskStatus, ParserType, FileSource
|
28 |
+
from api.db.db_models import File, Task
|
29 |
from api.db.services.file2document_service import File2DocumentService
|
30 |
from api.db.services.file_service import FileService
|
31 |
+
from api.db.services.task_service import queue_tasks
|
32 |
from api.db.services.user_service import UserTenantService
|
|
|
|
|
33 |
from api.db.services import duplicate_name
|
34 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
35 |
+
from api.db.services.task_service import TaskService
|
|
|
|
|
36 |
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
37 |
+
from api.utils.api_utils import (
|
38 |
+
server_error_response,
|
39 |
+
get_data_error_result,
|
40 |
+
validate_request,
|
41 |
+
)
|
42 |
+
from api.utils import get_uuid
|
43 |
from api import settings
|
44 |
from api.utils.api_utils import get_json_result
|
45 |
from rag.utils.storage_factory import STORAGE_IMPL
|
|
|
322 |
|
323 |
b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
|
324 |
|
325 |
+
TaskService.filter_delete([Task.doc_id == doc_id])
|
326 |
if not DocumentService.remove_document(doc, tenant_id):
|
327 |
return get_data_error_result(
|
328 |
message="Database error (Document removal)!")
|
|
|
368 |
e, doc = DocumentService.get_by_id(id)
|
369 |
if not e:
|
370 |
return get_data_error_result(message="Document not found!")
|
371 |
+
if req.get("delete", False):
|
372 |
+
TaskService.filter_delete([Task.doc_id == id])
|
373 |
+
if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
|
374 |
+
settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
|
375 |
|
376 |
if str(req["run"]) == TaskStatus.RUNNING.value:
|
|
|
377 |
e, doc = DocumentService.get_by_id(id)
|
378 |
doc = doc.to_dict()
|
379 |
doc["tenant_id"] = tenant_id
|
api/db/db_models.py
CHANGED
@@ -855,6 +855,8 @@ class Task(DataBaseModel):
|
|
855 |
help_text="process message",
|
856 |
default="")
|
857 |
retry_count = IntegerField(default=0)
|
|
|
|
|
858 |
|
859 |
|
860 |
class Dialog(DataBaseModel):
|
@@ -1090,4 +1092,16 @@ def migrate_db():
|
|
1090 |
)
|
1091 |
except Exception:
|
1092 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
1093 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
855 |
help_text="process message",
|
856 |
default="")
|
857 |
retry_count = IntegerField(default=0)
|
858 |
+
digest = TextField(null=True, help_text="task digest", default="")
|
859 |
+
chunk_ids = LongTextField(null=True, help_text="chunk ids", default="")
|
860 |
|
861 |
|
862 |
class Dialog(DataBaseModel):
|
|
|
1092 |
)
|
1093 |
except Exception:
|
1094 |
pass
|
1095 |
+
try:
|
1096 |
+
migrate(
|
1097 |
+
migrator.add_column("task", "digest", TextField(null=True, help_text="task digest", default=""))
|
1098 |
+
)
|
1099 |
+
except Exception:
|
1100 |
+
pass
|
1101 |
|
1102 |
+
try:
|
1103 |
+
migrate(
|
1104 |
+
migrator.add_column("task", "chunk_ids", LongTextField(null=True, help_text="chunk ids", default=""))
|
1105 |
+
)
|
1106 |
+
except Exception:
|
1107 |
+
pass
|
api/db/services/document_service.py
CHANGED
@@ -282,6 +282,31 @@ class DocumentService(CommonService):
|
|
282 |
return
|
283 |
return docs[0]["embd_id"]
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
@classmethod
|
286 |
@DB.connection_context()
|
287 |
def get_doc_id_by_doc_name(cls, doc_name):
|
|
|
282 |
return
|
283 |
return docs[0]["embd_id"]
|
284 |
|
285 |
+
@classmethod
|
286 |
+
@DB.connection_context()
|
287 |
+
def get_chunking_config(cls, doc_id):
|
288 |
+
configs = (
|
289 |
+
cls.model.select(
|
290 |
+
cls.model.id,
|
291 |
+
cls.model.kb_id,
|
292 |
+
cls.model.parser_id,
|
293 |
+
cls.model.parser_config,
|
294 |
+
Knowledgebase.language,
|
295 |
+
Knowledgebase.embd_id,
|
296 |
+
Tenant.id.alias("tenant_id"),
|
297 |
+
Tenant.img2txt_id,
|
298 |
+
Tenant.asr_id,
|
299 |
+
Tenant.llm_id,
|
300 |
+
)
|
301 |
+
.join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id))
|
302 |
+
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
|
303 |
+
.where(cls.model.id == doc_id)
|
304 |
+
)
|
305 |
+
configs = configs.dicts()
|
306 |
+
if not configs:
|
307 |
+
return None
|
308 |
+
return configs[0]
|
309 |
+
|
310 |
@classmethod
|
311 |
@DB.connection_context()
|
312 |
def get_doc_id_by_doc_name(cls, doc_name):
|
api/db/services/task_service.py
CHANGED
@@ -15,6 +15,8 @@
|
|
15 |
#
|
16 |
import os
|
17 |
import random
|
|
|
|
|
18 |
|
19 |
from api.db.db_utils import bulk_insert_into_db
|
20 |
from deepdoc.parser import PdfParser
|
@@ -29,7 +31,21 @@ from deepdoc.parser.excel_parser import RAGFlowExcelParser
|
|
29 |
from rag.settings import SVR_QUEUE_NAME
|
30 |
from rag.utils.storage_factory import STORAGE_IMPL
|
31 |
from rag.utils.redis_conn import REDIS_CONN
|
|
|
|
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
class TaskService(CommonService):
|
35 |
model = Task
|
@@ -87,6 +103,30 @@ class TaskService(CommonService):
|
|
87 |
|
88 |
return docs[0]
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
@classmethod
|
91 |
@DB.connection_context()
|
92 |
def get_ongoing_doc_name(cls):
|
@@ -133,22 +173,18 @@ class TaskService(CommonService):
|
|
133 |
@classmethod
|
134 |
@DB.connection_context()
|
135 |
def do_cancel(cls, id):
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
|
140 |
-
except Exception:
|
141 |
-
pass
|
142 |
-
return False
|
143 |
|
144 |
@classmethod
|
145 |
@DB.connection_context()
|
146 |
def update_progress(cls, id, info):
|
147 |
if os.environ.get("MACOS"):
|
148 |
if info["progress_msg"]:
|
149 |
-
cls.model.
|
150 |
-
|
151 |
-
).where(cls.model.id == id).execute()
|
152 |
if "progress" in info:
|
153 |
cls.model.update(progress=info["progress"]).where(
|
154 |
cls.model.id == id
|
@@ -157,9 +193,9 @@ class TaskService(CommonService):
|
|
157 |
|
158 |
with DB.lock("update_progress", -1):
|
159 |
if info["progress_msg"]:
|
160 |
-
cls.model.
|
161 |
-
|
162 |
-
).where(cls.model.id == id).execute()
|
163 |
if "progress" in info:
|
164 |
cls.model.update(progress=info["progress"]).where(
|
165 |
cls.model.id == id
|
@@ -168,7 +204,7 @@ class TaskService(CommonService):
|
|
168 |
|
169 |
def queue_tasks(doc: dict, bucket: str, name: str):
|
170 |
def new_task():
|
171 |
-
return {"id": get_uuid(), "doc_id": doc["id"]}
|
172 |
|
173 |
tsks = []
|
174 |
|
@@ -203,10 +239,46 @@ def queue_tasks(doc: dict, bucket: str, name: str):
|
|
203 |
else:
|
204 |
tsks.append(new_task())
|
205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
bulk_insert_into_db(Task, tsks, True)
|
207 |
DocumentService.begin2parse(doc["id"])
|
208 |
|
|
|
209 |
for t in tsks:
|
210 |
assert REDIS_CONN.queue_product(
|
211 |
SVR_QUEUE_NAME, message=t
|
212 |
), "Can't access Redis. Please check the Redis' status."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
#
|
16 |
import os
|
17 |
import random
|
18 |
+
import xxhash
|
19 |
+
import bisect
|
20 |
|
21 |
from api.db.db_utils import bulk_insert_into_db
|
22 |
from deepdoc.parser import PdfParser
|
|
|
31 |
from rag.settings import SVR_QUEUE_NAME
|
32 |
from rag.utils.storage_factory import STORAGE_IMPL
|
33 |
from rag.utils.redis_conn import REDIS_CONN
|
34 |
+
from api import settings
|
35 |
+
from rag.nlp import search
|
36 |
|
37 |
+
def trim_header_by_lines(text: str, max_length) -> str:
    """Trim *text* from the head so the kept tail stays within *max_length* chars.

    Lines are dropped from the beginning (the oldest progress messages),
    preserving the newest trailing lines.  If the whole text already fits it
    is returned unchanged.  Edge case: when the very last line alone exceeds
    ``max_length`` it is still kept, so the result may exceed the limit then.

    Bug fix: the original never accumulated ``total`` inside the loop, so the
    length budget was never consumed and the scan only stopped on a single
    line longer than ``max_length`` — i.e. the text was almost never trimmed.
    """
    if len(text) <= max_length:
        return text
    lines = text.split("\n")
    total = 0
    idx = len(lines) - 1
    for i in range(len(lines) - 1, -1, -1):
        if total + len(lines[i]) > max_length:
            break
        # +1 accounts for the "\n" that re-joins this line to the kept tail.
        total += len(lines[i]) + 1
        idx = i
    return "\n".join(lines[idx:])
|
49 |
|
50 |
class TaskService(CommonService):
|
51 |
model = Task
|
|
|
103 |
|
104 |
return docs[0]
|
105 |
|
106 |
+
@classmethod
|
107 |
+
@DB.connection_context()
|
108 |
+
def get_tasks(cls, doc_id: str):
|
109 |
+
fields = [
|
110 |
+
cls.model.id,
|
111 |
+
cls.model.from_page,
|
112 |
+
cls.model.progress,
|
113 |
+
cls.model.digest,
|
114 |
+
cls.model.chunk_ids,
|
115 |
+
]
|
116 |
+
tasks = (
|
117 |
+
cls.model.select(*fields).order_by(cls.model.from_page.asc(), cls.model.create_time.desc())
|
118 |
+
.where(cls.model.doc_id == doc_id)
|
119 |
+
)
|
120 |
+
tasks = list(tasks.dicts())
|
121 |
+
if not tasks:
|
122 |
+
return None
|
123 |
+
return tasks
|
124 |
+
|
125 |
+
@classmethod
|
126 |
+
@DB.connection_context()
|
127 |
+
def update_chunk_ids(cls, id: str, chunk_ids: str):
|
128 |
+
cls.model.update(chunk_ids=chunk_ids).where(cls.model.id == id).execute()
|
129 |
+
|
130 |
@classmethod
|
131 |
@DB.connection_context()
|
132 |
def get_ongoing_doc_name(cls):
|
|
|
173 |
@classmethod
|
174 |
@DB.connection_context()
|
175 |
def do_cancel(cls, id):
|
176 |
+
task = cls.model.get_by_id(id)
|
177 |
+
_, doc = DocumentService.get_by_id(task.doc_id)
|
178 |
+
return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
|
|
|
|
|
|
|
|
|
179 |
|
180 |
@classmethod
|
181 |
@DB.connection_context()
|
182 |
def update_progress(cls, id, info):
|
183 |
if os.environ.get("MACOS"):
|
184 |
if info["progress_msg"]:
|
185 |
+
task = cls.model.get_by_id(id)
|
186 |
+
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
|
187 |
+
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
|
188 |
if "progress" in info:
|
189 |
cls.model.update(progress=info["progress"]).where(
|
190 |
cls.model.id == id
|
|
|
193 |
|
194 |
with DB.lock("update_progress", -1):
|
195 |
if info["progress_msg"]:
|
196 |
+
task = cls.model.get_by_id(id)
|
197 |
+
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
|
198 |
+
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
|
199 |
if "progress" in info:
|
200 |
cls.model.update(progress=info["progress"]).where(
|
201 |
cls.model.id == id
|
|
|
204 |
|
205 |
def queue_tasks(doc: dict, bucket: str, name: str):
|
206 |
def new_task():
|
207 |
+
return {"id": get_uuid(), "doc_id": doc["id"], "progress": 0.0}
|
208 |
|
209 |
tsks = []
|
210 |
|
|
|
239 |
else:
|
240 |
tsks.append(new_task())
|
241 |
|
242 |
+
chunking_config = DocumentService.get_chunking_config(doc["id"])
|
243 |
+
for task in tsks:
|
244 |
+
hasher = xxhash.xxh64()
|
245 |
+
for field in sorted(chunking_config.keys()):
|
246 |
+
hasher.update(str(chunking_config[field]).encode("utf-8"))
|
247 |
+
for field in ["doc_id", "from_page", "to_page"]:
|
248 |
+
hasher.update(str(task.get(field, "")).encode("utf-8"))
|
249 |
+
task_digest = hasher.hexdigest()
|
250 |
+
task["digest"] = task_digest
|
251 |
+
task["progress"] = 0.0
|
252 |
+
|
253 |
+
prev_tasks = TaskService.get_tasks(doc["id"])
|
254 |
+
if prev_tasks:
|
255 |
+
for task in tsks:
|
256 |
+
reuse_prev_task_chunks(task, prev_tasks, chunking_config)
|
257 |
+
TaskService.filter_delete([Task.doc_id == doc["id"]])
|
258 |
+
chunk_ids = []
|
259 |
+
for task in prev_tasks:
|
260 |
+
if task["chunk_ids"]:
|
261 |
+
chunk_ids.extend(task["chunk_ids"].split())
|
262 |
+
if chunk_ids:
|
263 |
+
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
|
264 |
+
|
265 |
bulk_insert_into_db(Task, tsks, True)
|
266 |
DocumentService.begin2parse(doc["id"])
|
267 |
|
268 |
+
tsks = [task for task in tsks if task["progress"] < 1.0]
|
269 |
for t in tsks:
|
270 |
assert REDIS_CONN.queue_product(
|
271 |
SVR_QUEUE_NAME, message=t
|
272 |
), "Can't access Redis. Please check the Redis' status."
|
273 |
+
|
274 |
+
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
|
275 |
+
idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
|
276 |
+
if idx >= len(prev_tasks):
|
277 |
+
return
|
278 |
+
prev_task = prev_tasks[idx]
|
279 |
+
if prev_task["progress"] < 1.0 or prev_task["digest"] != task["digest"] or not prev_task["chunk_ids"]:
|
280 |
+
return
|
281 |
+
task["chunk_ids"] = prev_task["chunk_ids"]
|
282 |
+
task["progress"] = 1.0
|
283 |
+
task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"
|
284 |
+
prev_task["chunk_ids"] = ""
|
poetry.lock
CHANGED
@@ -145,7 +145,7 @@ name = "aiolimiter"
|
|
145 |
version = "1.2.0"
|
146 |
description = "asyncio rate limiter, a leaky bucket implementation"
|
147 |
optional = false
|
148 |
-
python-versions = "
|
149 |
files = [
|
150 |
{file = "aiolimiter-1.2.0-py3-none-any.whl", hash = "sha256:e3fc486a4506248cfdd1f3976920459945944518bbb1d1e6b2be1060232829e2"},
|
151 |
{file = "aiolimiter-1.2.0.tar.gz", hash = "sha256:761455d26df0d7a393f78bd39b022579e02ca5a65beb303a67bed2ded2f740ac"},
|
@@ -416,7 +416,7 @@ name = "aspose-slides"
|
|
416 |
version = "24.12.0"
|
417 |
description = "Aspose.Slides for Python via .NET is a presentation file formats processing library for working with Microsoft PowerPoint files without using Microsoft PowerPoint."
|
418 |
optional = false
|
419 |
-
python-versions = "
|
420 |
files = [
|
421 |
{file = "Aspose.Slides-24.12.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ccfaa61a863ed28cd37b221e31a0edf4a83802599d76fb50861c25149ac5e5e3"},
|
422 |
{file = "Aspose.Slides-24.12.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b050659129c5ca92e52fbcd7d5091caa244db731adb68fbea1fd0a8b9fd62a5a"},
|
@@ -568,7 +568,7 @@ name = "bce-python-sdk"
|
|
568 |
version = "0.9.23"
|
569 |
description = "BCE SDK for python"
|
570 |
optional = false
|
571 |
-
python-versions = "!=3.0
|
572 |
files = [
|
573 |
{file = "bce_python_sdk-0.9.23-py3-none-any.whl", hash = "sha256:8debe21a040e00060f6044877d594765ed7b18bc765c6bf16b878bca864140a3"},
|
574 |
{file = "bce_python_sdk-0.9.23.tar.gz", hash = "sha256:19739fed5cd0725356fc5ffa2acbdd8fb23f2a81edb91db21a03174551d0cf41"},
|
@@ -1502,7 +1502,7 @@ name = "cryptography"
|
|
1502 |
version = "44.0.0"
|
1503 |
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
1504 |
optional = false
|
1505 |
-
python-versions = "!=3.9.0
|
1506 |
files = [
|
1507 |
{file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"},
|
1508 |
{file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"},
|
@@ -1711,7 +1711,7 @@ name = "deprecated"
|
|
1711 |
version = "1.2.15"
|
1712 |
description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
|
1713 |
optional = false
|
1714 |
-
python-versions = "!=3.0
|
1715 |
files = [
|
1716 |
{file = "Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320"},
|
1717 |
{file = "deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d"},
|
@@ -2042,7 +2042,7 @@ name = "fastembed"
|
|
2042 |
version = "0.3.6"
|
2043 |
description = "Fast, light, accurate library built for retrieval embedding generation"
|
2044 |
optional = false
|
2045 |
-
python-versions = "
|
2046 |
files = [
|
2047 |
{file = "fastembed-0.3.6-py3-none-any.whl", hash = "sha256:2bf70edae28bb4ccd9e01617098c2075b0ba35b88025a3d22b0e1e85b2c488ce"},
|
2048 |
{file = "fastembed-0.3.6.tar.gz", hash = "sha256:c93c8ec99b8c008c2d192d6297866b8d70ec7ac8f5696b34eb5ea91f85efd15f"},
|
@@ -2624,12 +2624,12 @@ files = [
|
|
2624 |
google-auth = ">=2.14.1,<3.0.dev0"
|
2625 |
googleapis-common-protos = ">=1.56.2,<2.0.dev0"
|
2626 |
grpcio = [
|
2627 |
-
{version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
|
2628 |
{version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
|
|
|
2629 |
]
|
2630 |
grpcio-status = [
|
2631 |
-
{version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
|
2632 |
{version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
|
|
|
2633 |
]
|
2634 |
proto-plus = ">=1.22.3,<2.0.0dev"
|
2635 |
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
|
@@ -2965,7 +2965,7 @@ name = "graspologic"
|
|
2965 |
version = "3.4.1"
|
2966 |
description = "A set of Python modules for graph statistics"
|
2967 |
optional = false
|
2968 |
-
python-versions = "
|
2969 |
files = [
|
2970 |
{file = "graspologic-3.4.1-py3-none-any.whl", hash = "sha256:c6563e087eda599bad1de831d4b7321c0daa7a82f4e85a7d7737ff67e07cdda2"},
|
2971 |
{file = "graspologic-3.4.1.tar.gz", hash = "sha256:7561f0b852a2bccd351bff77e8db07d9892f9dfa35a420fdec01690e4fdc8075"},
|
@@ -3650,7 +3650,7 @@ name = "infinity-emb"
|
|
3650 |
version = "0.0.66"
|
3651 |
description = "Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip."
|
3652 |
optional = false
|
3653 |
-
python-versions = "
|
3654 |
files = [
|
3655 |
{file = "infinity_emb-0.0.66-py3-none-any.whl", hash = "sha256:1dc6ed9fa48e6cbe83650a7583dbbb4bc393900c39c326bb0aff2ddc090ac018"},
|
3656 |
{file = "infinity_emb-0.0.66.tar.gz", hash = "sha256:9c9a361ccebf8e8f626c1f685286518d03d0c35e7d14179ae7c2500b4fc68b98"},
|
@@ -4098,7 +4098,7 @@ name = "litellm"
|
|
4098 |
version = "1.48.0"
|
4099 |
description = "Library to easily interface with LLM API providers"
|
4100 |
optional = false
|
4101 |
-
python-versions = "!=2.7
|
4102 |
files = [
|
4103 |
{file = "litellm-1.48.0-py3-none-any.whl", hash = "sha256:7765e8a92069778f5fc66aacfabd0e2f8ec8d74fb117f5e475567d89b0d376b9"},
|
4104 |
{file = "litellm-1.48.0.tar.gz", hash = "sha256:31a9b8a25a9daf44c24ddc08bf74298da920f2c5cea44135e5061278d0aa6fc9"},
|
@@ -5416,9 +5416,9 @@ files = [
|
|
5416 |
|
5417 |
[package.dependencies]
|
5418 |
numpy = [
|
5419 |
-
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
5420 |
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
5421 |
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
|
|
5422 |
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
5423 |
]
|
5424 |
|
@@ -5440,9 +5440,9 @@ files = [
|
|
5440 |
|
5441 |
[package.dependencies]
|
5442 |
numpy = [
|
5443 |
-
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
5444 |
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
5445 |
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
|
|
5446 |
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
5447 |
]
|
5448 |
|
@@ -5657,8 +5657,8 @@ files = [
|
|
5657 |
|
5658 |
[package.dependencies]
|
5659 |
numpy = [
|
5660 |
-
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
5661 |
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
|
|
5662 |
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
5663 |
]
|
5664 |
python-dateutil = ">=2.8.2"
|
@@ -6276,7 +6276,7 @@ name = "psutil"
|
|
6276 |
version = "6.1.0"
|
6277 |
description = "Cross-platform lib for process and system monitoring in Python."
|
6278 |
optional = false
|
6279 |
-
python-versions = "!=3.0
|
6280 |
files = [
|
6281 |
{file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"},
|
6282 |
{file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"},
|
@@ -6298,8 +6298,8 @@ files = [
|
|
6298 |
]
|
6299 |
|
6300 |
[package.extras]
|
6301 |
-
dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "
|
6302 |
-
test = ["pytest", "pytest-xdist", "setuptools"]
|
6303 |
|
6304 |
[[package]]
|
6305 |
name = "psycopg2-binary"
|
@@ -7803,40 +7803,30 @@ files = [
|
|
7803 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
|
7804 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
|
7805 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
|
7806 |
-
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
|
7807 |
-
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
|
7808 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
|
7809 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
|
7810 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
|
7811 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
|
7812 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
|
7813 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
|
7814 |
-
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
|
7815 |
-
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
|
7816 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
|
7817 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
|
7818 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
|
7819 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
|
7820 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
|
7821 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
|
7822 |
-
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
|
7823 |
-
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
|
7824 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
|
7825 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
|
7826 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
|
7827 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
|
7828 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
|
7829 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
|
7830 |
-
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
|
7831 |
-
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
|
7832 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
|
7833 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
|
7834 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
|
7835 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
|
7836 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
|
7837 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
|
7838 |
-
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
|
7839 |
-
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
|
7840 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
|
7841 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
|
7842 |
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
|
@@ -7847,7 +7837,7 @@ name = "s3transfer"
|
|
7847 |
version = "0.10.4"
|
7848 |
description = "An Amazon S3 Transfer Manager"
|
7849 |
optional = false
|
7850 |
-
python-versions = ">=3.8"
|
7851 |
files = [
|
7852 |
{file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"},
|
7853 |
{file = "s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"},
|
@@ -8309,7 +8299,7 @@ name = "smart-open"
|
|
8309 |
version = "7.0.5"
|
8310 |
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
|
8311 |
optional = false
|
8312 |
-
python-versions = "
|
8313 |
files = [
|
8314 |
{file = "smart_open-7.0.5-py3-none-any.whl", hash = "sha256:8523ed805c12dff3eaa50e9c903a6cb0ae78800626631c5fe7ea073439847b89"},
|
8315 |
{file = "smart_open-7.0.5.tar.gz", hash = "sha256:d3672003b1dbc85e2013e4983b88eb9a5ccfd389b0d4e5015f39a9ee5620ec18"},
|
@@ -10119,4 +10109,4 @@ cffi = ["cffi (>=1.11)"]
|
|
10119 |
[metadata]
|
10120 |
lock-version = "2.0"
|
10121 |
python-versions = ">=3.10,<3.13"
|
10122 |
-
content-hash = "
|
|
|
145 |
version = "1.2.0"
|
146 |
description = "asyncio rate limiter, a leaky bucket implementation"
|
147 |
optional = false
|
148 |
+
python-versions = ">=3.8,<4.0"
|
149 |
files = [
|
150 |
{file = "aiolimiter-1.2.0-py3-none-any.whl", hash = "sha256:e3fc486a4506248cfdd1f3976920459945944518bbb1d1e6b2be1060232829e2"},
|
151 |
{file = "aiolimiter-1.2.0.tar.gz", hash = "sha256:761455d26df0d7a393f78bd39b022579e02ca5a65beb303a67bed2ded2f740ac"},
|
|
|
416 |
version = "24.12.0"
|
417 |
description = "Aspose.Slides for Python via .NET is a presentation file formats processing library for working with Microsoft PowerPoint files without using Microsoft PowerPoint."
|
418 |
optional = false
|
419 |
+
python-versions = ">=3.5,<3.14"
|
420 |
files = [
|
421 |
{file = "Aspose.Slides-24.12.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ccfaa61a863ed28cd37b221e31a0edf4a83802599d76fb50861c25149ac5e5e3"},
|
422 |
{file = "Aspose.Slides-24.12.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b050659129c5ca92e52fbcd7d5091caa244db731adb68fbea1fd0a8b9fd62a5a"},
|
|
|
568 |
version = "0.9.23"
|
569 |
description = "BCE SDK for python"
|
570 |
optional = false
|
571 |
+
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4"
|
572 |
files = [
|
573 |
{file = "bce_python_sdk-0.9.23-py3-none-any.whl", hash = "sha256:8debe21a040e00060f6044877d594765ed7b18bc765c6bf16b878bca864140a3"},
|
574 |
{file = "bce_python_sdk-0.9.23.tar.gz", hash = "sha256:19739fed5cd0725356fc5ffa2acbdd8fb23f2a81edb91db21a03174551d0cf41"},
|
|
|
1502 |
version = "44.0.0"
|
1503 |
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
1504 |
optional = false
|
1505 |
+
python-versions = ">=3.7, !=3.9.0, !=3.9.1"
|
1506 |
files = [
|
1507 |
{file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"},
|
1508 |
{file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"},
|
|
|
1711 |
version = "1.2.15"
|
1712 |
description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
|
1713 |
optional = false
|
1714 |
+
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
1715 |
files = [
|
1716 |
{file = "Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320"},
|
1717 |
{file = "deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d"},
|
|
|
2042 |
version = "0.3.6"
|
2043 |
description = "Fast, light, accurate library built for retrieval embedding generation"
|
2044 |
optional = false
|
2045 |
+
python-versions = ">=3.8.0,<3.13"
|
2046 |
files = [
|
2047 |
{file = "fastembed-0.3.6-py3-none-any.whl", hash = "sha256:2bf70edae28bb4ccd9e01617098c2075b0ba35b88025a3d22b0e1e85b2c488ce"},
|
2048 |
{file = "fastembed-0.3.6.tar.gz", hash = "sha256:c93c8ec99b8c008c2d192d6297866b8d70ec7ac8f5696b34eb5ea91f85efd15f"},
|
|
|
2624 |
google-auth = ">=2.14.1,<3.0.dev0"
|
2625 |
googleapis-common-protos = ">=1.56.2,<2.0.dev0"
|
2626 |
grpcio = [
|
|
|
2627 |
{version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
|
2628 |
+
{version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
|
2629 |
]
|
2630 |
grpcio-status = [
|
|
|
2631 |
{version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
|
2632 |
+
{version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
|
2633 |
]
|
2634 |
proto-plus = ">=1.22.3,<2.0.0dev"
|
2635 |
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
|
|
|
2965 |
version = "3.4.1"
|
2966 |
description = "A set of Python modules for graph statistics"
|
2967 |
optional = false
|
2968 |
+
python-versions = ">=3.9,<3.13"
|
2969 |
files = [
|
2970 |
{file = "graspologic-3.4.1-py3-none-any.whl", hash = "sha256:c6563e087eda599bad1de831d4b7321c0daa7a82f4e85a7d7737ff67e07cdda2"},
|
2971 |
{file = "graspologic-3.4.1.tar.gz", hash = "sha256:7561f0b852a2bccd351bff77e8db07d9892f9dfa35a420fdec01690e4fdc8075"},
|
|
|
3650 |
version = "0.0.66"
|
3651 |
description = "Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip."
|
3652 |
optional = false
|
3653 |
+
python-versions = ">=3.9,<4"
|
3654 |
files = [
|
3655 |
{file = "infinity_emb-0.0.66-py3-none-any.whl", hash = "sha256:1dc6ed9fa48e6cbe83650a7583dbbb4bc393900c39c326bb0aff2ddc090ac018"},
|
3656 |
{file = "infinity_emb-0.0.66.tar.gz", hash = "sha256:9c9a361ccebf8e8f626c1f685286518d03d0c35e7d14179ae7c2500b4fc68b98"},
|
|
|
4098 |
version = "1.48.0"
|
4099 |
description = "Library to easily interface with LLM API providers"
|
4100 |
optional = false
|
4101 |
+
python-versions = ">=3.8, !=2.7.*, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*, !=3.7.*"
|
4102 |
files = [
|
4103 |
{file = "litellm-1.48.0-py3-none-any.whl", hash = "sha256:7765e8a92069778f5fc66aacfabd0e2f8ec8d74fb117f5e475567d89b0d376b9"},
|
4104 |
{file = "litellm-1.48.0.tar.gz", hash = "sha256:31a9b8a25a9daf44c24ddc08bf74298da920f2c5cea44135e5061278d0aa6fc9"},
|
|
|
5416 |
|
5417 |
[package.dependencies]
|
5418 |
numpy = [
|
|
|
5419 |
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
5420 |
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
5421 |
+
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
5422 |
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
5423 |
]
|
5424 |
|
|
|
5440 |
|
5441 |
[package.dependencies]
|
5442 |
numpy = [
|
|
|
5443 |
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
5444 |
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
5445 |
+
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
5446 |
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
5447 |
]
|
5448 |
|
|
|
5657 |
|
5658 |
[package.dependencies]
|
5659 |
numpy = [
|
|
|
5660 |
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
5661 |
+
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
5662 |
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
5663 |
]
|
5664 |
python-dateutil = ">=2.8.2"
|
|
|
6276 |
version = "6.1.0"
|
6277 |
description = "Cross-platform lib for process and system monitoring in Python."
|
6278 |
optional = false
|
6279 |
+
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
6280 |
files = [
|
6281 |
{file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"},
|
6282 |
{file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"},
|
|
|
6298 |
]
|
6299 |
|
6300 |
[package.extras]
|
6301 |
+
dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx-rtd-theme", "toml-sort", "twine", "virtualenv", "wheel"]
|
6302 |
+
test = ["enum34", "futures", "ipaddress", "mock (==1.0.1)", "pytest (==4.6.11)", "pytest-xdist", "setuptools", "unittest2"]
|
6303 |
|
6304 |
[[package]]
|
6305 |
name = "psycopg2-binary"
|
|
|
7803 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
|
7804 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
|
7805 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
|
|
|
|
|
7806 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
|
7807 |
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
|
7808 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
|
7809 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
|
7810 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
|
7811 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
|
|
|
|
|
7812 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
|
7813 |
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
|
7814 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
|
7815 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
|
7816 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
|
7817 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
|
|
|
|
|
7818 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
|
7819 |
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
|
7820 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
|
7821 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
|
7822 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
|
7823 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
|
|
|
|
|
7824 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
|
7825 |
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
|
7826 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
|
7827 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
|
7828 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
|
7829 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
|
|
|
|
|
7830 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
|
7831 |
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
|
7832 |
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
|
|
|
7837 |
version = "0.10.4"
|
7838 |
description = "An Amazon S3 Transfer Manager"
|
7839 |
optional = false
|
7840 |
+
python-versions = ">= 3.8"
|
7841 |
files = [
|
7842 |
{file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"},
|
7843 |
{file = "s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"},
|
|
|
8299 |
version = "7.0.5"
|
8300 |
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
|
8301 |
optional = false
|
8302 |
+
python-versions = ">=3.7,<4.0"
|
8303 |
files = [
|
8304 |
{file = "smart_open-7.0.5-py3-none-any.whl", hash = "sha256:8523ed805c12dff3eaa50e9c903a6cb0ae78800626631c5fe7ea073439847b89"},
|
8305 |
{file = "smart_open-7.0.5.tar.gz", hash = "sha256:d3672003b1dbc85e2013e4983b88eb9a5ccfd389b0d4e5015f39a9ee5620ec18"},
|
|
|
10109 |
[metadata]
|
10110 |
lock-version = "2.0"
|
10111 |
python-versions = ">=3.10,<3.13"
|
10112 |
+
content-hash = "69fbe11a30c649544196546b9384ca5972bfd17a923b7dc8ff340f790984b5df"
|
pyproject.toml
CHANGED
@@ -121,6 +121,7 @@ pyicu = "^2.13.1"
|
|
121 |
flasgger = "^0.9.7.1"
|
122 |
polars = { version = "^1.9.0", markers = "platform_machine == 'x86_64'" }
|
123 |
polars-lts-cpu = { version = "^1.9.0", markers = "platform_machine == 'arm64'" }
|
|
|
124 |
|
125 |
|
126 |
[tool.poetry.group.full]
|
|
|
121 |
flasgger = "^0.9.7.1"
|
122 |
polars = { version = "^1.9.0", markers = "platform_machine == 'x86_64'" }
|
123 |
polars-lts-cpu = { version = "^1.9.0", markers = "platform_machine == 'arm64'" }
|
124 |
+
xxhash = "^3.5.0"
|
125 |
|
126 |
|
127 |
[tool.poetry.group.full]
|
rag/svr/task_executor.py
CHANGED
@@ -39,8 +39,9 @@ from timeit import default_timer as timer
|
|
39 |
import tracemalloc
|
40 |
|
41 |
import numpy as np
|
|
|
42 |
|
43 |
-
from api.db import LLMType, ParserType
|
44 |
from api.db.services.dialog_service import keyword_extraction, question_proposal
|
45 |
from api.db.services.document_service import DocumentService
|
46 |
from api.db.services.llm_service import LLMBundle
|
@@ -89,12 +90,23 @@ DONE_TASKS = 0
|
|
89 |
FAILED_TASKS = 0
|
90 |
CURRENT_TASK = None
|
91 |
|
|
|
|
|
|
|
92 |
|
93 |
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
|
94 |
global PAYLOAD
|
95 |
if prog is not None and prog < 0:
|
96 |
msg = "[ERROR]" + msg
|
97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
if cancel:
|
99 |
msg += " [Canceled]"
|
100 |
prog = -1
|
@@ -105,18 +117,22 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
|
|
105 |
d = {"progress_msg": msg}
|
106 |
if prog is not None:
|
107 |
d["progress"] = prog
|
|
|
|
|
108 |
try:
|
109 |
-
logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
|
110 |
TaskService.update_progress(task_id, d)
|
111 |
-
except
|
112 |
-
logging.
|
113 |
-
|
114 |
-
close_connection()
|
115 |
-
if cancel:
|
116 |
if PAYLOAD:
|
117 |
PAYLOAD.ack()
|
118 |
PAYLOAD = None
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
|
122 |
def collect():
|
@@ -136,16 +152,22 @@ def collect():
|
|
136 |
if not msg:
|
137 |
return None
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
with mt_lock:
|
147 |
DONE_TASKS += 1
|
148 |
-
logging.
|
149 |
return None
|
150 |
|
151 |
if msg.get("type", "") == "raptor":
|
@@ -186,6 +208,8 @@ def build_chunks(task, progress_callback):
|
|
186 |
to_page=task["to_page"], lang=task["language"], callback=progress_callback,
|
187 |
kb_id=task["kb_id"], parser_config=task["parser_config"], tenant_id=task["tenant_id"])
|
188 |
logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
|
|
|
|
|
189 |
except Exception as e:
|
190 |
progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
|
191 |
logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
|
@@ -358,6 +382,8 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
|
|
358 |
return res, tk_count, vector_size
|
359 |
|
360 |
|
|
|
|
|
361 |
def do_handle_task(task):
|
362 |
task_id = task["id"]
|
363 |
task_from_page = task["from_page"]
|
@@ -373,6 +399,16 @@ def do_handle_task(task):
|
|
373 |
|
374 |
# prepare the progress callback function
|
375 |
progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
try:
|
377 |
# bind embedding model
|
378 |
embedding_model = LLMBundle(task_tenant_id, LLMType.EMBEDDING, llm_name=task_embedding_id, lang=task_language)
|
@@ -390,6 +426,8 @@ def do_handle_task(task):
|
|
390 |
|
391 |
# run RAPTOR
|
392 |
chunks, token_count, vector_size = run_raptor(task, chat_model, embedding_model, progress_callback)
|
|
|
|
|
393 |
except Exception as e:
|
394 |
error_message = f'Fail to bind LLM used by RAPTOR: {str(e)}'
|
395 |
progress_callback(-1, msg=error_message)
|
@@ -420,6 +458,7 @@ def do_handle_task(task):
|
|
420 |
progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
|
421 |
logging.info(progress_message)
|
422 |
progress_callback(msg=progress_message)
|
|
|
423 |
# logging.info(f"task_executor init_kb index {search.index_name(task_tenant_id)} embedding_model {embedding_model.llm_name} vector length {vector_size}")
|
424 |
init_kb(task, vector_size)
|
425 |
chunk_count = len(set([chunk["id"] for chunk in chunks]))
|
@@ -430,23 +469,25 @@ def do_handle_task(task):
|
|
430 |
doc_store_result = settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)
|
431 |
if b % 128 == 0:
|
432 |
progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
|
|
|
|
444 |
|
445 |
DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
|
446 |
|
447 |
time_cost = timer() - start_ts
|
448 |
progress_callback(prog=1.0, msg="Done ({:.2f}s)".format(time_cost))
|
449 |
-
logging.info("Chunk doc({}),
|
450 |
|
451 |
|
452 |
def handle_task():
|
@@ -462,6 +503,12 @@ def handle_task():
|
|
462 |
DONE_TASKS += 1
|
463 |
CURRENT_TASK = None
|
464 |
logging.info(f"handle_task done for task {json.dumps(task)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
465 |
except Exception:
|
466 |
with mt_lock:
|
467 |
FAILED_TASKS += 1
|
|
|
39 |
import tracemalloc
|
40 |
|
41 |
import numpy as np
|
42 |
+
from peewee import DoesNotExist
|
43 |
|
44 |
+
from api.db import LLMType, ParserType, TaskStatus
|
45 |
from api.db.services.dialog_service import keyword_extraction, question_proposal
|
46 |
from api.db.services.document_service import DocumentService
|
47 |
from api.db.services.llm_service import LLMBundle
|
|
|
90 |
FAILED_TASKS = 0
|
91 |
CURRENT_TASK = None
|
92 |
|
93 |
+
class TaskCanceledException(Exception):
|
94 |
+
def __init__(self, msg):
|
95 |
+
self.msg = msg
|
96 |
|
97 |
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
|
98 |
global PAYLOAD
|
99 |
if prog is not None and prog < 0:
|
100 |
msg = "[ERROR]" + msg
|
101 |
+
try:
|
102 |
+
cancel = TaskService.do_cancel(task_id)
|
103 |
+
except DoesNotExist:
|
104 |
+
logging.warning(f"set_progress task {task_id} is unknown")
|
105 |
+
if PAYLOAD:
|
106 |
+
PAYLOAD.ack()
|
107 |
+
PAYLOAD = None
|
108 |
+
return
|
109 |
+
|
110 |
if cancel:
|
111 |
msg += " [Canceled]"
|
112 |
prog = -1
|
|
|
117 |
d = {"progress_msg": msg}
|
118 |
if prog is not None:
|
119 |
d["progress"] = prog
|
120 |
+
|
121 |
+
logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
|
122 |
try:
|
|
|
123 |
TaskService.update_progress(task_id, d)
|
124 |
+
except DoesNotExist:
|
125 |
+
logging.warning(f"set_progress task {task_id} is unknown")
|
|
|
|
|
|
|
126 |
if PAYLOAD:
|
127 |
PAYLOAD.ack()
|
128 |
PAYLOAD = None
|
129 |
+
return
|
130 |
+
|
131 |
+
close_connection()
|
132 |
+
if cancel and PAYLOAD:
|
133 |
+
PAYLOAD.ack()
|
134 |
+
PAYLOAD = None
|
135 |
+
raise TaskCanceledException(msg)
|
136 |
|
137 |
|
138 |
def collect():
|
|
|
152 |
if not msg:
|
153 |
return None
|
154 |
|
155 |
+
task = None
|
156 |
+
canceled = False
|
157 |
+
try:
|
158 |
+
task = TaskService.get_task(msg["id"])
|
159 |
+
if task:
|
160 |
+
_, doc = DocumentService.get_by_id(task["doc_id"])
|
161 |
+
canceled = doc.run == TaskStatus.CANCEL.value or doc.progress < 0
|
162 |
+
except DoesNotExist:
|
163 |
+
pass
|
164 |
+
except Exception:
|
165 |
+
logging.exception("collect get_task exception")
|
166 |
+
if not task or canceled:
|
167 |
+
state = "is unknown" if not task else "has been cancelled"
|
168 |
with mt_lock:
|
169 |
DONE_TASKS += 1
|
170 |
+
logging.info(f"collect task {msg['id']} {state}")
|
171 |
return None
|
172 |
|
173 |
if msg.get("type", "") == "raptor":
|
|
|
208 |
to_page=task["to_page"], lang=task["language"], callback=progress_callback,
|
209 |
kb_id=task["kb_id"], parser_config=task["parser_config"], tenant_id=task["tenant_id"])
|
210 |
logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
|
211 |
+
except TaskCanceledException:
|
212 |
+
raise
|
213 |
except Exception as e:
|
214 |
progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
|
215 |
logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
|
|
|
382 |
return res, tk_count, vector_size
|
383 |
|
384 |
|
385 |
+
|
386 |
+
|
387 |
def do_handle_task(task):
|
388 |
task_id = task["id"]
|
389 |
task_from_page = task["from_page"]
|
|
|
399 |
|
400 |
# prepare the progress callback function
|
401 |
progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
|
402 |
+
|
403 |
+
try:
|
404 |
+
task_canceled = TaskService.do_cancel(task_id)
|
405 |
+
except DoesNotExist:
|
406 |
+
logging.warning(f"task {task_id} is unknown")
|
407 |
+
return
|
408 |
+
if task_canceled:
|
409 |
+
progress_callback(-1, msg="Task has been canceled.")
|
410 |
+
return
|
411 |
+
|
412 |
try:
|
413 |
# bind embedding model
|
414 |
embedding_model = LLMBundle(task_tenant_id, LLMType.EMBEDDING, llm_name=task_embedding_id, lang=task_language)
|
|
|
426 |
|
427 |
# run RAPTOR
|
428 |
chunks, token_count, vector_size = run_raptor(task, chat_model, embedding_model, progress_callback)
|
429 |
+
except TaskCanceledException:
|
430 |
+
raise
|
431 |
except Exception as e:
|
432 |
error_message = f'Fail to bind LLM used by RAPTOR: {str(e)}'
|
433 |
progress_callback(-1, msg=error_message)
|
|
|
458 |
progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
|
459 |
logging.info(progress_message)
|
460 |
progress_callback(msg=progress_message)
|
461 |
+
|
462 |
# logging.info(f"task_executor init_kb index {search.index_name(task_tenant_id)} embedding_model {embedding_model.llm_name} vector length {vector_size}")
|
463 |
init_kb(task, vector_size)
|
464 |
chunk_count = len(set([chunk["id"] for chunk in chunks]))
|
|
|
469 |
doc_store_result = settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)
|
470 |
if b % 128 == 0:
|
471 |
progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
|
472 |
+
if doc_store_result:
|
473 |
+
error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
|
474 |
+
progress_callback(-1, msg=error_message)
|
475 |
+
raise Exception(error_message)
|
476 |
+
chunk_ids = [chunk["id"] for chunk in chunks[:b + es_bulk_size]]
|
477 |
+
chunk_ids_str = " ".join(chunk_ids)
|
478 |
+
try:
|
479 |
+
TaskService.update_chunk_ids(task["id"], chunk_ids_str)
|
480 |
+
except DoesNotExist:
|
481 |
+
logging.warning(f"do_handle_task update_chunk_ids failed since task {task['id']} is unknown.")
|
482 |
+
doc_store_result = settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id)
|
483 |
+
return
|
484 |
+
logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), timer() - start_ts))
|
485 |
|
486 |
DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
|
487 |
|
488 |
time_cost = timer() - start_ts
|
489 |
progress_callback(prog=1.0, msg="Done ({:.2f}s)".format(time_cost))
|
490 |
+
logging.info("Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), token_count, time_cost))
|
491 |
|
492 |
|
493 |
def handle_task():
|
|
|
503 |
DONE_TASKS += 1
|
504 |
CURRENT_TASK = None
|
505 |
logging.info(f"handle_task done for task {json.dumps(task)}")
|
506 |
+
except TaskCanceledException:
|
507 |
+
with mt_lock:
|
508 |
+
DONE_TASKS += 1
|
509 |
+
CURRENT_TASK = None
|
510 |
+
logging.info(f"handle_task got TaskCanceledException for task {json.dumps(task)}")
|
511 |
+
logging.debug("handle_task got TaskCanceledException", exc_info=True)
|
512 |
except Exception:
|
513 |
with mt_lock:
|
514 |
FAILED_TASKS += 1
|