KevinHuSh
commited on
Commit
·
cac848f
1
Parent(s):
54ec234
fix bug about fetching file from minio (#574)
Browse files### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- api/apps/file_app.py +3 -3
- api/db/services/file2document_service.py +17 -0
- api/db/services/file_service.py +1 -1
- api/db/services/task_service.py +6 -4
- rag/svr/task_broker.py +5 -2
- rag/svr/task_executor.py +4 -1
api/apps/file_app.py
CHANGED
@@ -328,12 +328,12 @@ def rename():
|
|
328 |
# @login_required
|
329 |
def get(file_id):
|
330 |
try:
|
331 |
-
e,
|
332 |
if not e:
|
333 |
return get_data_error_result(retmsg="Document not found!")
|
334 |
|
335 |
-
response = flask.make_response(MINIO.get(
|
336 |
-
ext = re.search(r"\.([^.]+)$",
|
337 |
if ext:
|
338 |
if doc.type == FileType.VISUAL.value:
|
339 |
response.headers.set('Content-Type', 'image/%s' % ext.group(1))
|
|
|
328 |
# @login_required
|
329 |
def get(file_id):
|
330 |
try:
|
331 |
+
e, file = FileService.get_by_id(file_id)
|
332 |
if not e:
|
333 |
return get_data_error_result(retmsg="Document not found!")
|
334 |
|
335 |
+
response = flask.make_response(MINIO.get(file.parent_id, file.location))
|
336 |
+
ext = re.search(r"\.([^.]+)$", file.name)
|
337 |
if ext:
|
338 |
if doc.type == FileType.VISUAL.value:
|
339 |
response.headers.set('Content-Type', 'image/%s' % ext.group(1))
|
api/db/services/file2document_service.py
CHANGED
@@ -18,6 +18,8 @@ from datetime import datetime
|
|
18 |
from api.db.db_models import DB
|
19 |
from api.db.db_models import File, Document, File2Document
|
20 |
from api.db.services.common_service import CommonService
|
|
|
|
|
21 |
from api.utils import current_timestamp, datetime_format
|
22 |
|
23 |
|
@@ -64,3 +66,18 @@ class File2DocumentService(CommonService):
|
|
64 |
num = cls.model.update(obj).where(cls.model.id == file_id).execute()
|
65 |
e, obj = cls.get_by_id(cls.model.id)
|
66 |
return obj
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
from api.db.db_models import DB
|
19 |
from api.db.db_models import File, Document, File2Document
|
20 |
from api.db.services.common_service import CommonService
|
21 |
+
from api.db.services.document_service import DocumentService
|
22 |
+
from api.db.services.file_service import FileService
|
23 |
from api.utils import current_timestamp, datetime_format
|
24 |
|
25 |
|
|
|
66 |
num = cls.model.update(obj).where(cls.model.id == file_id).execute()
|
67 |
e, obj = cls.get_by_id(cls.model.id)
|
68 |
return obj
|
69 |
+
|
70 |
+
@classmethod
|
71 |
+
@DB.connection_context()
|
72 |
+
def get_minio_address(cls, doc_id=None, file_id=None):
|
73 |
+
if doc_id:
|
74 |
+
ids = File2DocumentService.get_by_document_id(doc_id)
|
75 |
+
else:
|
76 |
+
ids = File2DocumentService.get_by_file_id(file_id)
|
77 |
+
if ids:
|
78 |
+
e, file = FileService.get_by_id(ids[0].file_id)
|
79 |
+
return file.parent_id, file.location
|
80 |
+
else:
|
81 |
+
assert doc_id, "please specify doc_id"
|
82 |
+
e, doc = DocumentService.get_by_id(doc_id)
|
83 |
+
return doc.kb_id, doc.location
|
api/db/services/file_service.py
CHANGED
@@ -21,7 +21,6 @@ from api.db.db_models import DB, File2Document, Knowledgebase
|
|
21 |
from api.db.db_models import File, Document
|
22 |
from api.db.services.common_service import CommonService
|
23 |
from api.utils import get_uuid
|
24 |
-
from rag.utils import MINIO
|
25 |
|
26 |
|
27 |
class FileService(CommonService):
|
@@ -241,3 +240,4 @@ class FileService(CommonService):
|
|
241 |
|
242 |
dfs(folder_id)
|
243 |
return size
|
|
|
|
21 |
from api.db.db_models import File, Document
|
22 |
from api.db.services.common_service import CommonService
|
23 |
from api.utils import get_uuid
|
|
|
24 |
|
25 |
|
26 |
class FileService(CommonService):
|
|
|
240 |
|
241 |
dfs(folder_id)
|
242 |
return size
|
243 |
+
|
api/db/services/task_service.py
CHANGED
@@ -15,8 +15,8 @@
|
|
15 |
#
|
16 |
import random
|
17 |
|
18 |
-
from peewee import Expression
|
19 |
-
from api.db.db_models import DB
|
20 |
from api.db import StatusEnum, FileType, TaskStatus
|
21 |
from api.db.db_models import Task, Document, Knowledgebase, Tenant
|
22 |
from api.db.services.common_service import CommonService
|
@@ -75,8 +75,10 @@ class TaskService(CommonService):
|
|
75 |
@DB.connection_context()
|
76 |
def get_ongoing_doc_name(cls):
|
77 |
with DB.lock("get_task", -1):
|
78 |
-
docs = cls.model.select(*[Document.kb_id, Document.location]) \
|
79 |
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
|
|
|
|
80 |
.where(
|
81 |
Document.status == StatusEnum.VALID.value,
|
82 |
Document.run == TaskStatus.RUNNING.value,
|
@@ -88,7 +90,7 @@ class TaskService(CommonService):
|
|
88 |
docs = list(docs.dicts())
|
89 |
if not docs: return []
|
90 |
|
91 |
-
return list(set([(d["kb_id"], d["location"]) for d in docs]))
|
92 |
|
93 |
@classmethod
|
94 |
@DB.connection_context()
|
|
|
15 |
#
|
16 |
import random
|
17 |
|
18 |
+
from peewee import Expression, JOIN
|
19 |
+
from api.db.db_models import DB, File2Document, File
|
20 |
from api.db import StatusEnum, FileType, TaskStatus
|
21 |
from api.db.db_models import Task, Document, Knowledgebase, Tenant
|
22 |
from api.db.services.common_service import CommonService
|
|
|
75 |
@DB.connection_context()
|
76 |
def get_ongoing_doc_name(cls):
|
77 |
with DB.lock("get_task", -1):
|
78 |
+
docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
|
79 |
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
80 |
+
.join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
|
81 |
+
.join(File, on=(File2Document.file_id == File.id)) \
|
82 |
.where(
|
83 |
Document.status == StatusEnum.VALID.value,
|
84 |
Document.run == TaskStatus.RUNNING.value,
|
|
|
90 |
docs = list(docs.dicts())
|
91 |
if not docs: return []
|
92 |
|
93 |
+
return list(set([(d["parent_id"] if d["parent_id"] else d["kb_id"], d["location"]) for d in docs]))
|
94 |
|
95 |
@classmethod
|
96 |
@DB.connection_context()
|
rag/svr/task_broker.py
CHANGED
@@ -20,6 +20,8 @@ import random
|
|
20 |
from datetime import datetime
|
21 |
from api.db.db_models import Task
|
22 |
from api.db.db_utils import bulk_insert_into_db
|
|
|
|
|
23 |
from api.db.services.task_service import TaskService
|
24 |
from deepdoc.parser import PdfParser
|
25 |
from deepdoc.parser.excel_parser import HuExcelParser
|
@@ -87,10 +89,11 @@ def dispatch():
|
|
87 |
|
88 |
tsks = []
|
89 |
try:
|
90 |
-
|
|
|
91 |
if REDIS_CONN.is_alive():
|
92 |
try:
|
93 |
-
REDIS_CONN.set("{}/{}".format(
|
94 |
except Exception as e:
|
95 |
cron_logger.warning("Put into redis[EXCEPTION]:" + str(e))
|
96 |
|
|
|
20 |
from datetime import datetime
|
21 |
from api.db.db_models import Task
|
22 |
from api.db.db_utils import bulk_insert_into_db
|
23 |
+
from api.db.services.file2document_service import File2DocumentService
|
24 |
+
from api.db.services.file_service import FileService
|
25 |
from api.db.services.task_service import TaskService
|
26 |
from deepdoc.parser import PdfParser
|
27 |
from deepdoc.parser.excel_parser import HuExcelParser
|
|
|
89 |
|
90 |
tsks = []
|
91 |
try:
|
92 |
+
bucket, name = File2DocumentService.get_minio_address(doc_id=r["id"])
|
93 |
+
file_bin = MINIO.get(bucket, name)
|
94 |
if REDIS_CONN.is_alive():
|
95 |
try:
|
96 |
+
REDIS_CONN.set("{}/{}".format(bucket, name), file_bin, 12*60)
|
97 |
except Exception as e:
|
98 |
cron_logger.warning("Put into redis[EXCEPTION]:" + str(e))
|
99 |
|
rag/svr/task_executor.py
CHANGED
@@ -24,6 +24,8 @@ import sys
|
|
24 |
import time
|
25 |
import traceback
|
26 |
from functools import partial
|
|
|
|
|
27 |
from rag.utils import MINIO
|
28 |
from api.db.db_models import close_connection
|
29 |
from rag.settings import database_logger
|
@@ -135,7 +137,8 @@ def build(row):
|
|
135 |
pool = Pool(processes=1)
|
136 |
try:
|
137 |
st = timer()
|
138 |
-
|
|
|
139 |
binary = thr.get(timeout=90)
|
140 |
pool.terminate()
|
141 |
cron_logger.info(
|
|
|
24 |
import time
|
25 |
import traceback
|
26 |
from functools import partial
|
27 |
+
|
28 |
+
from api.db.services.file2document_service import File2DocumentService
|
29 |
from rag.utils import MINIO
|
30 |
from api.db.db_models import close_connection
|
31 |
from rag.settings import database_logger
|
|
|
137 |
pool = Pool(processes=1)
|
138 |
try:
|
139 |
st = timer()
|
140 |
+
bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
|
141 |
+
thr = pool.apply_async(get_minio_binary, args=(bucket, name))
|
142 |
binary = thr.get(timeout=90)
|
143 |
pool.terminate()
|
144 |
cron_logger.info(
|