chongcb chongchuanbing committed on
Commit
d048400
·
1 Parent(s): 405e0ff

fix: Large document thumbnail display failed (#2763)

Browse files

### What problem does this PR solve?

When a document's thumbnail is stored in MySQL as a base64 string and that
string is relatively large, the thumbnail fails to display.
This change stores the document thumbnail in MinIO storage instead.

### Type of change

- [✓] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: chongchuanbing <[email protected]>

api/apps/document_app.py CHANGED
@@ -51,6 +51,7 @@ from api.utils.api_utils import get_json_result
51
  from rag.utils.storage_factory import STORAGE_IMPL
52
  from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
53
  from api.utils.web_utils import html2pdf, is_valid_url
 
54
 
55
 
56
  @manager.route('/upload', methods=['POST'])
@@ -209,6 +210,11 @@ def list_docs():
209
  try:
210
  docs, tol = DocumentService.get_by_kb_id(
211
  kb_id, page_number, items_per_page, orderby, desc, keywords)
 
 
 
 
 
212
  return get_json_result(data={"total": tol, "docs": docs})
213
  except Exception as e:
214
  return server_error_response(e)
 
51
  from rag.utils.storage_factory import STORAGE_IMPL
52
  from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
53
  from api.utils.web_utils import html2pdf, is_valid_url
54
+ from api.contants import IMG_BASE64_PREFIX
55
 
56
 
57
  @manager.route('/upload', methods=['POST'])
 
210
  try:
211
  docs, tol = DocumentService.get_by_kb_id(
212
  kb_id, page_number, items_per_page, orderby, desc, keywords)
213
+
214
+ for doc_item in docs:
215
+ if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
216
+ doc_item['thumbnail'] = f'/v1/document/image/{kb_id}-{doc_item['thumbnail']}'
217
+
218
  return get_json_result(data={"total": tol, "docs": docs})
219
  except Exception as e:
220
  return server_error_response(e)
api/contants.py CHANGED
@@ -13,4 +13,6 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
- NAME_LENGTH_LIMIT = 2 ** 10
 
 
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
+ NAME_LENGTH_LIMIT = 2 ** 10
17
+
18
+ IMG_BASE64_PREFIX = 'data:image/png;base64,'
api/db/services/file_service.py CHANGED
@@ -26,7 +26,7 @@ from api.db.services.common_service import CommonService
26
  from api.db.services.document_service import DocumentService
27
  from api.db.services.file2document_service import File2DocumentService
28
  from api.utils import get_uuid
29
- from api.utils.file_utils import filename_type, thumbnail
30
  from rag.utils.storage_factory import STORAGE_IMPL
31
 
32
 
@@ -354,8 +354,15 @@ class FileService(CommonService):
354
  location += "_"
355
  blob = file.read()
356
  STORAGE_IMPL.put(kb.id, location, blob)
 
 
 
 
 
 
 
357
  doc = {
358
- "id": get_uuid(),
359
  "kb_id": kb.id,
360
  "parser_id": self.get_parser(filetype, filename, kb.parser_id),
361
  "parser_config": kb.parser_config,
@@ -364,7 +371,7 @@ class FileService(CommonService):
364
  "name": filename,
365
  "location": location,
366
  "size": len(blob),
367
- "thumbnail": thumbnail(filename, blob)
368
  }
369
  DocumentService.insert(doc)
370
 
 
26
  from api.db.services.document_service import DocumentService
27
  from api.db.services.file2document_service import File2DocumentService
28
  from api.utils import get_uuid
29
+ from api.utils.file_utils import filename_type, thumbnail_img
30
  from rag.utils.storage_factory import STORAGE_IMPL
31
 
32
 
 
354
  location += "_"
355
  blob = file.read()
356
  STORAGE_IMPL.put(kb.id, location, blob)
357
+
358
+ doc_id = get_uuid()
359
+
360
+ img = thumbnail_img(filename, blob)
361
+ thumbnail_location = f'thumbnail_{doc_id}.png'
362
+ STORAGE_IMPL.put(kb.id, thumbnail_location, img)
363
+
364
  doc = {
365
+ "id": doc_id,
366
  "kb_id": kb.id,
367
  "parser_id": self.get_parser(filetype, filename, kb.parser_id),
368
  "parser_config": kb.parser_config,
 
371
  "name": filename,
372
  "location": location,
373
  "size": len(blob),
374
+ "thumbnail": thumbnail_location
375
  }
376
  DocumentService.insert(doc)
377
 
api/utils/file_utils.py CHANGED
@@ -25,6 +25,7 @@ from cachetools import LRUCache, cached
25
  from ruamel.yaml import YAML
26
 
27
  from api.db import FileType
 
28
 
29
  PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
30
  RAG_BASE = os.getenv("RAG_BASE")
@@ -168,23 +169,20 @@ def filename_type(filename):
168
 
169
  return FileType.OTHER.value
170
 
171
-
172
- def thumbnail(filename, blob):
173
  filename = filename.lower()
174
  if re.match(r".*\.pdf$", filename):
175
  pdf = pdfplumber.open(BytesIO(blob))
176
  buffered = BytesIO()
177
  pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
178
- return "data:image/png;base64," + \
179
- base64.b64encode(buffered.getvalue()).decode("utf-8")
180
 
181
  if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
182
  image = Image.open(BytesIO(blob))
183
  image.thumbnail((30, 30))
184
  buffered = BytesIO()
185
  image.save(buffered, format="png")
186
- return "data:image/png;base64," + \
187
- base64.b64encode(buffered.getvalue()).decode("utf-8")
188
 
189
  if re.match(r".*\.(ppt|pptx)$", filename):
190
  import aspose.slides as slides
@@ -194,11 +192,15 @@ def thumbnail(filename, blob):
194
  buffered = BytesIO()
195
  presentation.slides[0].get_thumbnail(0.03, 0.03).save(
196
  buffered, drawing.imaging.ImageFormat.png)
197
- return "data:image/png;base64," + \
198
- base64.b64encode(buffered.getvalue()).decode("utf-8")
199
  except Exception as e:
200
  pass
 
201
 
 
 
 
 
202
 
203
  def traversal_files(base):
204
  for root, ds, fs in os.walk(base):
 
25
  from ruamel.yaml import YAML
26
 
27
  from api.db import FileType
28
+ from api.contants import IMG_BASE64_PREFIX
29
 
30
  PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
31
  RAG_BASE = os.getenv("RAG_BASE")
 
169
 
170
  return FileType.OTHER.value
171
 
172
+ def thumbnail_img(filename, blob):
 
173
  filename = filename.lower()
174
  if re.match(r".*\.pdf$", filename):
175
  pdf = pdfplumber.open(BytesIO(blob))
176
  buffered = BytesIO()
177
  pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
178
+ return buffered.getvalue()
 
179
 
180
  if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
181
  image = Image.open(BytesIO(blob))
182
  image.thumbnail((30, 30))
183
  buffered = BytesIO()
184
  image.save(buffered, format="png")
185
+ return buffered.getvalue()
 
186
 
187
  if re.match(r".*\.(ppt|pptx)$", filename):
188
  import aspose.slides as slides
 
192
  buffered = BytesIO()
193
  presentation.slides[0].get_thumbnail(0.03, 0.03).save(
194
  buffered, drawing.imaging.ImageFormat.png)
195
+ return buffered.getvalue()
 
196
  except Exception as e:
197
  pass
198
+ return None
199
 
200
+ def thumbnail(filename, blob):
201
+ img = thumbnail_img(filename, blob)
202
+ return IMG_BASE64_PREFIX + \
203
+ base64.b64encode(img).decode("utf-8")
204
 
205
  def traversal_files(base):
206
  for root, ds, fs in os.walk(base):