lamhieu committed on
Commit
141a997
·
1 Parent(s): aa9c9a7

fix: resolve invalid file extension

Browse files
Files changed (3) hide show
  1. docsifer/router.py +1 -1
  2. docsifer/service.py +7 -5
  3. requirements.txt +1 -1
docsifer/router.py CHANGED
@@ -68,7 +68,7 @@ async def convert_document(
68
  if file is not None:
69
  with tempfile.TemporaryDirectory() as tmpdir:
70
  contents = await file.read()
71
- guessed_ext = mimetypes.guess_extension(file.content_type) or ""
72
  new_name = f"{Path(file.filename).stem}{guessed_ext}"
73
  temp_path = Path(tmpdir) / new_name
74
  temp_path.write_bytes(contents)
 
68
  if file is not None:
69
  with tempfile.TemporaryDirectory() as tmpdir:
70
  contents = await file.read()
71
+ guessed_ext = mimetypes.guess_extension(file.content_type) or ".tmp"
72
  new_name = f"{Path(file.filename).stem}{guessed_ext}"
73
  temp_path = Path(tmpdir) / new_name
74
  temp_path.write_bytes(contents)
docsifer/service.py CHANGED
@@ -4,7 +4,8 @@ from __future__ import annotations
4
 
5
  import logging
6
  import tempfile
7
- import filetype
 
8
  from pathlib import Path
9
  from typing import Optional, Dict, Tuple, Any
10
 
@@ -109,13 +110,14 @@ class DocsiferService:
109
 
110
  # Use a temp directory so MarkItDown sees the real file extension
111
  with tempfile.TemporaryDirectory() as tmpdir:
112
- kind = filetype.guess(str(src))
113
- if kind is None:
114
  logger.warning(f"Could not detect file type for: {src}")
115
  new_filename = src.name
116
  else:
117
- logger.debug(f"Detected file type '{kind.extension}' for: {src}")
118
- new_filename = f"{src.stem}.{kind.extension}"
 
119
  tmp_path = Path(tmpdir) / new_filename
120
  tmp_path.write_bytes(src.read_bytes())
121
 
 
4
 
5
  import logging
6
  import tempfile
7
+ import magic
8
+ import mimetypes
9
  from pathlib import Path
10
  from typing import Optional, Dict, Tuple, Any
11
 
 
110
 
111
  # Use a temp directory so MarkItDown sees the real file extension
112
  with tempfile.TemporaryDirectory() as tmpdir:
113
+ mime_type = magic.from_file(str(src), mime=True)
114
+ if not mime_type:
115
  logger.warning(f"Could not detect file type for: {src}")
116
  new_filename = src.name
117
  else:
118
+ logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
119
+ guessed_ext = mimetypes.guess_extension(mime_type) or ""
120
+ new_filename = f"{src.stem}{guessed_ext}"
121
  tmp_path = Path(tmpdir) / new_filename
122
  tmp_path.write_bytes(src.read_bytes())
123
 
requirements.txt CHANGED
@@ -11,4 +11,4 @@ pyquery==2.0.1
11
  tiktoken==0.8.0
12
  scuid
13
  aiohttp==3.11.11
14
- filetype==1.2.0
 
11
  tiktoken==0.8.0
12
  scuid
13
  aiohttp==3.11.11
14
+ python-magic==0.4.27