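chore: rework temp-file handling in DocsiferService (rename by detected MIME type, drop debug print, delete temp file after conversion)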
Browse files
- docsifer/service.py +27 -29
docsifer/service.py
CHANGED
@@ -131,37 +131,32 @@ class DocsiferService:
|
|
131 |
|
132 |
logger.info("Converting file: %s (cleanup=%s)", source, cleanup)
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
)
|
154 |
-
|
155 |
-
# Perform HTML cleanup if requested.
|
156 |
-
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
157 |
-
self._maybe_cleanup_html(tmp_path)
|
158 |
|
159 |
-
|
160 |
-
|
|
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
print(f"Filename: {filename}, Source: {source}, Content: {xxx}")
|
165 |
|
166 |
# Decide whether to use LLM-enhanced conversion or the basic converter.
|
167 |
if openai_config and openai_config.get("api_key"):
|
@@ -174,6 +169,9 @@ class DocsiferService:
|
|
174 |
except Exception as e:
|
175 |
logger.error("MarkItDown conversion failed: %s", e)
|
176 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
|
|
|
|
|
|
177 |
|
178 |
# Count tokens in the resulting markdown text.
|
179 |
token_count = self._count_tokens(result_obj.text_content)
|
|
|
131 |
|
132 |
logger.info("Converting file: %s (cleanup=%s)", source, cleanup)
|
133 |
|
134 |
+
mime_type = magic.from_file(str(src), mime=True)
|
135 |
+
guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
|
136 |
+
if not mime_type:
|
137 |
+
logger.warning(f"Could not detect file type for: {src}")
|
138 |
+
new_filename = src.name
|
139 |
+
else:
|
140 |
+
logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
|
141 |
+
new_filename = f"{src.stem}{guessed_ext}"
|
142 |
+
tmp_path = src.parent / new_filename
|
143 |
+
tmp_path.write_bytes(src.read_bytes())
|
144 |
+
src.unlink()
|
145 |
+
|
146 |
+
logger.info(
|
147 |
+
"Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
|
148 |
+
tmp_path,
|
149 |
+
mime_type,
|
150 |
+
guessed_ext,
|
151 |
+
tmp_path.exists(),
|
152 |
+
)
|
|
|
|
|
|
|
|
|
|
|
153 |
|
154 |
+
# Perform HTML cleanup if requested.
|
155 |
+
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
156 |
+
self._maybe_cleanup_html(tmp_path)
|
157 |
|
158 |
+
filename = new_filename
|
159 |
+
source = tmp_path
|
|
|
160 |
|
161 |
# Decide whether to use LLM-enhanced conversion or the basic converter.
|
162 |
if openai_config and openai_config.get("api_key"):
|
|
|
169 |
except Exception as e:
|
170 |
logger.error("MarkItDown conversion failed: %s", e)
|
171 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
172 |
+
|
173 |
+
if isinstance(source, Path) and source.exists():
|
174 |
+
source.unlink()
|
175 |
|
176 |
# Count tokens in the resulting markdown text.
|
177 |
token_count = self._count_tokens(result_obj.text_content)
|