lamhieu committed on
Commit
dbb74f2
·
1 Parent(s): 4af30b6

chore: update something

Browse files
Files changed (1) hide show
  1. docsifer/service.py +27 -29
docsifer/service.py CHANGED
@@ -131,37 +131,32 @@ class DocsiferService:
131
 
132
  logger.info("Converting file: %s (cleanup=%s)", source, cleanup)
133
 
134
- # Create a temporary directory so that MarkItDown sees the proper file extension.
135
- with tempfile.TemporaryDirectory() as tmpdir:
136
- mime_type = magic.from_file(str(src), mime=True)
137
- guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
138
- if not mime_type:
139
- logger.warning(f"Could not detect file type for: {src}")
140
- new_filename = src.name
141
- else:
142
- logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
143
- new_filename = f"{src.stem}{guessed_ext}"
144
- tmp_path = Path(tmpdir) / new_filename
145
- tmp_path.write_bytes(src.read_bytes())
146
-
147
- logger.info(
148
- "Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
149
- tmp_path,
150
- mime_type,
151
- guessed_ext,
152
- tmp_path.exists(),
153
- )
154
-
155
- # Perform HTML cleanup if requested.
156
- if cleanup and guessed_ext.lower() in (".html", ".htm"):
157
- self._maybe_cleanup_html(tmp_path)
158
 
159
- filename = new_filename
160
- source = str(tmp_path)
 
161
 
162
- with open(source) as f:
163
- xxx = f.read()
164
- print(f"Filename: {filename}, Source: {source}, Content: {xxx}")
165
 
166
  # Decide whether to use LLM-enhanced conversion or the basic converter.
167
  if openai_config and openai_config.get("api_key"):
@@ -174,6 +169,9 @@ class DocsiferService:
174
  except Exception as e:
175
  logger.error("MarkItDown conversion failed: %s", e)
176
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
 
 
 
177
 
178
  # Count tokens in the resulting markdown text.
179
  token_count = self._count_tokens(result_obj.text_content)
 
131
 
132
  logger.info("Converting file: %s (cleanup=%s)", source, cleanup)
133
 
134
+ mime_type = magic.from_file(str(src), mime=True)
135
+ guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
136
+ if not mime_type:
137
+ logger.warning(f"Could not detect file type for: {src}")
138
+ new_filename = src.name
139
+ else:
140
+ logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
141
+ new_filename = f"{src.stem}{guessed_ext}"
142
+ tmp_path = src.parent / new_filename
143
+ tmp_path.write_bytes(src.read_bytes())
144
+ src.unlink()
145
+
146
+ logger.info(
147
+ "Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
148
+ tmp_path,
149
+ mime_type,
150
+ guessed_ext,
151
+ tmp_path.exists(),
152
+ )
 
 
 
 
 
153
 
154
+ # Perform HTML cleanup if requested.
155
+ if cleanup and guessed_ext.lower() in (".html", ".htm"):
156
+ self._maybe_cleanup_html(tmp_path)
157
 
158
+ filename = new_filename
159
+ source = tmp_path
 
160
 
161
  # Decide whether to use LLM-enhanced conversion or the basic converter.
162
  if openai_config and openai_config.get("api_key"):
 
169
  except Exception as e:
170
  logger.error("MarkItDown conversion failed: %s", e)
171
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
172
+
173
+ if isinstance(source, Path) and source.exists():
174
+ source.unlink()
175
 
176
  # Count tokens in the resulting markdown text.
177
  token_count = self._count_tokens(result_obj.text_content)