lamhieu committed on
Commit
86b0060
·
1 Parent(s): 25ddcaa

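chore: re-enable temp-file cleanup, HTML cleanup, and cookie loading; remove debug print; switch markitdown to a pinned git fork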

Browse files
Files changed (3) hide show
  1. docsifer/service.py +13 -13
  2. poetry.lock +0 -0
  3. requirements.txt +1 -1
docsifer/service.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
 
3
  import asyncio
4
  import logging
 
5
  # import tempfile
6
 
7
  import requests.cookies
@@ -149,7 +150,7 @@ class DocsiferService:
149
  new_filename = f"{src.stem}{guessed_ext}"
150
  tmp_path = src.parent / new_filename
151
  tmp_path.write_bytes(src.read_bytes())
152
- # src.unlink()
153
 
154
  logger.info(
155
  "Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
@@ -160,8 +161,8 @@ class DocsiferService:
160
  )
161
 
162
  # Perform HTML cleanup if requested.
163
- # if cleanup and guessed_ext.lower() in (".html", ".htm"):
164
- # self._maybe_cleanup_html(tmp_path)
165
 
166
  filename = new_filename
167
  source = tmp_path
@@ -173,23 +174,22 @@ class DocsiferService:
173
  md_converter = self._basic_markitdown
174
 
175
  # Load cookies if provided in the HTTP config.
176
- # if http_config:
177
- # if "cookies" in http_config:
178
- # requests.cookies.cookiejar_from_dict(
179
- # http_config["cookies"],
180
- # requests.cookies.RequestsCookieJar,
181
- # overwrite=True,
182
- # )
183
 
184
  try:
185
  result_obj = md_converter.convert(source)
186
- print("result_obj:\n", result_obj)
187
  except Exception as e:
188
  logger.error("MarkItDown conversion failed: %s", e)
189
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
190
 
191
- # if isinstance(source, Path) and source.exists():
192
- # source.unlink()
193
 
194
  # Count tokens in the resulting markdown text.
195
  token_count = self._count_tokens(result_obj.text_content)
 
2
 
3
  import asyncio
4
  import logging
5
+
6
  # import tempfile
7
 
8
  import requests.cookies
 
150
  new_filename = f"{src.stem}{guessed_ext}"
151
  tmp_path = src.parent / new_filename
152
  tmp_path.write_bytes(src.read_bytes())
153
+ src.unlink()
154
 
155
  logger.info(
156
  "Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
 
161
  )
162
 
163
  # Perform HTML cleanup if requested.
164
+ if cleanup and guessed_ext.lower() in (".html", ".htm"):
165
+ self._maybe_cleanup_html(tmp_path)
166
 
167
  filename = new_filename
168
  source = tmp_path
 
174
  md_converter = self._basic_markitdown
175
 
176
  # Load cookies if provided in the HTTP config.
177
+ if http_config:
178
+ if "cookies" in http_config:
179
+ requests.cookies.cookiejar_from_dict(
180
+ http_config["cookies"],
181
+ requests.cookies.RequestsCookieJar,
182
+ overwrite=True,
183
+ )
184
 
185
  try:
186
  result_obj = md_converter.convert(source)
 
187
  except Exception as e:
188
  logger.error("MarkItDown conversion failed: %s", e)
189
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
190
 
191
+ if isinstance(source, Path) and source.exists():
192
+ source.unlink()
193
 
194
  # Count tokens in the resulting markdown text.
195
  token_count = self._count_tokens(result_obj.text_content)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -5,7 +5,7 @@ requests
5
  pydantic
6
  cachetools
7
  scuid
8
- markitdown==0.0.1a3
9
  upstash_redis==1.2.0
10
  openai==1.59.7
11
  pyquery==2.0.1
 
5
  pydantic
6
  cachetools
7
  scuid
8
+ markitdown @ git+https://github.com/lh0x00/markitdown@c5e3ab4
9
  upstash_redis==1.2.0
10
  openai==1.59.7
11
  pyquery==2.0.1