lamhieu committed on
Commit
9f2ec30
·
1 Parent(s): dbb74f2

chore: update something

Browse files
Files changed (3) hide show
  1. docsifer/__init__.py +32 -8
  2. docsifer/router.py +8 -0
  3. docsifer/service.py +26 -4
docsifer/__init__.py CHANGED
@@ -101,6 +101,7 @@ def call_convert_api(
101
  openai_base_url: Optional[str] = None,
102
  openai_api_key: Optional[str] = None,
103
  openai_model: Optional[str] = None,
 
104
  ) -> Tuple[str, str]:
105
  """
106
  Call the /v1/convert endpoint, returning (markdown_content, md_file_path).
@@ -115,6 +116,7 @@ def call_convert_api(
115
  openai_base_url (str, optional): Base URL for OpenAI or compatible LLM.
116
  openai_api_key (str, optional): API key for the LLM.
117
  openai_model (str, optional): Model name to use for LLM-based extraction.
 
118
 
119
  Returns:
120
  (str, str):
@@ -143,6 +145,14 @@ def call_convert_api(
143
  if len(openai_dict) <= 3:
144
  data.pop("openai")
145
 
 
 
 
 
 
 
 
 
146
  # Decide if we're sending a file or a URL
147
  files = {}
148
  if file_obj:
@@ -220,7 +230,7 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
220
  all_models = set()
221
  for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
222
  period_dict = bucket.get(period_key, {})
223
- all_models.update(period_dict.keys()) # typically just "docsifer"
224
 
225
  result_dict = {
226
  "Model": [],
@@ -251,7 +261,7 @@ def create_main_interface():
251
  Create a Gradio Blocks interface that includes:
252
  1) 'Conversion Playground' Tab:
253
  - File upload OR URL-based conversion
254
- - Optional OpenAI configuration
255
  - Convert button
256
  - Display of conversion result as Markdown
257
  - Downloadable .md file
@@ -317,6 +327,17 @@ def create_main_interface():
317
  value="gpt-4o-mini",
318
  )
319
 
 
 
 
 
 
 
 
 
 
 
 
320
  with gr.Accordion("Conversion Settings", open=True):
321
  gr.Markdown(
322
  "Enable to remove <style> tags or hidden elements "
@@ -371,11 +392,12 @@ def create_main_interface():
371
  )
372
 
373
  # Callback function triggered by convert_btn.click
374
- def on_convert(file_bytes, url_str, base_url, api_key, model_id, cleanup):
 
 
375
  """
376
- Converts the uploaded file or a URL to Markdown by calling the Docsifer
377
- API. Returns the resulting Markdown content and path to the
378
- temporary .md file for download.
379
 
380
  Args:
381
  file_bytes (bytes): The raw file content (None if not uploaded).
@@ -384,20 +406,20 @@ def create_main_interface():
384
  api_key (str): The API key for the LLM.
385
  model_id (str): The model to use for the LLM.
386
  cleanup (bool): Whether to enable cleanup on HTML files.
 
387
 
388
  Returns:
389
  (str, str):
390
  - The Markdown content or error message.
391
  - The path to the temp .md file for download.
392
  """
393
- # If file is not provided, we attempt the URL approach
394
  if not file_bytes and not url_str:
395
  return "❌ Please upload a file or provide a URL.", None
396
 
397
  # Create a unique temporary filename if file is present
398
  unique_name = f"{scuid()}.tmp" if file_bytes else ""
399
 
400
- # Call the convert API
401
  markdown, temp_md_path = call_convert_api(
402
  file_obj=file_bytes,
403
  filename=unique_name,
@@ -406,6 +428,7 @@ def create_main_interface():
406
  openai_api_key=api_key,
407
  openai_model=model_id,
408
  cleanup=cleanup,
 
409
  )
410
 
411
  return markdown, temp_md_path
@@ -420,6 +443,7 @@ def create_main_interface():
420
  openai_api_key,
421
  openai_model,
422
  cleanup_toggle,
 
423
  ],
424
  outputs=[output_md, download_file],
425
  )
 
101
  openai_base_url: Optional[str] = None,
102
  openai_api_key: Optional[str] = None,
103
  openai_model: Optional[str] = None,
104
+ http_cookies: Optional[str] = None,
105
  ) -> Tuple[str, str]:
106
  """
107
  Call the /v1/convert endpoint, returning (markdown_content, md_file_path).
 
116
  openai_base_url (str, optional): Base URL for OpenAI or compatible LLM.
117
  openai_api_key (str, optional): API key for the LLM.
118
  openai_model (str, optional): Model name to use for LLM-based extraction.
119
+ http_cookies (str, optional): JSON-formatted string representing cookies for HTTP requests.
120
 
121
  Returns:
122
  (str, str):
 
145
  if len(openai_dict) <= 3:
146
  data.pop("openai")
147
 
148
+ # Build the HTTP configuration object
149
+ if http_cookies and http_cookies.strip():
150
+ try:
151
+ cookies_obj = json.loads(http_cookies)
152
+ except Exception as e:
153
+ return (f"❌ Invalid JSON for HTTP Cookies: {str(e)}", "")
154
+ data["http"] = json.dumps({"cookies": cookies_obj})
155
+
156
  # Decide if we're sending a file or a URL
157
  files = {}
158
  if file_obj:
 
230
  all_models = set()
231
  for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
232
  period_dict = bucket.get(period_key, {})
233
+ all_models.update(period_dict.keys())
234
 
235
  result_dict = {
236
  "Model": [],
 
261
  Create a Gradio Blocks interface that includes:
262
  1) 'Conversion Playground' Tab:
263
  - File upload OR URL-based conversion
264
+ - Optional OpenAI configuration and HTTP configuration
265
  - Convert button
266
  - Display of conversion result as Markdown
267
  - Downloadable .md file
 
327
  value="gpt-4o-mini",
328
  )
329
 
330
+ with gr.Accordion("HTTP Configuration (Optional)", open=False):
331
+ gr.Markdown(
332
+ "Provide additional HTTP configuration. "
333
+ "In particular, you can specify cookies as a JSON object to be included in the request."
334
+ )
335
+ http_cookies = gr.Textbox(
336
+ label="Cookies",
337
+ placeholder='e.g. {"session": "abcd1234"}',
338
+ lines=3,
339
+ )
340
+
341
  with gr.Accordion("Conversion Settings", open=True):
342
  gr.Markdown(
343
  "Enable to remove <style> tags or hidden elements "
 
392
  )
393
 
394
  # Callback function triggered by convert_btn.click
395
+ def on_convert(
396
+ file_bytes, url_str, base_url, api_key, model_id, cleanup, http_cookies
397
+ ):
398
  """
399
+ Converts the uploaded file or a URL to Markdown by calling the Docsifer API.
400
+ Returns the resulting Markdown content and path to the temporary .md file for download.
 
401
 
402
  Args:
403
  file_bytes (bytes): The raw file content (None if not uploaded).
 
406
  api_key (str): The API key for the LLM.
407
  model_id (str): The model to use for the LLM.
408
  cleanup (bool): Whether to enable cleanup on HTML files.
409
+ http_cookies (str): JSON-formatted string for HTTP cookies.
410
 
411
  Returns:
412
  (str, str):
413
  - The Markdown content or error message.
414
  - The path to the temp .md file for download.
415
  """
 
416
  if not file_bytes and not url_str:
417
  return "❌ Please upload a file or provide a URL.", None
418
 
419
  # Create a unique temporary filename if file is present
420
  unique_name = f"{scuid()}.tmp" if file_bytes else ""
421
 
422
+ # Call the convert API with HTTP configuration
423
  markdown, temp_md_path = call_convert_api(
424
  file_obj=file_bytes,
425
  filename=unique_name,
 
428
  openai_api_key=api_key,
429
  openai_model=model_id,
430
  cleanup=cleanup,
431
+ http_cookies=http_cookies,
432
  )
433
 
434
  return markdown, temp_md_path
 
443
  openai_api_key,
444
  openai_model,
445
  cleanup_toggle,
446
+ http_cookies,
447
  ],
448
  outputs=[output_md, download_file],
449
  )
docsifer/router.py CHANGED
@@ -39,6 +39,7 @@ async def convert_document(
39
  None, description="URL to convert (used only if no file is provided)"
40
  ),
41
  openai: str = Form("{}", description="OpenAI config as a JSON object"),
 
42
  settings: str = Form("{}", description="Settings as a JSON object"),
43
  ):
44
  """
@@ -55,6 +56,11 @@ async def convert_document(
55
  except json.JSONDecodeError:
56
  raise ValueError("Invalid JSON in 'openai' parameter.")
57
 
 
 
 
 
 
58
  try:
59
  settings_config = json.loads(settings) if settings else {}
60
  except json.JSONDecodeError:
@@ -71,6 +77,7 @@ async def convert_document(
71
  result, token_count = await docsifer_service.convert_file(
72
  source=str(temp_path),
73
  openai_config=openai_config,
 
74
  cleanup=cleanup,
75
  )
76
  elif url:
@@ -90,6 +97,7 @@ async def convert_document(
90
  result, token_count = await docsifer_service.convert_file(
91
  source=str(url),
92
  openai_config=openai_config,
 
93
  cleanup=cleanup,
94
  )
95
  else:
 
39
  None, description="URL to convert (used only if no file is provided)"
40
  ),
41
  openai: str = Form("{}", description="OpenAI config as a JSON object"),
42
+ http: str = Form("{}", description="HTTP config as a JSON object"),
43
  settings: str = Form("{}", description="Settings as a JSON object"),
44
  ):
45
  """
 
56
  except json.JSONDecodeError:
57
  raise ValueError("Invalid JSON in 'openai' parameter.")
58
 
59
+ try:
60
+ http_config = json.loads(http) if http else {}
61
+ except json.JSONDecodeError:
62
+ raise ValueError("Invalid JSON in 'http' parameter.")
63
+
64
  try:
65
  settings_config = json.loads(settings) if settings else {}
66
  except json.JSONDecodeError:
 
77
  result, token_count = await docsifer_service.convert_file(
78
  source=str(temp_path),
79
  openai_config=openai_config,
80
+ http_config=http_config,
81
  cleanup=cleanup,
82
  )
83
  elif url:
 
97
  result, token_count = await docsifer_service.convert_file(
98
  source=str(url),
99
  openai_config=openai_config,
100
+ http_config=http_config,
101
  cleanup=cleanup,
102
  )
103
  else:
docsifer/service.py CHANGED
@@ -3,8 +3,11 @@ from __future__ import annotations
3
  import asyncio
4
  import logging
5
  import tempfile
 
 
6
  import magic
7
  import mimetypes
 
8
  from pathlib import Path
9
  from typing import Optional, Dict, Tuple, Any
10
  from scuid import scuid
@@ -107,7 +110,11 @@ class DocsiferService:
107
  return len(text.split())
108
 
109
  def _convert_sync(
110
- self, source: str, openai_config: Optional[dict] = None, cleanup: bool = True
 
 
 
 
111
  ) -> Tuple[Dict[str, str], int]:
112
  """
113
  Synchronously convert a file at `file_path` to Markdown.
@@ -117,6 +124,7 @@ class DocsiferService:
117
  Args:
118
  source: Path to the source file or URL to fetch content from.
119
  openai_config: Optional dictionary with OpenAI configuration.
 
120
  cleanup: Whether to perform HTML cleanup if the file is an HTML file.
121
 
122
  Returns:
@@ -164,12 +172,21 @@ class DocsiferService:
164
  else:
165
  md_converter = self._basic_markitdown
166
 
 
 
 
 
 
 
 
 
 
167
  try:
168
  result_obj = md_converter.convert(source)
169
  except Exception as e:
170
  logger.error("MarkItDown conversion failed: %s", e)
171
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
172
-
173
  if isinstance(source, Path) and source.exists():
174
  source.unlink()
175
 
@@ -183,7 +200,11 @@ class DocsiferService:
183
  return result_dict, token_count
184
 
185
  async def convert_file(
186
- self, source: str, openai_config: Optional[dict] = None, cleanup: bool = True
 
 
 
 
187
  ) -> Tuple[Dict[str, str], int]:
188
  """
189
  Asynchronously convert a file at `source` to Markdown.
@@ -192,6 +213,7 @@ class DocsiferService:
192
  Args:
193
  source: Path to the file to convert or a URL to fetch content from.
194
  openai_config: Optional OpenAI configuration dictionary.
 
195
  cleanup: Whether to perform HTML cleanup if applicable.
196
 
197
  Returns:
@@ -199,5 +221,5 @@ class DocsiferService:
199
  and the token count.
200
  """
201
  return await asyncio.to_thread(
202
- self._convert_sync, source, openai_config, cleanup
203
  )
 
3
  import asyncio
4
  import logging
5
  import tempfile
6
+
7
+ import requests.cookies
8
  import magic
9
  import mimetypes
10
+ import requests
11
  from pathlib import Path
12
  from typing import Optional, Dict, Tuple, Any
13
  from scuid import scuid
 
110
  return len(text.split())
111
 
112
  def _convert_sync(
113
+ self,
114
+ source: str,
115
+ openai_config: Optional[dict] = None,
116
+ http_config: Optional[dict] = None,
117
+ cleanup: bool = True,
118
  ) -> Tuple[Dict[str, str], int]:
119
  """
120
  Synchronously convert a file at `file_path` to Markdown.
 
124
  Args:
125
  source: Path to the source file or URL to fetch content from.
126
  openai_config: Optional dictionary with OpenAI configuration.
127
+ http_config: Optional dictionary with HTTP configuration.
128
  cleanup: Whether to perform HTML cleanup if the file is an HTML file.
129
 
130
  Returns:
 
172
  else:
173
  md_converter = self._basic_markitdown
174
 
175
+ # Load cookies if provided in the HTTP config.
176
+ if http_config:
177
+ if "cookies" in http_config:
178
+ requests.cookies.cookiejar_from_dict(
179
+ http_config["cookies"],
180
+ requests.cookies.RequestsCookieJar,
181
+ overwrite=True,
182
+ )
183
+
184
  try:
185
  result_obj = md_converter.convert(source)
186
  except Exception as e:
187
  logger.error("MarkItDown conversion failed: %s", e)
188
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
189
+
190
  if isinstance(source, Path) and source.exists():
191
  source.unlink()
192
 
 
200
  return result_dict, token_count
201
 
202
  async def convert_file(
203
+ self,
204
+ source: str,
205
+ openai_config: Optional[dict] = None,
206
+ http_config: Optional[dict] = None,
207
+ cleanup: bool = True,
208
  ) -> Tuple[Dict[str, str], int]:
209
  """
210
  Asynchronously convert a file at `source` to Markdown.
 
213
  Args:
214
  source: Path to the file to convert or a URL to fetch content from.
215
  openai_config: Optional OpenAI configuration dictionary.
216
+ http_config: Optional HTTP configuration dictionary.
217
  cleanup: Whether to perform HTML cleanup if applicable.
218
 
219
  Returns:
 
221
  and the token count.
222
  """
223
  return await asyncio.to_thread(
224
+ self._convert_sync, source, openai_config, http_config, cleanup
225
  )