lamhieu committed
Commit 6e481d9 · 1 Parent(s): 772800b

feat: support convert from url and more

Files changed (3)
  1. docsifer/__init__.py +135 -53
  2. docsifer/router.py +42 -15
  3. requirements.txt +1 -0
docsifer/__init__.py CHANGED

@@ -17,8 +17,12 @@ from pathlib import Path
 from scuid import scuid
 
 
-# Filter out /v1 requests from the access log
 class LogFilter(logging.Filter):
+    """
+    A custom logging filter that only keeps log records containing '/v1'
+    in the request path. This helps to filter out other logs and reduce noise.
+    """
+
     def filter(self, record):
         # Only keep log records that contain "/v1" in the request path
         if record.args and len(record.args) >= 3:
@@ -30,7 +34,6 @@ class LogFilter(logging.Filter):
 logger = logging.getLogger("uvicorn.access")
 logger.addFilter(LogFilter())
 
-# Application metadata
 __version__ = "1.0.0"
 __author__ = "lamhieu"
 __description__ = "Docsifer: Efficient Data Conversion to Markdown."
@@ -46,11 +49,10 @@ __metadata__ = {
     "spaces": "https://huggingface.co/spaces/lh0x00/docsifer",
 }
 
-# Update your Docsifer API endpoints (you can replace with your HF Space or other URL)
+# Docsifer API Endpoints (can be replaced with your live URLs if desired)
 DOCSIFER_API_URL = "http://localhost:7860/v1/convert"
 DOCSIFER_STATS_URL = "http://localhost:7860/v1/stats"
 
-# Markdown description for the main interface
 APP_DESCRIPTION = f"""
 # 📝 **Docsifer: Convert Your Documents to Markdown**
 
@@ -60,7 +62,7 @@ Welcome to **Docsifer**, a specialized service that converts your files—like P
 
 - **Open Source**: The entire Docsifer codebase is publicly available for review and contribution.
 - **Efficient & Flexible**: Supports multiple file formats, ensuring quick and accurate Markdown conversion.
-- **Privacy-Focused**: We never store user data; all processing is ephemeral. We only collect minimal anonymous usage stats for service improvement.
+- **Privacy-Focused**: We never store user data; all processing is temporary. We only collect minimal anonymous usage statistics to count the number of calls and the number of tokens, nothing else.
 - **Production-Ready**: Easy Docker deployment, interactive Gradio playground, and comprehensive REST API documentation.
 - **Community & Collaboration**: Contribute on [GitHub]({__metadata__["github"]}) or try it out on [Hugging Face Spaces]({__metadata__["spaces"]}).
 
@@ -68,7 +70,6 @@ Welcome to **Docsifer**, a specialized service that converts your files—like P
 - [Documentation]({__metadata__["docs"]}) | [GitHub]({__metadata__["github"]}) | [Live Demo]({__metadata__["spaces"]})
 """
 
-# Initialize FastAPI application
 app = FastAPI(
     title="Docsifer Service API",
     description=__description__,
@@ -77,7 +78,7 @@ app = FastAPI(
     redoc_url="/redoc",
 )
 
-# Configure CORS
+# Configure CORS (Cross-Origin Resource Sharing)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],  # Adjust if needed for specific domains
@@ -86,34 +87,40 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Import and include your existing router (which has /v1/convert, /v1/stats, etc.)
+# Import and include your existing router (with /v1 endpoints)
 from .router import router
 
 app.include_router(router, prefix="/v1")
 
 
 def call_convert_api(
-    file_obj: bytes,
-    filename: str,
+    file_obj: Optional[bytes],
+    filename: str = "",
+    url: Optional[str] = None,
     cleanup: bool = True,
     openai_base_url: Optional[str] = None,
     openai_api_key: Optional[str] = None,
     openai_model: Optional[str] = None,
 ) -> Tuple[str, str]:
     """
-    Calls the /v1/convert endpoint, returning (markdown_content, md_file_path).
-    If there's an error, the first return value is an error message (str),
-    the second is an empty string.
-
-    The updated /v1/convert expects:
-    - file (UploadFile)
-    - openai (object, e.g. {"api_key":"...","base_url":"..."})
-    - settings (object, e.g. {"cleanup": true})
+    Call the /v1/convert endpoint, returning (markdown_content, md_file_path).
+    - If there's an error, the first return value is an error message (str),
+      the second is an empty string.
+
+    Args:
+        file_obj (Optional[bytes]): The raw file bytes to be sent. If None, 'url' is used.
+        filename (str): Name of the file (will be posted to the endpoint).
+        url (str, optional): URL to be converted (used only if file_obj is None).
+        cleanup (bool): Whether to enable cleanup mode for HTML files.
+        openai_base_url (str, optional): Base URL for OpenAI or compatible LLM.
+        openai_api_key (str, optional): API key for the LLM.
+        openai_model (str, optional): Model name to use for LLM-based extraction.
+
+    Returns:
+        (str, str):
+            - markdown_content (str): The conversion result in Markdown form or an error message.
+            - tmp_md_path (str): The path to the temporary .md file for download.
     """
-
-    if file_obj is None:
-        return ("❌ No file was uploaded.", "")
-
     # Build the "openai" object
     openai_dict = {}
     if openai_api_key and openai_api_key.strip():
@@ -127,17 +134,27 @@ def call_convert_api(
     settings_dict = {"cleanup": cleanup}
 
     data = {
-        # These must match the `Form(...)` fields named "openai" and "settings"
+        # Must match the `Form(...)` fields named "openai" and "settings"
        "openai": json.dumps(openai_dict),
        "settings": json.dumps(settings_dict),
     }
 
+    # If the user left the OpenAI fields blank, remove the `openai` key from data
     if len(openai_dict) <= 3:
         data.pop("openai")
 
-    # Prepare files for multipart/form-data
-    files = {"file": (filename, file_obj)}
-
+    # Decide if we're sending a file or a URL
+    files = {}
+    if file_obj:
+        # If file is provided, it takes priority
+        files = {"file": (filename, file_obj)}
+        data["url"] = ""  # ensure 'url' is empty on the form
+    elif url and url.strip():
+        data["url"] = url.strip()
+    else:
+        return ("❌ Please upload a file or provide a URL.", "")
+
+    # Perform the POST request
     try:
         response = requests.post(DOCSIFER_API_URL, files=files, data=data, timeout=30)
     except requests.exceptions.RequestException as e:
@@ -146,14 +163,15 @@
     if response.status_code != 200:
         return (f"❌ API Error {response.status_code}: {response.text}", "")
 
+    # Parse the API response
     try:
         converted = response.json()
-        # Expecting { "filename": "...", "markdown": "..." }
+        # Expected structure: { "filename": "...", "markdown": "..." }
         markdown_content = converted["markdown"]
     except Exception as e:
         return (f"❌ Error parsing JSON: {str(e)}", "")
 
-    # Write the returned Markdown to a temporary .md file so Gradio can serve it
+    # Write the returned Markdown to a temp .md file
     with tempfile.NamedTemporaryFile(
         mode="w+", suffix=".md", dir="/tmp", delete=False
     ) as tmp_file:
@@ -165,8 +183,17 @@
 
 def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Calls /v1/stats endpoint to retrieve analytics data.
-    Returns two DataFrames: (access_df, tokens_df).
+    Call /v1/stats endpoint to retrieve analytics data and return two DataFrames:
+      - access_df: Access statistics
+      - tokens_df: Token usage statistics
+
+    Raises:
+        ValueError: If the stats endpoint fails or returns invalid data.
+
+    Returns:
+        Tuple[pd.DataFrame, pd.DataFrame]:
+            (access_df, tokens_df) with columns ["Model", "Total", "Daily",
+            "Weekly", "Monthly", "Yearly"].
     """
     try:
         response = requests.get(DOCSIFER_STATS_URL, timeout=10)
@@ -186,8 +213,10 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
     tokens_data = data.get("tokens", {})
 
     def build_stats_df(bucket: dict) -> pd.DataFrame:
-        # We want columns for periods: total, daily, weekly, monthly, yearly
-        # Each row => "docsifer" (just 1 row if everything is aggregated)
+        """
+        Helper function to transform a nested dictionary (by period, by model)
+        into a tabular pandas DataFrame.
+        """
         all_models = set()
         for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
             period_dict = bucket.get(period_key, {})
@@ -219,21 +248,31 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
 
 def create_main_interface():
     """
-    Creates a Gradio Blocks interface:
-    - A 'Conversion Playground' tab for uploading a file and converting to Markdown
-    - An 'Analytics Stats' section to display usage statistics
-    - cURL examples for reference
+    Create a Gradio Blocks interface that includes:
+      1) 'Conversion Playground' Tab:
+         - File upload OR URL-based conversion
+         - Optional OpenAI configuration
+         - Convert button
+         - Display of conversion result as Markdown
+         - Downloadable .md file
+      2) 'Analytics Stats' Tab:
+         - Button to fetch usage statistics
+         - DataFrames for Access Stats and Token Stats
+
+    Returns:
+        Gradio Blocks instance that can be mounted into the FastAPI app.
     """
     with gr.Blocks(title="Docsifer: Convert to Markdown", theme="default") as demo:
         gr.Markdown(APP_DESCRIPTION)
 
         with gr.Tab("Conversion Playground"):
-            gr.Markdown("### Convert your files to Markdown with Docsifer.")
+            gr.Markdown("### Convert your files or a URL to Markdown with Docsifer.")
 
             with gr.Row():
+                # Left Column: File Upload, URL Input, Settings, Button
                 with gr.Column():
                     file_input = gr.File(
-                        label="Upload File",
+                        label="Upload File (optional)",
                         file_types=[
                             ".pdf",
                             ".docx",
@@ -251,6 +290,11 @@ def create_main_interface():
                         type="binary",
                     )
 
+                    url_input = gr.Textbox(
+                        label="URL (optional)",
+                        placeholder="Enter a URL if no file is uploaded",
+                    )
+
                    with gr.Accordion("OpenAI Configuration (Optional)", open=False):
                        gr.Markdown(
                            "Provide these if you'd like **LLM-assisted** extraction. "
@@ -275,7 +319,8 @@
 
                    with gr.Accordion("Conversion Settings", open=True):
                        gr.Markdown(
-                            "Enable to remove <style> tags or hidden elements from `.html` files before conversion."
+                            "Enable to remove <style> tags or hidden elements "
+                            "from `.html` files before conversion."
                        )
                        cleanup_toggle = gr.Checkbox(
                            label="Enable Cleanup",
@@ -284,13 +329,12 @@
 
                    convert_btn = gr.Button("Convert")
 
+                # Right Column: Conversion Result Display & Download
                 with gr.Column():
-                    output_md = gr.Textbox(
-                        label="Conversion Result (Markdown)",
-                        lines=20,
-                        interactive=False,
-                    )
-                    # Set visible=True so the user always sees a small download button
+                    # Display the result as Markdown
+                    output_md = gr.Markdown(label="Conversion Result (Markdown)")
+
+                    # The user can still download the .md file
                    download_file = gr.File(
                        label="Download",
                        interactive=False,
@@ -309,32 +353,64 @@
                -F "openai={\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
                -F "settings={\\"cleanup\\":true}"
                ```
+
+                **Convert from a URL (no file)**:
+                ```bash
+                curl -X POST \\
+                  "https://lamhieu-docsifer.hf.space/v1/convert" \\
+                  -F "url=https://example.com/page.html" \\
+                  -F "openai={\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
+                  -F "settings={\\"cleanup\\":true}"
+                ```
                """
            )
 
-            def on_convert(file_bytes, base_url, api_key, model_id, cleanup):
+            # Callback function triggered by convert_btn.click
+            def on_convert(file_bytes, url_str, base_url, api_key, model_id, cleanup):
                """
-                Callback for the 'Convert' button.
-                We generate a unique name if the user uploads a file.
+                Converts the uploaded file or a URL to Markdown by calling the Docsifer
+                API. Returns the resulting Markdown content and path to the
+                temporary .md file for download.
+
+                Args:
+                    file_bytes (bytes): The raw file content (None if not uploaded).
+                    url_str (str): The URL to convert (only used if file_bytes is None).
+                    base_url (str): The base URL for OpenAI or compatible LLM.
+                    api_key (str): The API key for the LLM.
+                    model_id (str): The model to use for the LLM.
+                    cleanup (bool): Whether to enable cleanup on HTML files.
+
+                Returns:
+                    (str, str):
+                        - The Markdown content or error message.
+                        - The path to the temp .md file for download.
                """
-                if not file_bytes:
-                    return "❌ Please upload a file first.", None
+                # If file is not provided, we attempt the URL approach
+                if not file_bytes and not url_str:
+                    return "❌ Please upload a file or provide a URL.", None
+
+                # Create a unique temporary filename if file is present
+                unique_name = f"{scuid()}.tmp" if file_bytes else ""
 
-                unique_name = f"{scuid()}.tmp"
+                # Call the convert API
                markdown, temp_md_path = call_convert_api(
                    file_obj=file_bytes,
                    filename=unique_name,
+                    url=url_str,
                    openai_base_url=base_url,
                    openai_api_key=api_key,
                    openai_model=model_id,
                    cleanup=cleanup,
                )
+
                return markdown, temp_md_path
 
+            # Link the on_convert function to the convert_btn
            convert_btn.click(
                fn=on_convert,
                inputs=[
                    file_input,
+                    url_input,
                    openai_base_url,
                    openai_api_key,
                    openai_model,
@@ -348,6 +424,7 @@ def create_main_interface():
                "View Docsifer usage statistics (access count, token usage, etc.)"
            )
            stats_btn = gr.Button("Get Stats")
+
            access_df = gr.DataFrame(
                label="Access Stats",
                headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
@@ -359,6 +436,7 @@
                interactive=False,
            )
 
+            # When the button is clicked, call_stats_api_df returns two dataframes
            stats_btn.click(
                fn=call_stats_api_df,
                inputs=[],
@@ -368,17 +446,21 @@
     return demo
 
 
-# Build our Gradio interface and mount it at the root path
 main_interface = create_main_interface()
 mount_gradio_app(app, main_interface, path="/")
 
 
-# Startup / Shutdown events
 @app.on_event("startup")
 async def startup_event():
+    """
+    Logs a startup message when the Docsifer Service is starting.
+    """
     logger.info("Docsifer Service is starting up...")
 
 
 @app.on_event("shutdown")
 async def shutdown_event():
+    """
+    Logs a shutdown message when the Docsifer Service is shutting down.
+    """
     logger.info("Docsifer Service is shutting down.")
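Taken together, the `call_convert_api` changes mean the playground now posts either a `file` part or a `url` form field (plus the same `openai`/`settings` JSON strings) to `/v1/convert`. Below is a minimal client-side sketch of both request shapes, not part of the commit itself: it assumes a local instance at the default `DOCSIFER_API_URL`, and the file name and page URL are placeholders.

```python
# Hedged sketch only: "report.pdf" and the example URL are placeholders,
# and a Docsifer instance is assumed to be listening on localhost:7860.
import json
import requests

API_URL = "http://localhost:7860/v1/convert"  # default DOCSIFER_API_URL
settings = {"cleanup": True}

# 1) File upload: the "file" part takes priority and "url" is sent empty.
with open("report.pdf", "rb") as f:
    resp = requests.post(
        API_URL,
        files={"file": ("report.pdf", f.read())},
        data={"settings": json.dumps(settings), "url": ""},
        timeout=30,
    )
print(resp.json()["markdown"][:200])

# 2) URL conversion: no file part, only the "url" form field.
resp = requests.post(
    API_URL,
    data={"settings": json.dumps(settings), "url": "https://example.com/page.html"},
    timeout=30,
)
print(resp.json()["markdown"][:200])
```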
docsifer/router.py CHANGED

@@ -4,10 +4,12 @@ import logging
 import json
 import tempfile
 import os
+import aiohttp
 from pathlib import Path
 
 from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
 from pydantic import BaseModel
+from scuid import scuid
 
 from .service import DocsiferService
 from .analytics import Analytics
@@ -34,17 +36,21 @@ class ConvertResponse(BaseModel):
 @router.post("/convert", response_model=ConvertResponse)
 async def convert_document(
     background_tasks: BackgroundTasks,
-    file: UploadFile = File(..., description="File to convert (1 file per request)"),
+    file: UploadFile = File(None, description="File to convert"),
+    url: str = Form(
+        None, description="URL to convert (used only if no file is provided)"
+    ),
     openai: str = Form("{}", description="OpenAI config as a JSON object"),
     settings: str = Form("{}", description="Settings as a JSON object"),
 ):
     """
-    Convert a single uploaded file to Markdown, optionally using OpenAI for advanced text extraction.
-    - `openai` is a JSON string with keys: {"api_key": "...", "base_url": "..."}
-    - `settings` is a JSON string with keys: {"cleanup": bool}
-    - We do not store or track model_id in analytics; everything is aggregated as "docsifer".
+    Convert a file or an HTML page from a URL into Markdown.
+    If 'file' is provided, it has priority over 'url'.
+    - 'openai' is a JSON string with keys: {"api_key": "...", "base_url": "..."}
+    - 'settings' is a JSON string with keys: {"cleanup": bool}
     """
     try:
+        # Parse configs
         try:
             openai_config = json.loads(openai) if openai else {}
         except json.JSONDecodeError:
@@ -57,22 +63,43 @@ async def convert_document(
 
         cleanup = settings_config.get("cleanup", True)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            temp_path = Path(tmpdir) / file.filename
-            contents = await file.read()
-            temp_path.write_bytes(contents)
-
-            result, token_count = await docsifer_service.convert_file(
-                file_path=str(temp_path), openai_config=openai_config, cleanup=cleanup
+        # If a file is provided, use the existing flow
+        if file is not None:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                temp_path = Path(tmpdir) / file.filename
+                contents = await file.read()
+                temp_path.write_bytes(contents)
+                result, token_count = await docsifer_service.convert_file(
+                    file_path=str(temp_path),
+                    openai_config=openai_config,
+                    cleanup=cleanup,
+                )
+        # Otherwise, fetch HTML from URL and convert
+        elif url:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url) as resp:
+                    if resp.status != 200:
+                        raise ValueError(f"Failed to fetch URL: status {resp.status}")
+                    data = await resp.read()
+            with tempfile.TemporaryDirectory() as tmpdir:
+                temp_path = Path(tmpdir) / f"{scuid()}.html"
+                temp_path.write_bytes(data)
+                result, token_count = await docsifer_service.convert_file(
+                    file_path=str(temp_path),
+                    openai_config=openai_config,
+                    cleanup=cleanup,
+                )
+        else:
+            raise HTTPException(
+                status_code=400, detail="Provide either 'file' or 'url'."
             )
 
-        # Track usage in analytics (single aggregator => "docsifer")
+        # Track usage
         background_tasks.add_task(analytics.access, token_count)
-
        return ConvertResponse(**result)
 
     except Exception as e:
-        msg = f"Failed to convert document. Error: {str(e)}"
+        msg = f"Failed to convert content. Error: {str(e)}"
        logger.error(msg)
        raise HTTPException(status_code=500, detail=msg)
 
requirements.txt CHANGED

@@ -13,3 +13,4 @@ scuid
 python-magic
 plotly
 matplotlib
+aiohttp