feat: support convert from url and more
- docsifer/__init__.py +135 -53
- docsifer/router.py +42 -15
- requirements.txt +1 -0
docsifer/__init__.py
CHANGED
@@ -17,8 +17,12 @@ from pathlib import Path
 from scuid import scuid
 
 
-# Filter out /v1 requests from the access log
 class LogFilter(logging.Filter):
+    """
+    A custom logging filter that only keeps log records containing '/v1'
+    in the request path. This helps to filter out other logs and reduce noise.
+    """
+
     def filter(self, record):
         # Only keep log records that contain "/v1" in the request path
         if record.args and len(record.args) >= 3:
@@ -30,7 +34,6 @@ class LogFilter(logging.Filter):
 logger = logging.getLogger("uvicorn.access")
 logger.addFilter(LogFilter())
 
-# Application metadata
 __version__ = "1.0.0"
 __author__ = "lamhieu"
 __description__ = "Docsifer: Efficient Data Conversion to Markdown."
@@ -46,11 +49,10 @@ __metadata__ = {
     "spaces": "https://huggingface.co/spaces/lh0x00/docsifer",
 }
 
-#
+# Docsifer API Endpoints (can be replaced with your live URLs if desired)
 DOCSIFER_API_URL = "http://localhost:7860/v1/convert"
 DOCSIFER_STATS_URL = "http://localhost:7860/v1/stats"
 
-# Markdown description for the main interface
 APP_DESCRIPTION = f"""
 # 📝 **Docsifer: Convert Your Documents to Markdown**
 
@@ -60,7 +62,7 @@ Welcome to **Docsifer**, a specialized service that converts your files—like P
 
 - **Open Source**: The entire Docsifer codebase is publicly available for review and contribution.
 - **Efficient & Flexible**: Supports multiple file formats, ensuring quick and accurate Markdown conversion.
-- **Privacy-Focused**: We never store user data; all processing is
+- **Privacy-Focused**: We never store user data; all processing is temporary. We only collect minimal anonymous usage statistics to count the number of calls and the number of tokens, nothing else.
 - **Production-Ready**: Easy Docker deployment, interactive Gradio playground, and comprehensive REST API documentation.
 - **Community & Collaboration**: Contribute on [GitHub]({__metadata__["github"]}) or try it out on [Hugging Face Spaces]({__metadata__["spaces"]}).
 
@@ -68,7 +70,6 @@ Welcome to **Docsifer**, a specialized service that converts your files—like P
 - [Documentation]({__metadata__["docs"]}) | [GitHub]({__metadata__["github"]}) | [Live Demo]({__metadata__["spaces"]})
 """
 
-# Initialize FastAPI application
 app = FastAPI(
     title="Docsifer Service API",
     description=__description__,
@@ -77,7 +78,7 @@ app = FastAPI(
     redoc_url="/redoc",
 )
 
-# Configure CORS
+# Configure CORS (Cross-Origin Resource Sharing)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],  # Adjust if needed for specific domains
@@ -86,34 +87,40 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Import and include your existing router (
+# Import and include your existing router (with /v1 endpoints)
 from .router import router
 
 app.include_router(router, prefix="/v1")
 
 
 def call_convert_api(
-    file_obj: bytes,
-    filename: str,
+    file_obj: Optional[bytes],
+    filename: str = "",
+    url: Optional[str] = None,
     cleanup: bool = True,
     openai_base_url: Optional[str] = None,
     openai_api_key: Optional[str] = None,
     openai_model: Optional[str] = None,
 ) -> Tuple[str, str]:
     """
-    If there's an error, the first return value is an error message (str),
+    Call the /v1/convert endpoint, returning (markdown_content, md_file_path).
+    - If there's an error, the first return value is an error message (str),
+      the second is an empty string.
+
+    Args:
+        file_obj (Optional[bytes]): The raw file bytes to be sent. If None, 'url' is used.
+        filename (str): Name of the file (will be posted to the endpoint).
+        url (str, optional): URL to be converted (used only if file_obj is None).
+        cleanup (bool): Whether to enable cleanup mode for HTML files.
+        openai_base_url (str, optional): Base URL for OpenAI or compatible LLM.
+        openai_api_key (str, optional): API key for the LLM.
+        openai_model (str, optional): Model name to use for LLM-based extraction.
+
+    Returns:
+        (str, str):
+            - markdown_content (str): The conversion result in Markdown form or an error message.
+            - tmp_md_path (str): The path to the temporary .md file for download.
     """
-
-    if file_obj is None:
-        return ("❌ No file was uploaded.", "")
-
     # Build the "openai" object
     openai_dict = {}
     if openai_api_key and openai_api_key.strip():
@@ -127,17 +134,27 @@ def call_convert_api(
     settings_dict = {"cleanup": cleanup}
 
     data = {
-        #
+        # Must match the `Form(...)` fields named "openai" and "settings"
         "openai": json.dumps(openai_dict),
         "settings": json.dumps(settings_dict),
     }
 
+    # If the user left the OpenAI fields blank, remove the `openai` key from data
    if len(openai_dict) <= 3:
         data.pop("openai")
 
-    #
-    files = {
+    # Decide if we're sending a file or a URL
+    files = {}
+    if file_obj:
+        # If file is provided, it takes priority
+        files = {"file": (filename, file_obj)}
+        data["url"] = ""  # ensure 'url' is empty on the form
+    elif url and url.strip():
+        data["url"] = url.strip()
+    else:
+        return ("❌ Please upload a file or provide a URL.", "")
+
+    # Perform the POST request
     try:
         response = requests.post(DOCSIFER_API_URL, files=files, data=data, timeout=30)
     except requests.exceptions.RequestException as e:
@@ -146,14 +163,15 @@
     if response.status_code != 200:
         return (f"❌ API Error {response.status_code}: {response.text}", "")
 
+    # Parse the API response
     try:
         converted = response.json()
-        #
+        # Expected structure: { "filename": "...", "markdown": "..." }
         markdown_content = converted["markdown"]
     except Exception as e:
         return (f"❌ Error parsing JSON: {str(e)}", "")
 
-    # Write the returned Markdown to a
+    # Write the returned Markdown to a temp .md file
     with tempfile.NamedTemporaryFile(
         mode="w+", suffix=".md", dir="/tmp", delete=False
     ) as tmp_file:
@@ -165,8 +183,17 @@
 
 def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
+    Call /v1/stats endpoint to retrieve analytics data and return two DataFrames:
+        - access_df: Access statistics
+        - tokens_df: Token usage statistics
+
+    Raises:
+        ValueError: If the stats endpoint fails or returns invalid data.
+
+    Returns:
+        Tuple[pd.DataFrame, pd.DataFrame]:
+            (access_df, tokens_df) with columns ["Model", "Total", "Daily",
+            "Weekly", "Monthly", "Yearly"].
     """
     try:
         response = requests.get(DOCSIFER_STATS_URL, timeout=10)
@@ -186,8 +213,10 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
     tokens_data = data.get("tokens", {})
 
     def build_stats_df(bucket: dict) -> pd.DataFrame:
+        """
+        Helper function to transform a nested dictionary (by period, by model)
+        into a tabular pandas DataFrame.
+        """
         all_models = set()
         for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
             period_dict = bucket.get(period_key, {})
@@ -219,21 +248,31 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
 
 def create_main_interface():
     """
+    Create a Gradio Blocks interface that includes:
+      1) 'Conversion Playground' Tab:
+         - File upload OR URL-based conversion
+         - Optional OpenAI configuration
+         - Convert button
+         - Display of conversion result as Markdown
+         - Downloadable .md file
+      2) 'Analytics Stats' Tab:
+         - Button to fetch usage statistics
+         - DataFrames for Access Stats and Token Stats
+
+    Returns:
+        Gradio Blocks instance that can be mounted into the FastAPI app.
     """
     with gr.Blocks(title="Docsifer: Convert to Markdown", theme="default") as demo:
         gr.Markdown(APP_DESCRIPTION)
 
         with gr.Tab("Conversion Playground"):
-            gr.Markdown("### Convert your files to Markdown with Docsifer.")
+            gr.Markdown("### Convert your files or a URL to Markdown with Docsifer.")
 
             with gr.Row():
+                # Left Column: File Upload, URL Input, Settings, Button
                 with gr.Column():
                     file_input = gr.File(
-                        label="Upload File",
+                        label="Upload File (optional)",
                         file_types=[
                             ".pdf",
                             ".docx",
@@ -251,6 +290,11 @@ def create_main_interface():
                        type="binary",
                    )
 
+                    url_input = gr.Textbox(
+                        label="URL (optional)",
+                        placeholder="Enter a URL if no file is uploaded",
+                    )
+
                    with gr.Accordion("OpenAI Configuration (Optional)", open=False):
                        gr.Markdown(
                            "Provide these if you'd like **LLM-assisted** extraction. "
@@ -275,7 +319,8 @@ def create_main_interface():
 
                    with gr.Accordion("Conversion Settings", open=True):
                        gr.Markdown(
-                            "Enable to remove <style> tags or hidden elements
+                            "Enable to remove <style> tags or hidden elements "
+                            "from `.html` files before conversion."
                        )
                        cleanup_toggle = gr.Checkbox(
                            label="Enable Cleanup",
@@ -284,13 +329,12 @@ def create_main_interface():
 
                    convert_btn = gr.Button("Convert")
 
+                # Right Column: Conversion Result Display & Download
                with gr.Column():
-                    # Set visible=True so the user always sees a small download button
+                    # Display the result as Markdown
+                    output_md = gr.Markdown(label="Conversion Result (Markdown)")
+
+                    # The user can still download the .md file
                    download_file = gr.File(
                        label="Download",
                        interactive=False,
@@ -309,32 +353,64 @@ def create_main_interface():
                    -F "openai={\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
                    -F "settings={\\"cleanup\\":true}"
                    ```
+
+                    **Convert from a URL (no file)**:
+                    ```bash
+                    curl -X POST \\
+                    "https://lamhieu-docsifer.hf.space/v1/convert" \\
+                    -F "url=https://example.com/page.html" \\
+                    -F "openai={\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
+                    -F "settings={\\"cleanup\\":true}"
+                    ```
                    """
                )
 
+            # Callback function triggered by convert_btn.click
+            def on_convert(file_bytes, url_str, base_url, api_key, model_id, cleanup):
                """
+                Converts the uploaded file or a URL to Markdown by calling the Docsifer
+                API. Returns the resulting Markdown content and path to the
+                temporary .md file for download.
+
+                Args:
+                    file_bytes (bytes): The raw file content (None if not uploaded).
+                    url_str (str): The URL to convert (only used if file_bytes is None).
+                    base_url (str): The base URL for OpenAI or compatible LLM.
+                    api_key (str): The API key for the LLM.
+                    model_id (str): The model to use for the LLM.
+                    cleanup (bool): Whether to enable cleanup on HTML files.
+
+                Returns:
+                    (str, str):
+                        - The Markdown content or error message.
+                        - The path to the temp .md file for download.
                """
+                # If file is not provided, we attempt the URL approach
+                if not file_bytes and not url_str:
+                    return "❌ Please upload a file or provide a URL.", None
+
+                # Create a unique temporary filename if file is present
+                unique_name = f"{scuid()}.tmp" if file_bytes else ""
 
+                # Call the convert API
                markdown, temp_md_path = call_convert_api(
                    file_obj=file_bytes,
                    filename=unique_name,
+                    url=url_str,
                    openai_base_url=base_url,
                    openai_api_key=api_key,
                    openai_model=model_id,
                    cleanup=cleanup,
                )
+
                return markdown, temp_md_path
 
+            # Link the on_convert function to the convert_btn
            convert_btn.click(
                fn=on_convert,
                inputs=[
                    file_input,
+                    url_input,
                    openai_base_url,
                    openai_api_key,
                    openai_model,
@@ -348,6 +424,7 @@ def create_main_interface():
                "View Docsifer usage statistics (access count, token usage, etc.)"
            )
            stats_btn = gr.Button("Get Stats")
+
            access_df = gr.DataFrame(
                label="Access Stats",
                headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
@@ -359,6 +436,7 @@ def create_main_interface():
                interactive=False,
            )
 
+            # When the button is clicked, call_stats_api_df returns two dataframes
            stats_btn.click(
                fn=call_stats_api_df,
                inputs=[],
@@ -368,17 +446,21 @@ def create_main_interface():
    return demo
 
 
-# Build our Gradio interface and mount it at the root path
 main_interface = create_main_interface()
 mount_gradio_app(app, main_interface, path="/")
 
 
-# Startup / Shutdown events
 @app.on_event("startup")
 async def startup_event():
+    """
+    Logs a startup message when the Docsifer Service is starting.
+    """
     logger.info("Docsifer Service is starting up...")
 
 
 @app.on_event("shutdown")
 async def shutdown_event():
+    """
+    Logs a shutdown message when the Docsifer Service is shutting down.
+    """
     logger.info("Docsifer Service is shutting down.")
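For reference, a minimal client-side sketch of the request that the new URL branch of `call_convert_api` builds: a plain form POST with `url` and `settings` fields and no file part. It assumes a Docsifer instance at the localhost address used in the diff above; the target URL is only illustrative.

```python
# Sketch of a URL-based conversion request (no file part), mirroring the new
# branch in call_convert_api. The endpoint matches the localhost default from
# the diff; the target URL is only an example.
import json
import requests

DOCSIFER_API_URL = "http://localhost:7860/v1/convert"

data = {
    "url": "https://example.com/page.html",
    "settings": json.dumps({"cleanup": True}),
    # "openai": json.dumps({...}),  # optional, enables LLM-assisted extraction
}

response = requests.post(DOCSIFER_API_URL, data=data, timeout=30)
response.raise_for_status()
converted = response.json()  # expected shape: {"filename": "...", "markdown": "..."}
print(converted["markdown"][:200])
```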
docsifer/router.py
CHANGED
@@ -4,10 +4,12 @@ import logging
 import json
 import tempfile
 import os
+import aiohttp
 from pathlib import Path
 
 from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
 from pydantic import BaseModel
+from scuid import scuid
 
 from .service import DocsiferService
 from .analytics import Analytics
@@ -34,17 +36,21 @@ class ConvertResponse(BaseModel):
 @router.post("/convert", response_model=ConvertResponse)
 async def convert_document(
     background_tasks: BackgroundTasks,
-    file: UploadFile = File(
+    file: UploadFile = File(None, description="File to convert"),
+    url: str = Form(
+        None, description="URL to convert (used only if no file is provided)"
+    ),
     openai: str = Form("{}", description="OpenAI config as a JSON object"),
     settings: str = Form("{}", description="Settings as a JSON object"),
 ):
     """
-    Convert a
+    Convert a file or an HTML page from a URL into Markdown.
+    If 'file' is provided, it has priority over 'url'.
+    - 'openai' is a JSON string with keys: {"api_key": "...", "base_url": "..."}
+    - 'settings' is a JSON string with keys: {"cleanup": bool}
     """
     try:
+        # Parse configs
         try:
             openai_config = json.loads(openai) if openai else {}
         except json.JSONDecodeError:
@@ -57,22 +63,43 @@ async def convert_document(
 
         cleanup = settings_config.get("cleanup", True)
 
+        # If a file is provided, use the existing flow
+        if file is not None:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                temp_path = Path(tmpdir) / file.filename
+                contents = await file.read()
+                temp_path.write_bytes(contents)
+                result, token_count = await docsifer_service.convert_file(
+                    file_path=str(temp_path),
+                    openai_config=openai_config,
+                    cleanup=cleanup,
+                )
+        # Otherwise, fetch HTML from URL and convert
+        elif url:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url) as resp:
+                    if resp.status != 200:
+                        raise ValueError(f"Failed to fetch URL: status {resp.status}")
+                    data = await resp.read()
+            with tempfile.TemporaryDirectory() as tmpdir:
+                temp_path = Path(tmpdir) / f"{scuid()}.html"
+                temp_path.write_bytes(data)
+                result, token_count = await docsifer_service.convert_file(
+                    file_path=str(temp_path),
+                    openai_config=openai_config,
+                    cleanup=cleanup,
+                )
+        else:
+            raise HTTPException(
+                status_code=400, detail="Provide either 'file' or 'url'."
            )
 
+        # Track usage
         background_tasks.add_task(analytics.access, token_count)
-
         return ConvertResponse(**result)
 
     except Exception as e:
-        msg = f"Failed to convert
+        msg = f"Failed to convert content. Error: {str(e)}"
         logger.error(msg)
         raise HTTPException(status_code=500, detail=msg)
 
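A rough way to exercise the new `url` form field end to end is sketched below. It assumes the `docsifer` package imports cleanly, a working `DocsiferService` backend, and that the example URL is reachable from the test environment; it is illustrative, not part of the change.

```python
# Illustrative check of the new URL flow on /v1/convert, not an official test.
# Assumes the docsifer package imports cleanly and the example URL is reachable.
from fastapi.testclient import TestClient

from docsifer import app

client = TestClient(app)


def test_convert_from_url():
    response = client.post(
        "/v1/convert",
        data={
            "url": "https://example.com",
            "settings": '{"cleanup": true}',
        },
    )
    assert response.status_code == 200
    body = response.json()
    # ConvertResponse is expected to include at least "filename" and "markdown".
    assert "markdown" in body
```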
requirements.txt
CHANGED
@@ -13,3 +13,4 @@ scuid
 python-magic
 plotly
 matplotlib
+aiohttp