Spaces:

lamhieu
/

docsifer

Running

App Files Files Community

lamhieu committed 21 days ago

Commit

9f2ec30

1 Parent(s): dbb74f2

chore: update something

Browse files

Files changed (3) hide show

docsifer/__init__.py +32 -8
docsifer/router.py +8 -0
docsifer/service.py +26 -4

docsifer/__init__.py CHANGED Viewed

@@ -101,6 +101,7 @@ def call_convert_api(
     openai_base_url: Optional[str] = None,
     openai_api_key: Optional[str] = None,
     openai_model: Optional[str] = None,
 ) -> Tuple[str, str]:
     """
     Call the /v1/convert endpoint, returning (markdown_content, md_file_path).
@@ -115,6 +116,7 @@ def call_convert_api(
         openai_base_url (str, optional): Base URL for OpenAI or compatible LLM.
         openai_api_key (str, optional): API key for the LLM.
         openai_model (str, optional): Model name to use for LLM-based extraction.
     Returns:
         (str, str):
@@ -143,6 +145,14 @@ def call_convert_api(
     if len(openai_dict) <= 3:
         data.pop("openai")
     # Decide if we're sending a file or a URL
     files = {}
     if file_obj:
@@ -220,7 +230,7 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
         all_models = set()
         for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
             period_dict = bucket.get(period_key, {})
-            all_models.update(period_dict.keys())  # typically just "docsifer"
         result_dict = {
             "Model": [],
@@ -251,7 +261,7 @@ def create_main_interface():
     Create a Gradio Blocks interface that includes:
       1) 'Conversion Playground' Tab:
          - File upload OR URL-based conversion
-         - Optional OpenAI configuration
          - Convert button
          - Display of conversion result as Markdown
          - Downloadable .md file
@@ -317,6 +327,17 @@ def create_main_interface():
                             value="gpt-4o-mini",
                         )
                     with gr.Accordion("Conversion Settings", open=True):
                         gr.Markdown(
                             "Enable to remove <style> tags or hidden elements "
@@ -371,11 +392,12 @@ def create_main_interface():
                     )
             # Callback function triggered by convert_btn.click
-            def on_convert(file_bytes, url_str, base_url, api_key, model_id, cleanup):
                 """
-                Converts the uploaded file or a URL to Markdown by calling the Docsifer
-                API. Returns the resulting Markdown content and path to the
-                temporary .md file for download.
                 Args:
                     file_bytes (bytes): The raw file content (None if not uploaded).
@@ -384,20 +406,20 @@ def create_main_interface():
                     api_key (str): The API key for the LLM.
                     model_id (str): The model to use for the LLM.
                     cleanup (bool): Whether to enable cleanup on HTML files.
                 Returns:
                     (str, str):
                         - The Markdown content or error message.
                         - The path to the temp .md file for download.
                 """
-                # If file is not provided, we attempt the URL approach
                 if not file_bytes and not url_str:
                     return "❌ Please upload a file or provide a URL.", None
                 # Create a unique temporary filename if file is present
                 unique_name = f"{scuid()}.tmp" if file_bytes else ""
-                # Call the convert API
                 markdown, temp_md_path = call_convert_api(
                     file_obj=file_bytes,
                     filename=unique_name,
@@ -406,6 +428,7 @@ def create_main_interface():
                     openai_api_key=api_key,
                     openai_model=model_id,
                     cleanup=cleanup,
                 )
                 return markdown, temp_md_path
@@ -420,6 +443,7 @@ def create_main_interface():
                     openai_api_key,
                     openai_model,
                     cleanup_toggle,
                 ],
                 outputs=[output_md, download_file],
             )

     openai_base_url: Optional[str] = None,
     openai_api_key: Optional[str] = None,
     openai_model: Optional[str] = None,
+    http_cookies: Optional[str] = None,
 ) -> Tuple[str, str]:
     """
     Call the /v1/convert endpoint, returning (markdown_content, md_file_path).
         openai_base_url (str, optional): Base URL for OpenAI or compatible LLM.
         openai_api_key (str, optional): API key for the LLM.
         openai_model (str, optional): Model name to use for LLM-based extraction.
+        http_cookies (str, optional): JSON-formatted string representing cookies for HTTP requests.
     Returns:
         (str, str):
     if len(openai_dict) <= 3:
         data.pop("openai")
+    # Build the HTTP configuration object
+    if http_cookies and http_cookies.strip():
+        try:
+            cookies_obj = json.loads(http_cookies)
+        except Exception as e:
+            return (f"❌ Invalid JSON for HTTP Cookies: {str(e)}", "")
+        data["http"] = json.dumps({"cookies": cookies_obj})
     # Decide if we're sending a file or a URL
     files = {}
     if file_obj:
         all_models = set()
         for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
             period_dict = bucket.get(period_key, {})
+            all_models.update(period_dict.keys())
         result_dict = {
             "Model": [],
     Create a Gradio Blocks interface that includes:
       1) 'Conversion Playground' Tab:
          - File upload OR URL-based conversion
+         - Optional OpenAI configuration and HTTP configuration
          - Convert button
          - Display of conversion result as Markdown
          - Downloadable .md file
                             value="gpt-4o-mini",
                         )
+                    with gr.Accordion("HTTP Configuration (Optional)", open=False):
+                        gr.Markdown(
+                            "Provide additional HTTP configuration. "
+                            "In particular, you can specify cookies as a JSON object to be included in the request."
+                        )
+                        http_cookies = gr.Textbox(
+                            label="Cookies",
+                            placeholder='e.g. {"session": "abcd1234"}',
+                            lines=3,
+                        )
                     with gr.Accordion("Conversion Settings", open=True):
                         gr.Markdown(
                             "Enable to remove <style> tags or hidden elements "
                     )
             # Callback function triggered by convert_btn.click
+            def on_convert(
+                file_bytes, url_str, base_url, api_key, model_id, cleanup, http_cookies
+            ):
                 """
+                Converts the uploaded file or a URL to Markdown by calling the Docsifer API.
+                Returns the resulting Markdown content and path to the temporary .md file for download.
                 Args:
                     file_bytes (bytes): The raw file content (None if not uploaded).
                     api_key (str): The API key for the LLM.
                     model_id (str): The model to use for the LLM.
                     cleanup (bool): Whether to enable cleanup on HTML files.
+                    http_cookies (str): JSON-formatted string for HTTP cookies.
                 Returns:
                     (str, str):
                         - The Markdown content or error message.
                         - The path to the temp .md file for download.
                 """
                 if not file_bytes and not url_str:
                     return "❌ Please upload a file or provide a URL.", None
                 # Create a unique temporary filename if file is present
                 unique_name = f"{scuid()}.tmp" if file_bytes else ""
+                # Call the convert API with HTTP configuration
                 markdown, temp_md_path = call_convert_api(
                     file_obj=file_bytes,
                     filename=unique_name,
                     openai_api_key=api_key,
                     openai_model=model_id,
                     cleanup=cleanup,
+                    http_cookies=http_cookies,
                 )
                 return markdown, temp_md_path
                     openai_api_key,
                     openai_model,
                     cleanup_toggle,
+                    http_cookies,
                 ],
                 outputs=[output_md, download_file],
             )

docsifer/router.py CHANGED Viewed

@@ -39,6 +39,7 @@ async def convert_document(
         None, description="URL to convert (used only if no file is provided)"
     ),
     openai: str = Form("{}", description="OpenAI config as a JSON object"),
     settings: str = Form("{}", description="Settings as a JSON object"),
 ):
     """
@@ -55,6 +56,11 @@ async def convert_document(
         except json.JSONDecodeError:
             raise ValueError("Invalid JSON in 'openai' parameter.")
         try:
             settings_config = json.loads(settings) if settings else {}
         except json.JSONDecodeError:
@@ -71,6 +77,7 @@ async def convert_document(
                 result, token_count = await docsifer_service.convert_file(
                     source=str(temp_path),
                     openai_config=openai_config,
                     cleanup=cleanup,
                 )
         elif url:
@@ -90,6 +97,7 @@ async def convert_document(
             result, token_count = await docsifer_service.convert_file(
                 source=str(url),
                 openai_config=openai_config,
                 cleanup=cleanup,
             )
         else:

         None, description="URL to convert (used only if no file is provided)"
     ),
     openai: str = Form("{}", description="OpenAI config as a JSON object"),
+    http: str = Form("{}", description="HTTP config as a JSON object"),
     settings: str = Form("{}", description="Settings as a JSON object"),
 ):
     """
         except json.JSONDecodeError:
             raise ValueError("Invalid JSON in 'openai' parameter.")
+        try:
+            http_config = json.loads(http) if http else {}
+        except json.JSONDecodeError:
+            raise ValueError("Invalid JSON in 'http' parameter.")
         try:
             settings_config = json.loads(settings) if settings else {}
         except json.JSONDecodeError:
                 result, token_count = await docsifer_service.convert_file(
                     source=str(temp_path),
                     openai_config=openai_config,
+                    http_config=http_config,
                     cleanup=cleanup,
                 )
         elif url:
             result, token_count = await docsifer_service.convert_file(
                 source=str(url),
                 openai_config=openai_config,
+                http_config=http_config,
                 cleanup=cleanup,
             )
         else:

docsifer/service.py CHANGED Viewed

@@ -3,8 +3,11 @@ from __future__ import annotations
 import asyncio
 import logging
 import tempfile
 import magic
 import mimetypes
 from pathlib import Path
 from typing import Optional, Dict, Tuple, Any
 from scuid import scuid
@@ -107,7 +110,11 @@ class DocsiferService:
             return len(text.split())
     def _convert_sync(
-        self, source: str, openai_config: Optional[dict] = None, cleanup: bool = True
     ) -> Tuple[Dict[str, str], int]:
         """
         Synchronously convert a file at `file_path` to Markdown.
@@ -117,6 +124,7 @@ class DocsiferService:
         Args:
             source: Path to the source file or URL to fetch content from.
             openai_config: Optional dictionary with OpenAI configuration.
             cleanup: Whether to perform HTML cleanup if the file is an HTML file.
         Returns:
@@ -164,12 +172,21 @@ class DocsiferService:
         else:
             md_converter = self._basic_markitdown
         try:
             result_obj = md_converter.convert(source)
         except Exception as e:
             logger.error("MarkItDown conversion failed: %s", e)
             raise RuntimeError(f"Conversion failed for '{source}': {e}")
         if isinstance(source, Path) and source.exists():
             source.unlink()
@@ -183,7 +200,11 @@ class DocsiferService:
         return result_dict, token_count
     async def convert_file(
-        self, source: str, openai_config: Optional[dict] = None, cleanup: bool = True
     ) -> Tuple[Dict[str, str], int]:
         """
         Asynchronously convert a file at `source` to Markdown.
@@ -192,6 +213,7 @@ class DocsiferService:
         Args:
             source: Path to the file to convert or a URL to fetch content from.
             openai_config: Optional OpenAI configuration dictionary.
             cleanup: Whether to perform HTML cleanup if applicable.
         Returns:
@@ -199,5 +221,5 @@ class DocsiferService:
             and the token count.
         """
         return await asyncio.to_thread(
-            self._convert_sync, source, openai_config, cleanup
         )

 import asyncio
 import logging
 import tempfile
+import requests.cookies
 import magic
 import mimetypes
+import requests
 from pathlib import Path
 from typing import Optional, Dict, Tuple, Any
 from scuid import scuid
             return len(text.split())
     def _convert_sync(
+        self,
+        source: str,
+        openai_config: Optional[dict] = None,
+        http_config: Optional[dict] = None,
+        cleanup: bool = True,
     ) -> Tuple[Dict[str, str], int]:
         """
         Synchronously convert a file at `file_path` to Markdown.
         Args:
             source: Path to the source file or URL to fetch content from.
             openai_config: Optional dictionary with OpenAI configuration.
+            http_config: Optional dictionary with HTTP configuration.
             cleanup: Whether to perform HTML cleanup if the file is an HTML file.
         Returns:
         else:
             md_converter = self._basic_markitdown
+        # Load cookies if provided in the HTTP config.
+        if http_config:
+            if "cookies" in http_config:
+                requests.cookies.cookiejar_from_dict(
+                    http_config["cookies"],
+                    requests.cookies.RequestsCookieJar,
+                    overwrite=True,
+                )
         try:
             result_obj = md_converter.convert(source)
         except Exception as e:
             logger.error("MarkItDown conversion failed: %s", e)
             raise RuntimeError(f"Conversion failed for '{source}': {e}")
         if isinstance(source, Path) and source.exists():
             source.unlink()
         return result_dict, token_count
     async def convert_file(
+        self,
+        source: str,
+        openai_config: Optional[dict] = None,
+        http_config: Optional[dict] = None,
+        cleanup: bool = True,
     ) -> Tuple[Dict[str, str], int]:
         """
         Asynchronously convert a file at `source` to Markdown.
         Args:
             source: Path to the file to convert or a URL to fetch content from.
             openai_config: Optional OpenAI configuration dictionary.
+            http_config: Optional HTTP configuration dictionary.
             cleanup: Whether to perform HTML cleanup if applicable.
         Returns:
             and the token count.
         """
         return await asyncio.to_thread(
+            self._convert_sync, source, openai_config, http_config, cleanup
         )