Spaces:

nyuuzyou
/

urltomarkdown

Sleeping

App Files Files Community

nyuuzyou committed on Jun 17

Commit

c3ac509

verified ·

1 Parent(s): 4c8c0a9

Create app.py

Browse files

Files changed (1) hide show

app.py +231 -0

app.py ADDED Viewed

	@@ -0,0 +1,231 @@

+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+import html2text
+from readability import Document
+import re
+from urllib.parse import urljoin, urlparse
+import time
class URLToMarkdownConverter:
    """Fetch a web page and convert its HTML to Markdown.

    One ``requests.Session`` carrying a browser-like User-Agent is created per
    instance and reused for every fetch made through it.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def is_valid_url(self, url):
        """Return True when *url* is an absolute http(s) URL with a host.

        Only ``http`` and ``https`` are accepted because those are the only
        schemes the fetch step can retrieve; anything else (``ftp://``,
        ``file://``, missing scheme) is reported as invalid up front.
        """
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed netloc (e.g. an unterminated IPv6 literal);
            # TypeError/AttributeError: a non-string value was passed in.
            return False
        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

    def fetch_webpage(self, url, timeout=10):
        """Download *url* and return ``(text, status_code)``.

        Args:
            url: Absolute URL to request.
            timeout: Seconds passed to ``requests`` as the request timeout.

        Raises:
            Exception: If the request fails or the server answers with an
                HTTP error status; the underlying ``requests`` exception is
                chained as ``__cause__``.
        """
        # NOTE(review): the URL comes straight from the user and is fetched
        # server-side without restricting the target host or response size.
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching URL: {str(e)}") from e
        # Without an explicit charset, requests falls back to ISO-8859-1 for
        # text/* responses, which garbles UTF-8 pages. Use the encoding
        # detected from the body instead.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            response.encoding = response.apparent_encoding
        return response.text, response.status_code

    def extract_title(self, html_content):
        """Return the text of the page's ``<title>`` tag, or "" if absent.

        Runs of whitespace (including newlines) are collapsed to single
        spaces so the title can be used safely as a one-line heading.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        if not title_tag:
            return ""
        return " ".join(title_tag.get_text().split())

    def improve_readability(self, html_content):
        """Return only the main article HTML extracted by readability.

        Falls back to the unmodified input when extraction fails, so the
        conversion can still proceed on the full page.
        """
        try:
            # summary() yields the extracted article; content() would return
            # the entire document body and strip no clutter.
            return Document(html_content).summary()
        except Exception:
            return html_content

    def convert_to_markdown(self, html_content, ignore_links=False, base_url=""):
        """Convert an HTML string to Markdown.

        Args:
            html_content: HTML markup to convert.
            ignore_links: When True, hyperlinks are dropped and only their
                text is kept.
            base_url: Optional URL against which relative link and image
                targets are resolved; "" leaves them as written.

        Returns:
            The Markdown text with surrounding whitespace removed and runs of
            blank lines collapsed to a single blank line.
        """
        h = html2text.HTML2Text(baseurl=base_url)
        h.ignore_images = False
        h.ignore_links = ignore_links
        h.body_width = 0  # Don't wrap lines
        h.unicode_snob = True
        h.bypass_tables = False
        markdown = h.handle(html_content)
        # Clean up excessive whitespace
        markdown = re.sub(r'\n\s*\n\s*\n', '\n\n', markdown)
        return markdown.strip()

    def process_url(self, url, include_title=True, ignore_links=False, improve_readability=True):
        """Fetch *url* and return ``(markdown, title)``.

        Never raises: on invalid input or any processing failure the first
        element holds a human-readable message and the title is "".

        Args:
            url: Address of the page to convert; surrounding whitespace is
                ignored.
            include_title: Prepend the page title as an H1 heading.
            ignore_links: Drop hyperlinks from the Markdown output.
            improve_readability: Reduce the page to its main content before
                converting.
        """
        if isinstance(url, str):
            url = url.strip()  # tolerate whitespace from copy/paste
        if not url:
            return "Please enter a URL", ""
        if not self.is_valid_url(url):
            return "Please enter a valid URL", ""
        try:
            html_content, _ = self.fetch_webpage(url)
            # The title must be read before readability trims the document.
            title = self.extract_title(html_content)
            if improve_readability:
                html_content = self.improve_readability(html_content)
            markdown = self.convert_to_markdown(html_content, ignore_links, base_url=url)
        except Exception as e:
            # UI boundary: report the failure as text instead of crashing.
            return f"Error processing URL: {str(e)}", ""
        if include_title and title:
            markdown = f"# {title}\n\n{markdown}"
        return markdown, title
# Module-wide converter instance shared by every UI callback
converter = URLToMarkdownConverter()


def convert_url_to_markdown(url, include_title, ignore_links, improve_readability):
    """Gradio callback: forward the form values to the shared converter.

    Returns the ``(markdown, title)`` pair produced by
    ``converter.process_url``.
    """
    options = {
        "include_title": include_title,
        "ignore_links": ignore_links,
        "improve_readability": improve_readability,
    }
    markdown, title = converter.process_url(url=url, **options)
    return markdown, title
# Build the Gradio Blocks UI: intro text, input form, outputs, examples, footer
with gr.Blocks(title="URL to Markdown Converter", theme=gr.themes.Soft()) as app:
    # Introductory header and usage steps
    gr.Markdown("""
    # 🔗 URL to Markdown Converter
    Convert any webpage to clean, readable Markdown format. Perfect for documentation, note-taking, and content archival.
    ## How to use:
    1. Enter a URL in the text box below
    2. Configure your options
    3. Click "Convert to Markdown"
    4. Copy the generated Markdown from the output box
    """)

    with gr.Row():
        # Left column: URL field, option checkboxes and the convert button
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="URL",
                placeholder="https://example.com",
                lines=1,
                info="Enter the URL of the webpage you want to convert",
            )
            with gr.Row():
                include_title = gr.Checkbox(
                    label="Include Title",
                    value=True,
                    info="Add the page title as a heading",
                )
                ignore_links = gr.Checkbox(
                    label="Ignore Links",
                    value=False,
                    info="Remove all hyperlinks from output",
                )
                improve_readability = gr.Checkbox(
                    label="Improve Readability",
                    value=True,
                    info="Extract main content and remove clutter",
                )
            convert_btn = gr.Button("Convert to Markdown", variant="primary", size="lg")
        # Right column: explanation of each option
        with gr.Column(scale=1):
            gr.Markdown("""
            ### Options Explained:
            **Include Title**: Adds the webpage's title as an H1 heading at the top of the markdown.
            **Ignore Links**: Removes all hyperlinks, keeping only the link text.
            **Improve Readability**: Uses Mozilla's Readability algorithm to extract the main content and remove navigation, ads, and other clutter.
            """)

    with gr.Row():
        extracted_title = gr.Textbox(
            label="Extracted Title",
            interactive=False,
            lines=1,
            info="The title extracted from the webpage",
        )

    markdown_output = gr.Textbox(
        label="Markdown Output",
        lines=20,
        max_lines=50,
        show_copy_button=True,
        info="The converted Markdown content",
    )

    # Every trigger and the examples widget use the same inputs and outputs
    conversion_inputs = [url_input, include_title, ignore_links, improve_readability]
    conversion_outputs = [markdown_output, extracted_title]

    # Run the conversion on button click and on Enter in the URL field
    for register_trigger in (convert_btn.click, url_input.submit):
        register_trigger(
            fn=convert_url_to_markdown,
            inputs=conversion_inputs,
            outputs=conversion_outputs,
        )

    # Examples; each row is (url, include_title, ignore_links, improve_readability)
    example_rows = [
        ["https://www.mozilla.org/en-US/firefox/", True, False, True],
        ["https://github.com/python/cpython", True, False, True],
        ["https://docs.python.org/3/tutorial/", False, True, True],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=conversion_inputs,
        outputs=conversion_outputs,
        fn=convert_url_to_markdown,
        cache_examples=False,
    )

    # Footer: tips, supported sites and credits
    gr.Markdown("""
    ---
    ### Tips:
    - The converter works best with article-style content
    - Some websites may block automated requests
    - Large pages may take a few seconds to process
    - For best results, keep "Improve Readability" enabled
    ### Supported Sites:
    Most standard websites work well. Some sites with heavy JavaScript or anti-bot measures may not work properly.
    ---
    ### Credits:
    This Gradio app was inspired by and is a rewrite of [macsplit/urltomarkdown](https://github.com/macsplit/urltomarkdown).
    """)
# Start the server only when this file is executed as a script
if __name__ == "__main__":
    # Listen on all interfaces at port 7860, with no share link requested
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    app.launch(**launch_options)