nyuuzyou committed on
Commit
c3ac509
·
verified ·
1 Parent(s): 4c8c0a9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -0
app.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import html2text
5
+ from readability import Document
6
+ import re
7
+ from urllib.parse import urljoin, urlparse
8
+ import time
9
+
10
class URLToMarkdownConverter:
    """Fetch a web page and convert its HTML to Markdown.

    Each instance owns one ``requests.Session`` (with a browser-like
    User-Agent) that is reused for every fetch.
    """

    # Only web URLs are fetched. Other schemes (ftp, file, javascript, ...)
    # are rejected during validation instead of failing later in requests.
    ALLOWED_SCHEMES = ("http", "https")

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def is_valid_url(self, url):
        """Return True if *url* is an absolute http(s) URL with a host.

        Args:
            url: Candidate URL; anything that is not a ``str`` is invalid.

        Returns:
            bool: True only when the scheme is in ``ALLOWED_SCHEMES`` and a
            network location is present.
        """
        if not isinstance(url, str):
            return False
        try:
            result = urlparse(url.strip())
        except ValueError:
            # urlparse raises ValueError for malformed input such as an
            # unterminated IPv6 literal ("http://[::1").
            return False
        return result.scheme.lower() in self.ALLOWED_SCHEMES and bool(result.netloc)

    def fetch_webpage(self, url, timeout=10):
        """Download *url* and return ``(text, status_code)``.

        Args:
            url: Address to fetch.
            timeout: Timeout in seconds handed to ``requests``.

        Returns:
            tuple: The decoded response body and the HTTP status code.

        Raises:
            Exception: If the request fails or the server answers with an
                error status; the underlying ``requests`` exception is
                chained as ``__cause__``.
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching URL: {str(e)}") from e

        # When the server does not declare a charset, requests falls back to
        # ISO-8859-1 for text/* responses, which garbles UTF-8 pages. Use the
        # encoding detected from the body in that case.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            response.encoding = response.apparent_encoding or response.encoding

        return response.text, response.status_code

    def extract_title(self, html_content):
        """Return the text of the page's ``<title>`` tag, or "" if absent.

        Runs of whitespace (including newlines) are collapsed to single
        spaces so the title always fits on one Markdown heading line.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            return ""
        return " ".join(title_tag.get_text().split())

    def improve_readability(self, html_content):
        """Return the main-article HTML extracted by readability.

        ``Document.summary()`` is the call that performs article extraction;
        ``Document.content()`` only returns the whole document body.

        This is best effort: if extraction fails for any reason the input
        HTML is returned unchanged.
        """
        try:
            return Document(html_content).summary()
        except Exception:
            return html_content

    def convert_to_markdown(self, html_content, ignore_links=False):
        """Convert an HTML string to Markdown.

        Args:
            html_content: HTML to convert.
            ignore_links: When True, hyperlinks are dropped and only their
                text is kept.

        Returns:
            str: Markdown with images kept, no hard line wrapping, runs of
            blank lines collapsed and surrounding whitespace stripped.
        """
        h = html2text.HTML2Text()
        h.ignore_images = False
        h.ignore_links = ignore_links
        h.body_width = 0  # Don't wrap lines
        h.unicode_snob = True
        h.bypass_tables = False

        markdown = h.handle(html_content)

        # Collapse three or more consecutive (possibly whitespace-only)
        # line breaks into a single blank line.
        markdown = re.sub(r'\n\s*\n\s*\n', '\n\n', markdown)
        return markdown.strip()

    def process_url(self, url, include_title=True, ignore_links=False, improve_readability=True):
        """Fetch *url* and return ``(markdown, title)``.

        Args:
            url: Page address; surrounding whitespace is ignored.
            include_title: Prepend the page title as an H1 heading.
            ignore_links: Strip hyperlinks from the output.
            improve_readability: Reduce the page to its main content before
                converting.

        Returns:
            tuple: ``(markdown, title)``. On any failure the first element is
            a human-readable message and the title is "" — this method does
            not raise.
        """
        if isinstance(url, str):
            url = url.strip()

        if not url:
            return "Please enter a URL", ""

        if not self.is_valid_url(url):
            return "Please enter a valid URL", ""

        try:
            html_content, _ = self.fetch_webpage(url)

            # The title is read from the full page, before readability
            # rewrites the HTML.
            title = self.extract_title(html_content)

            if improve_readability:
                html_content = self.improve_readability(html_content)

            markdown = self.convert_to_markdown(html_content, ignore_links)

            if include_title and title:
                markdown = f"# {title}\n\n{markdown}"

            return markdown, title

        except Exception as e:
            # Boundary for the UI: report the problem in the output box.
            return f"Error processing URL: {str(e)}", ""
96
+
97
# Module-level converter instance used by the Gradio callback below.
converter = URLToMarkdownConverter()
99
+
100
def convert_url_to_markdown(url, include_title, ignore_links, improve_readability):
    """Gradio callback: convert *url* and return ``(markdown, title)``.

    Forwards the textbox value and the three checkbox values to the
    module-level ``converter`` and hands its result back in the order of the
    two output components.
    """
    options = {
        "include_title": include_title,
        "ignore_links": ignore_links,
        "improve_readability": improve_readability,
    }
    page_markdown, page_title = converter.process_url(url=url, **options)
    return page_markdown, page_title
109
+
110
# Create Gradio interface
with gr.Blocks(title="URL to Markdown Converter", theme=gr.themes.Soft()) as app:
    # Page header and usage instructions.
    gr.Markdown("""
    # 🔗 URL to Markdown Converter

    Convert any webpage to clean, readable Markdown format. Perfect for documentation, note-taking, and content archival.

    ## How to use:
    1. Enter a URL in the text box below
    2. Configure your options
    3. Click "Convert to Markdown"
    4. Copy the generated Markdown from the output box
    """)

    with gr.Row():
        # Wide column: URL entry, option checkboxes and the convert button.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                lines=1,
                label="URL",
                placeholder="https://example.com",
                info="Enter the URL of the webpage you want to convert",
            )

            with gr.Row():
                include_title = gr.Checkbox(
                    value=True,
                    label="Include Title",
                    info="Add the page title as a heading",
                )
                ignore_links = gr.Checkbox(
                    value=False,
                    label="Ignore Links",
                    info="Remove all hyperlinks from output",
                )
                improve_readability = gr.Checkbox(
                    value=True,
                    label="Improve Readability",
                    info="Extract main content and remove clutter",
                )

            convert_btn = gr.Button("Convert to Markdown", variant="primary", size="lg")

        # Narrow column: short explanation of each option.
        with gr.Column(scale=1):
            gr.Markdown("""
            ### Options Explained:

            **Include Title**: Adds the webpage's title as an H1 heading at the top of the markdown.

            **Ignore Links**: Removes all hyperlinks, keeping only the link text.

            **Improve Readability**: Uses Mozilla's Readability algorithm to extract the main content and remove navigation, ads, and other clutter.
            """)

    with gr.Row():
        extracted_title = gr.Textbox(
            lines=1,
            label="Extracted Title",
            interactive=False,
            info="The title extracted from the webpage",
        )

    markdown_output = gr.Textbox(
        lines=20,
        max_lines=50,
        label="Markdown Output",
        show_copy_button=True,
        info="The converted Markdown content",
    )

    # The button, the Enter key and the examples all use the same wiring.
    conversion_inputs = [url_input, include_title, ignore_links, improve_readability]
    conversion_outputs = [markdown_output, extracted_title]

    # Event handlers: button click first, then Enter in the URL box.
    for register_listener in (convert_btn.click, url_input.submit):
        register_listener(
            fn=convert_url_to_markdown,
            inputs=conversion_inputs,
            outputs=conversion_outputs,
        )

    # Examples
    example_rows = [
        ["https://www.mozilla.org/en-US/firefox/", True, False, True],
        ["https://github.com/python/cpython", True, False, True],
        ["https://docs.python.org/3/tutorial/", False, True, True],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=conversion_inputs,
        outputs=conversion_outputs,
        fn=convert_url_to_markdown,
        cache_examples=False,
    )

    # Footer: tips, supported sites and credits.
    gr.Markdown("""
    ---

    ### Tips:
    - The converter works best with article-style content
    - Some websites may block automated requests
    - Large pages may take a few seconds to process
    - For best results, keep "Improve Readability" enabled

    ### Supported Sites:
    Most standard websites work well. Some sites with heavy JavaScript or anti-bot measures may not work properly.

    ---

    ### Credits:
    This Gradio app was inspired by and is a rewrite of [macsplit/urltomarkdown](https://github.com/macsplit/urltomarkdown).
    """)
223
+
224
# Launch the app
if __name__ == "__main__":
    # Bind to all interfaces on port 7860, without a public share link,
    # and surface errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    app.launch(**launch_options)