martinbowling committed
Commit cdbdb61 · verified · 1 Parent(s): 034e9ee

Create app.py

Files changed (1)
app.py +361 -0
app.py ADDED
@@ -0,0 +1,361 @@
import gradio as gr
import requests
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd


def normalize_url(url: str) -> str:
    """Add a default https:// scheme and strip any trailing slash."""
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')


def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")


def extract_urls_from_sitemap(content: str) -> List[str]:
    """Parse a sitemap or sitemap index and collect all <loc> URLs."""
    urls = []
    try:
        root = ET.fromstring(content)
        # Sitemaps are usually namespaced; only use the 'ns:' prefix when
        # a namespace is actually present, since ElementTree rejects an
        # unknown prefix when the namespace map is empty.
        if '}' in root.tag:
            ns = {'ns': root.tag.split('}')[0].strip('{')}
            loc_path = './/ns:loc'
        else:
            ns = {}
            loc_path = './/loc'

        # Handle sitemap index: recurse into each child sitemap
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall(loc_path, ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset: collect page URLs directly
        else:
            for url in root.findall(loc_path, ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls


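# Illustrative input (hypothetical site): a minimal urlset this parser
# accepts, per the sitemaps.org protocol:
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url><loc>https://example.com/</loc></url>
#   </urlset>

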
def get_common_sitemap_urls(base_url: str) -> List[str]:
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]


def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
    # Split on the first colon only, so 'Sitemap:url' (no space) also works
    return [
        line.split(':', 1)[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]


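# e.g. a robots.txt line "Sitemap: https://example.com/sitemap.xml"
# (hypothetical URL) yields ["https://example.com/sitemap.xml"].

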
def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
    try:
        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
                }],
                'max_tokens': 2048,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            })
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']
    except Exception as e:
        return f"Error generating Hyperbolic summary: {str(e)}"


def generate_groq_summary(url: str, content: str, api_key: str) -> str:
    try:
        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating Groq summary: {str(e)}"


def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
    """Fetch a page as Markdown via the Markdowner service."""
    try:
        headers = {"Accept": "text/plain"}
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        response = requests.post("https://md.dhr.wtf/",
                                 params={"url": url},
                                 headers=headers,
                                 timeout=30)  # avoid hanging on slow pages
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error fetching content: {str(e)}"


def generate_llms_txt(summaries: List[Dict]) -> str:
    if not summaries:
        return "No summaries generated"

    # Find homepage
    homepage = next(
        (s for s in summaries if urlparse(s['url']).path in ['', '/']),
        summaries[0])

    content = f"# {urlparse(homepage['url']).hostname}\n\n"
    content += f"> {homepage['summary']}\n\n"
    content += "## Main Pages\n\n"

    for summary in summaries:
        if summary != homepage:
            path = urlparse(summary['url']).path
            content += f"- [{path}]({summary['url']}): {summary['summary']}\n"

    return content


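# Illustrative shape of the file generate_llms_txt produces (hypothetical
# site and summaries):
#
#   # example.com
#
#   > One-sentence summary of the homepage.
#
#   ## Main Pages
#
#   - [/about](https://example.com/about): One-sentence summary of /about.

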
def process_website(url: str,
                    hyperbolic_key: str = "",
                    groq_key: str = "",
                    markdowner_key: str = "",
                    use_hyperbolic: bool = True,
                    progress=gr.Progress()) -> Tuple[str, str, List[str]]:
    try:
        # Require an API key for whichever provider was selected
        selected_key = hyperbolic_key if use_hyperbolic else groq_key
        if not selected_key:
            return "Error: Please provide an API key for the selected AI provider", None, []

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if robots.txt listed no sitemaps
        if not sitemap_urls:
            for sitemap_url in get_common_sitemap_urls(base_url):
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue

        if not sitemap_urls:
            return "Error: No sitemaps found", None, []

        progress(0.4, desc="Processing sitemaps...")

        # Collect page URLs from every discovered sitemap
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                all_urls.extend(extract_urls_from_sitemap(content))
            except Exception:
                continue

        if not all_urls:
            return "Error: No URLs found in sitemaps", None, []

        progress(0.6, desc="Generating summaries...")

        # Generate a summary for each page with the selected provider
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)

                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(
                        page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content,
                                                    groq_key)

                summaries.append({"url": page_url, "summary": summary})

                time.sleep(1)  # Basic rate limiting between API calls

                progress(0.6 + 0.4 * (i + 1) / len(all_urls),
                         desc=f"Processing URL {i + 1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {e}")

        # Generate llms.txt
        llms_txt = generate_llms_txt(summaries)

        return llms_txt, json.dumps(summaries, indent=2), all_urls

    except Exception as e:
        return f"Error: {str(e)}", None, []


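# Illustrative call (hypothetical key): process_website("example.com",
# groq_key="gsk_...", use_hyperbolic=False) returns the llms.txt text,
# a JSON string of the per-page summaries, and the list of discovered URLs.

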
# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # llms.txt Generator 🤖✨
    Generate AI-powered llms.txt files for any website 🌐
    """)

    with gr.Row():
        url_input = gr.Textbox(label="Website URL",
                               placeholder="Enter website URL")
        markdowner_key = gr.Textbox(label="Markdowner API Key (optional)",
                                    placeholder="For higher rate limits",
                                    type="password")

    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(label="Use Hyperbolic",
                                         value=False,
                                         interactive=True)
            hyperbolic_key = gr.Textbox(
                label="Hyperbolic API Key",
                type="password",
                visible=True,
                placeholder="Enter your Hyperbolic API key")

        with gr.Column():
            use_groq = gr.Checkbox(label="Use Groq",
                                   value=False,
                                   interactive=True)
            groq_key = gr.Textbox(label="Groq API Key",
                                  type="password",
                                  visible=False,
                                  placeholder="Enter your Groq API key")

    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Ensure only one provider is selected; when both are checked,
        # Hyperbolic takes precedence and Groq is unchecked
        if use_hyp and use_grq:
            use_grq = False

        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])

    use_groq.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])

    generate_btn = gr.Button("Generate 🚀", variant="primary")

    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True  # Enable built-in copy button
        )
        json_output = gr.JSON(label="Raw Summaries", visible=True)

    # Download support: gr.File serves a file path, so write the text
    # to disk rather than returning raw bytes
    def download_llms_txt(text):
        """Write the generated text to llms.txt and return its path."""
        if not text:
            return None
        path = "llms.txt"
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
        return path

    download_btn = gr.File(label="Download llms.txt",
                           visible=True,  # visible so the file can be saved
                           file_types=[".txt", ".md"])

    download_trigger = gr.Button("Download llms.txt 📥")
    download_trigger.click(fn=download_llms_txt,
                           inputs=[llms_output],
                           outputs=[download_btn])

    urls_found = gr.Dataframe(headers=["URLs Found"],
                              label="Discovered URLs",
                              visible=True)

    def process_and_update(*args):
        result, summaries, urls = process_website(*args)

        # Create DataFrame for URLs
        urls_df = pd.DataFrame(
            {"URLs Found": urls if urls else ["No URLs found"]})

        return {
            llms_output: result,
            json_output: summaries if summaries else None,  # None clears view
            urls_found: urls_df,
            download_btn: None  # Reset download button
        }

    generate_btn.click(
        process_and_update,
        inputs=[
            url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic
        ],
        outputs=[llms_output, json_output, urls_found, download_btn])

if __name__ == "__main__":
    demo.launch()
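
Note: a minimal way to try this Space locally, assuming only the dependencies
visible in the imports (the Space's own requirements.txt is not part of this
commit):

pip install gradio requests groq pandas
python app.py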