Create app.py
app.py
ADDED
import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd


def normalize_url(url: str) -> str:
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')


def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")


def extract_urls_from_sitemap(content: str) -> List[str]:
    urls = []
    try:
        root = ET.fromstring(content)
        ns = {
            'ns': root.tag.split('}')[0].strip('{')
        } if '}' in root.tag else {}

        # Handle sitemap index
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall('.//ns:loc', ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset
        else:
            for url in root.findall('.//ns:loc', ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls


def get_common_sitemap_urls(base_url: str) -> List[str]:
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]


def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
    return [
        line.split(': ')[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]


def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
    try:
        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
                }],
                'max_tokens': 2048,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            })
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']
    except Exception as e:
        return f"Error generating Hyperbolic summary: {str(e)}"


def generate_groq_summary(url: str, content: str, api_key: str) -> str:
    try:
        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating Groq summary: {str(e)}"

def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
    try:
        headers = {"Accept": "text/plain"}
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        response = requests.post("https://md.dhr.wtf/",
                                 params={"url": url},
                                 headers=headers)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error fetching content: {str(e)}"


def generate_llms_txt(summaries: List[Dict]) -> str:
    if not summaries:
        return "No summaries generated"

    # Find homepage
    homepage = next(
        (s for s in summaries if urlparse(s['url']).path in ['', '/']),
        summaries[0])

    content = f"# {urlparse(homepage['url']).hostname}\n\n"
    content += f"> {homepage['summary']}\n\n"
    content += "## Main Pages\n\n"

    for summary in summaries:
        if summary != homepage:
            path = urlparse(summary['url']).path
            content += f"- [{path}]({summary['url']}): {summary['summary']}\n"

    return content

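# For reference, generate_llms_txt above produces a document shaped roughly
# like this (illustrative values only, not real output):
#
#   # example.com
#
#   > One-sentence summary of the homepage.
#
#   ## Main Pages
#
#   - [/about](https://example.com/about): One-sentence summary of that page.
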
def process_website(
        url: str,
        hyperbolic_key: str = "",
        groq_key: str = "",
        markdowner_key: str = "",
        use_hyperbolic: bool = True,
        progress=gr.Progress()
) -> Tuple[str, Optional[str], List[str]]:
    try:
        # Require an API key for whichever provider is selected
        if (use_hyperbolic and not hyperbolic_key) or (not use_hyperbolic and not groq_key):
            return "Error: Please provide an API key for the selected AI provider", None, []

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if no sitemaps found
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue

        if not sitemap_urls:
            return "Error: No sitemaps found", None, []

        progress(0.4, desc="Processing sitemaps...")

        # Process sitemaps
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                urls = extract_urls_from_sitemap(content)
                all_urls.extend(urls)
            except Exception:
                continue

        if not all_urls:
            return "Error: No URLs found in sitemaps", None, []

        progress(0.6, desc="Generating summaries...")

        # Generate summaries
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)

                # Generate summary with selected provider
                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content, groq_key)

                summaries.append({"url": page_url, "summary": summary})

                # Basic rate limiting between pages
                time.sleep(1)

                progress(0.6 + (0.4 * (i + 1) / len(all_urls)),
                         desc=f"Processing URL {i + 1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {e}")

        # Generate llms.txt
        llms_txt = generate_llms_txt(summaries)

        return llms_txt, json.dumps(summaries, indent=2), all_urls

    except Exception as e:
        return f"Error: {str(e)}", None, []

# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # llms.txt Generator
    Generate AI-powered llms.txt files for any website
    """)

    with gr.Row():
        url_input = gr.Textbox(label="Website URL",
                               placeholder="Enter website URL")
        markdowner_key = gr.Textbox(label="Markdowner API Key (optional)",
                                    placeholder="For higher rate limits",
                                    type="password")

    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(label="Use Hyperbolic",
                                         value=False,
                                         interactive=True)
            hyperbolic_key = gr.Textbox(label="Hyperbolic API Key",
                                        type="password",
                                        visible=True,
                                        placeholder="Enter your Hyperbolic API key")

        with gr.Column():
            use_groq = gr.Checkbox(label="Use Groq",
                                   value=False,
                                   interactive=True)
            groq_key = gr.Textbox(label="Groq API Key",
                                  type="password",
                                  visible=False,
                                  placeholder="Enter your Groq API key")

    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Ensure only one provider is selected
        if use_hyp and use_grq:
            use_grq = False

        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(fn=update_provider_visibility,
                          inputs=[use_hyperbolic, use_groq],
                          outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])

    use_groq.change(fn=update_provider_visibility,
                    inputs=[use_hyperbolic, use_groq],
                    outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])

    generate_btn = gr.Button("Generate", variant="primary")

    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True  # Enable built-in copy button
        )
        json_output = gr.JSON(label="Raw Summaries", visible=True)

    # Add download button
    def download_llms_txt(text):
        """Write the generated text to a local file so gr.File can offer it for download"""
        if not text:
            return gr.update(value=None, visible=False)
        output_path = "llms.txt"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        # gr.File expects a file path; also reveal the component once a file is ready
        return gr.update(value=output_path, visible=True)
    download_btn = gr.File(label="Download llms.txt",
                           visible=False,
                           file_types=[".txt", ".md"])

    download_trigger = gr.Button("Download llms.txt")
    download_trigger.click(fn=download_llms_txt,
                           inputs=[llms_output],
                           outputs=[download_btn])

    urls_found = gr.Dataframe(headers=["URLs Found"],
                              label="Discovered URLs",
                              visible=True)

    def process_and_update(*args):
        result, summaries, urls = process_website(*args)

        # Create DataFrame for URLs
        urls_df = pd.DataFrame(
            {"URLs Found": urls if urls else ["No URLs found"]})

        return {
            llms_output: result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None  # Reset download button
        }

    generate_btn.click(
        process_and_update,
        inputs=[
            url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic
        ],
        outputs=[llms_output, json_output, urls_found, download_btn])

if __name__ == "__main__":
    demo.launch()
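For a quick sanity check outside the Gradio UI, process_website can also be called directly from Python. A minimal sketch, assuming this file is saved as app.py, its dependencies (gradio, requests, groq, pandas) are installed, and a Groq key is exported in a GROQ_API_KEY environment variable (that variable name is an assumption for the example; the app itself only reads keys from the UI fields):

import os

from app import process_website  # importing app.py builds the UI but does not launch it

llms_txt, summaries_json, urls = process_website(
    "https://example.com",                        # any site with a reachable sitemap
    groq_key=os.environ.get("GROQ_API_KEY", ""),  # assumed env var, see note above
    use_hyperbolic=False,                         # route summaries through Groq
    progress=lambda *args, **kwargs: None,        # no-op progress callback outside the UI
)
print(llms_txt)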