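"""llms.txt Generator: a Gradio app that discovers a site's sitemap via robots.txt
or common sitemap locations, fetches each page as text through the Markdowner
service, summarizes every page with Hyperbolic or Groq, and assembles the results
into downloadable llms.txt and llms-full.txt files."""
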
import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd
import os

def normalize_url(url: str) -> str:
    """Ensure the URL has a scheme and strip any trailing slash."""
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')

def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")

def extract_urls_from_sitemap(content: str) -> List[str]:
    urls = []
    try:
        root = ET.fromstring(content)
        # Sitemaps are normally namespaced; build a prefix map only when one is present
        ns = {
            'ns': root.tag.split('}')[0].strip('{')
        } if '}' in root.tag else {}
        loc_path = './/ns:loc' if ns else './/loc'
        # Handle sitemap index
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall(loc_path, ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset
        else:
            for url in root.findall(loc_path, ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls

def get_common_sitemap_urls(base_url: str) -> List[str]:
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]

def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
    return [
        line.split(':', 1)[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]
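
# Illustrative robots.txt directive that the parser above picks up:
#   Sitemap: https://example.com/sitemap.xml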

def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')
        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json; charset=utf-8',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
                }],
                'max_tokens': 200,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        summary = result['choices'][0]['message']['content']
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_hyperbolic_summary: {str(e)}")
        return f"Error generating Hyperbolic summary: {str(e)}"

def generate_groq_summary(url: str, content: str, api_key: str) -> str:
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')
        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                'role': 'user',
                'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
            top_p=0.9,
            stream=False
        )
        summary = completion.choices[0].message.content
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_groq_summary: {str(e)}")
        return f"Error generating Groq summary: {str(e)}"

def generate_llms_txt(summaries: List[Dict[str, str]]) -> str:
    if not summaries:
        return ""
    return "\n".join([
        f"# {summary['url']}\n\n{summary['summary']}\n\n---\n"
        for summary in summaries
    ])
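
# Each llms.txt entry emitted above takes this shape (illustrative):
#
#   # https://example.com/page
#
#   One-sentence summary of the page.
#
#   ---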

def generate_llms_full_txt(summaries: List[Dict]) -> str:
    if not summaries:
        return "No content generated"
    content = ""
    for summary in summaries:
        content += f"# {summary['url']}\n\n"
        content += f"{summary.get('fullContent', 'No content available')}\n\n"
        content += "---\n\n"
    return content

def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
    try:
        headers = {
            "Accept": "text/plain",
            "Accept-Language": "en-US,en;q=0.9",
            "User-Agent": "Mozilla/5.0 (compatible; SitemapParser/1.0)",
            "Origin": "http://localhost:3000",
            "Referer": "http://localhost:3000/",
        }
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        # Build the Markdowner request URL directly
        encoded_url = requests.utils.quote(url)
        full_url = f"https://md.dhr.wtf/?url={encoded_url}"
        print(f"Requesting URL: {full_url}")  # Debug logging
        print(f"Headers: {headers}")  # Debug logging

        response = requests.get(
            full_url,
            headers=headers,
            timeout=30
        )
        response.encoding = 'utf-8'
        response.raise_for_status()

        if response.status_code == 200:
            return response.text
        else:
            print(f"Response status: {response.status_code}")  # Debug logging
            print(f"Response headers: {response.headers}")  # Debug logging
            print(f"Response text: {response.text[:500]}")  # Debug logging
            return f"Error fetching content: {response.status_code} {response.reason}"
    except Exception as e:
        print(f"Error fetching content for {url}: {str(e)}")
        return f"Error fetching content: {str(e)}"

def process_website(
    url: str,
    hyperbolic_key: str = "",
    groq_key: str = "",
    markdowner_key: str = "",
    use_hyperbolic: bool = True,
    progress=gr.Progress()
) -> Tuple[str, str, List[str], str]:
    try:
        # Require an API key for whichever provider is selected
        if (use_hyperbolic and not hyperbolic_key) or (not use_hyperbolic and not groq_key):
            return "Error: Please provide an API key for the selected AI provider", None, [], ""

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if no sitemaps found
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue

        if not sitemap_urls:
            return "Error: No sitemaps found", None, [], ""

        progress(0.4, desc="Processing sitemaps...")

        # Process sitemaps
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                urls = extract_urls_from_sitemap(content)
                all_urls.extend(urls)
            except Exception:
                continue

        if not all_urls:
            return "Error: No URLs found in sitemaps", None, [], ""

        progress(0.6, desc="Generating summaries...")

        # Generate summaries
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)

                # Store full content for llms-full.txt
                full_content = content

                # Generate summary with selected provider
                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content, groq_key)

                summaries.append({
                    "url": page_url,
                    "summary": summary,
                    "fullContent": full_content,
                    "provider": "hyperbolic" if use_hyperbolic else "groq"
                })

                # Rate limiting
                time.sleep(1)
                progress((0.6 + (0.4 * (i + 1) / len(all_urls))),
                         desc=f"Processing URL {i+1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {str(e)}")
                continue

        # Generate both formats
        llms_txt = generate_llms_txt(summaries)
        llms_full_txt = generate_llms_full_txt(summaries)
        return llms_txt, json.dumps(summaries, ensure_ascii=False, indent=2), all_urls, llms_full_txt
    except Exception as e:
        print(f"Error in process_website: {str(e)}")
        return f"Processing failed: {str(e)}", None, [], ""

# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# llms.txt Generator 🤖✨
Generate AI-powered llms.txt files for any website 🌐
""")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter website URL"
        )
        markdowner_key = gr.Textbox(
            label="Markdowner API Key (Optional)",
            placeholder="For higher rate limits",
            type="password",
            container=True,
            scale=2
        )

    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(
                label="Use Hyperbolic",
                value=True,
                interactive=True
            )
            hyperbolic_key = gr.Textbox(
                label="Hyperbolic API Key",
                type="password",
                visible=True,
                placeholder="Enter your Hyperbolic API key",
                container=False,
                scale=2
            )
        with gr.Column():
            use_groq = gr.Checkbox(
                label="Use Groq",
                value=False,
                interactive=True
            )
            groq_key = gr.Textbox(
                label="Groq API Key",
                type="password",
                visible=False,
                placeholder="Enter your Groq API key",
                container=False,
                scale=2
            )

    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Ensure only one provider is selected
        if use_hyp and use_grq:
            use_grq = False
        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )
    use_groq.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )

    generate_btn = gr.Button("Generate 🚀", variant="primary")

    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True
        )
        llms_full_output = gr.TextArea(
            label="Generated llms-full.txt",
            placeholder="Full content will appear here...",
            lines=10,
            show_copy_button=True
        )

    # Add JSON output for debugging
    json_output = gr.JSON(
        label="Debug Output (JSON)",
        visible=True
    )

    # Add download buttons for both files
    def download_txt(text: str, filename: str) -> str:
        """Convert text to downloadable format"""
        if not text:
            return None
        # Create a file with the proper name
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
        return filename

    download_btn = gr.File(
        label="Download llms.txt",
        visible=True,
        file_types=[".txt"]
    )
    download_full_btn = gr.File(
        label="Download llms-full.txt",
        visible=True,
        file_types=[".txt"]
    )

    download_trigger = gr.Button("Download llms.txt 📥")
    download_full_trigger = gr.Button("Download llms-full.txt 📥")

    download_trigger.click(
        fn=lambda x: download_txt(x, "llms.txt"),
        inputs=[llms_output],
        outputs=[download_btn]
    )
    download_full_trigger.click(
        fn=lambda x: download_txt(x, "llms-full.txt"),
        inputs=[llms_full_output],
        outputs=[download_full_btn]
    )

    # Clean up function to remove temporary files
    def cleanup():
        try:
            if os.path.exists("llms.txt"):
                os.remove("llms.txt")
            if os.path.exists("llms-full.txt"):
                os.remove("llms-full.txt")
        except OSError:
            pass

    urls_found = gr.Dataframe(
        headers=["URLs Found"],
        label="Discovered URLs",
        visible=True
    )

    def process_and_update(*args):
        result, summaries, urls, full_result = process_website(*args)
        urls_df = pd.DataFrame({
            "URLs Found": urls if urls else ["No URLs found"]
        })
        # Clean up any existing temporary files
        cleanup()
        return {
            llms_output: result,
            llms_full_output: full_result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None,
            download_full_btn: None
        }

    generate_btn.click(
        process_and_update,
        inputs=[url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic],
        outputs=[llms_output, llms_full_output, json_output, urls_found, download_btn, download_full_btn]
    )

if __name__ == "__main__":
    demo.launch()