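"""llms.txt Generator (Gradio app).

Discovers a site's sitemap (via robots.txt or common locations), converts
each page to markdown through the Markdowner service (md.dhr.wtf),
summarizes every page with either Hyperbolic or Groq, and assembles the
results into downloadable llms.txt and llms-full.txt files.
"""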
import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd
import os
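
# URL and sitemap discovery helpers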
def normalize_url(url: str) -> str:
    """Ensure the URL has a scheme and no trailing slash."""
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')
def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")
def extract_urls_from_sitemap(content: str) -> List[str]:
    """Extract page URLs from a sitemap, recursing into sitemap indexes."""
    urls = []
    try:
        root = ET.fromstring(content)
        # Sitemaps normally declare an XML namespace; fall back to a bare
        # path when the document has none (an 'ns:' prefix with an empty
        # namespace map would raise a SyntaxError in findall)
        if '}' in root.tag:
            ns = {'ns': root.tag.split('}')[0].strip('{')}
            loc_path = './/ns:loc'
        else:
            ns = {}
            loc_path = './/loc'
        # Handle sitemap index: fetch each child sitemap recursively
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall(loc_path, ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset: the <loc> entries are the pages themselves
        else:
            for url in root.findall(loc_path, ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls
def get_common_sitemap_urls(base_url: str) -> List[str]:
    """Return the sitemap locations most sites use."""
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]
def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
    """Pull sitemap URLs from 'Sitemap:' directives in robots.txt."""
    # split(':', 1) tolerates a missing space after the colon and keeps
    # the '://' inside the URL intact
    return [
        line.split(':', 1)[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]
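
# Per-page summarization (Hyperbolic or Groq)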
def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
    """Summarize a page in one sentence via the Hyperbolic chat API."""
    try:
        # Ensure content is valid UTF-8 before embedding it in the prompt
        content = content.encode('utf-8', errors='ignore').decode('utf-8')
        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json; charset=utf-8',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
                }],
                'max_tokens': 200,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        summary = result['choices'][0]['message']['content']
        # Extract the summary from its tags, falling back to the raw reply
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_hyperbolic_summary: {str(e)}")
        return f"Error generating Hyperbolic summary: {str(e)}"
def generate_groq_summary(url: str, content: str, api_key: str) -> str:
    """Summarize a page in one sentence via the Groq chat API."""
    try:
        # Ensure content is valid UTF-8 before embedding it in the prompt
        content = content.encode('utf-8', errors='ignore').decode('utf-8')
        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                'role': 'user',
                'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
            top_p=0.9,
            stream=False
        )
        summary = completion.choices[0].message.content
        # Extract the summary from its tags, falling back to the raw reply
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_groq_summary: {str(e)}")
        return f"Error generating Groq summary: {str(e)}"
def generate_llms_txt(summaries: List[Dict[str, str]]) -> str:
    """Render URL/summary pairs as llms.txt sections."""
    if not summaries:
        return ""
    return "\n".join([
        f"# {summary['url']}\n\n{summary['summary']}\n\n---\n"
        for summary in summaries
    ])
def generate_llms_full_txt(summaries: List[Dict]) -> str:
    """Render URL/full-content pairs as llms-full.txt sections."""
    if not summaries:
        return "No content generated"
    content = ""
    for summary in summaries:
        content += f"# {summary['url']}\n\n"
        content += f"{summary.get('fullContent', 'No content available')}\n\n"
        content += "---\n\n"
    return content
def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
    """Fetch a page as markdown via the Markdowner service (md.dhr.wtf)."""
    try:
        headers = {
            "Accept": "text/plain",
            "Accept-Language": "en-US,en;q=0.9",
            "User-Agent": "Mozilla/5.0 (compatible; SitemapParser/1.0)",
            "Origin": "http://localhost:3000",
            "Referer": "http://localhost:3000/",
        }
        # A Markdowner key is optional; it raises the service's rate limits
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"
        encoded_url = requests.utils.quote(url)
        full_url = f"https://md.dhr.wtf/?url={encoded_url}"
        print(f"Requesting URL: {full_url}")  # Debug logging
        response = requests.get(full_url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        # raise_for_status() raises for any non-2xx response, so a plain
        # return suffices here
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching content for {url}: {str(e)}")
        return f"Error fetching content: {str(e)}"
def process_website(
    url: str,
    hyperbolic_key: str = "",
    groq_key: str = "",
    markdowner_key: str = "",
    use_hyperbolic: bool = True,
    progress=gr.Progress()
) -> Tuple[str, Optional[str], List[str], str]:
    """Return (llms.txt, summaries JSON, discovered URLs, llms-full.txt)."""
    try:
        # The selected provider must have an API key
        active_key = hyperbolic_key if use_hyperbolic else groq_key
        if not active_key:
            return "Error: Please provide an API key for the selected AI provider", None, [], ""
        base_url = normalize_url(url)
        progress(0, desc="Initializing...")
        # Try robots.txt first, since it usually lists the sitemap
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass
        progress(0.2, desc="Checking common sitemap locations...")
        # Fall back to common locations if robots.txt yielded nothing
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue
        if not sitemap_urls:
            return "Error: No sitemaps found", None, [], ""
        progress(0.4, desc="Processing sitemaps...")
        # Collect page URLs from every discovered sitemap
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                urls = extract_urls_from_sitemap(content)
                all_urls.extend(urls)
            except Exception:
                continue
        if not all_urls:
            return "Error: No URLs found in sitemaps", None, [], ""
        progress(0.6, desc="Generating summaries...")
        # Generate summaries
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)
                # Store full content for llms-full.txt
                full_content = content
                # Generate summary with selected provider
                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content, groq_key)
                summaries.append({
                    "url": page_url,
                    "summary": summary,
                    "fullContent": full_content,
                    "provider": "hyperbolic" if use_hyperbolic else "groq"
                })
                # Rate limiting
                time.sleep(1)
                progress((0.6 + (0.4 * (i + 1) / len(all_urls))),
                         desc=f"Processing URL {i+1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {str(e)}")
                continue
        # Generate both formats
        llms_txt = generate_llms_txt(summaries)
        llms_full_txt = generate_llms_full_txt(summaries)
        return llms_txt, json.dumps(summaries, ensure_ascii=False, indent=2), all_urls, llms_full_txt
    except Exception as e:
        print(f"Error in process_website: {str(e)}")
        return f"Processing failed: {str(e)}", None, [], ""
# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # llms.txt Generator 🚀✨
    Generate AI-powered llms.txt files for any website 🌐
    """)
    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter website URL"
        )
        markdowner_key = gr.Textbox(
            label="Markdowner API Key (Optional)",
            placeholder="For higher rate limits",
            type="password",
            container=True,
            scale=2
        )
    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(
                label="Use Hyperbolic",
                value=True,
                interactive=True
            )
            hyperbolic_key = gr.Textbox(
                label="Hyperbolic API Key",
                type="password",
                visible=True,
                placeholder="Enter your Hyperbolic API key",
                container=False,
                scale=2
            )
        with gr.Column():
            use_groq = gr.Checkbox(
                label="Use Groq",
                value=False,
                interactive=True
            )
            groq_key = gr.Textbox(
                label="Groq API Key",
                type="password",
                visible=False,
                placeholder="Enter your Groq API key",
                container=False,
                scale=2
            )
    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Keep the providers mutually exclusive; when both end up checked,
        # Hyperbolic wins (uncheck it first to switch to Groq)
        if use_hyp and use_grq:
            use_grq = False
        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )
    use_groq.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )
    generate_btn = gr.Button("Generate 🚀", variant="primary")
    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True
        )
        llms_full_output = gr.TextArea(
            label="Generated llms-full.txt",
            placeholder="Full content will appear here...",
            lines=10,
            show_copy_button=True
        )
    # JSON output for debugging
    json_output = gr.JSON(
        label="Debug Output (JSON)",
        visible=True
    )
    # Download helpers: write the text to a named file and hand the path
    # to a gr.File component
    def download_txt(text: str, filename: str) -> Optional[str]:
        """Write text to a file and return its path for download."""
        if not text:
            return None
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
        return filename

    download_btn = gr.File(
        label="Download llms.txt",
        visible=True,
        file_types=[".txt"]
    )
    download_full_btn = gr.File(
        label="Download llms-full.txt",
        visible=True,
        file_types=[".txt"]
    )
    download_trigger = gr.Button("Download llms.txt 📥")
    download_full_trigger = gr.Button("Download llms-full.txt 📥")
    download_trigger.click(
        fn=lambda x: download_txt(x, "llms.txt"),
        inputs=[llms_output],
        outputs=[download_btn]
    )
    download_full_trigger.click(
        fn=lambda x: download_txt(x, "llms-full.txt"),
        inputs=[llms_full_output],
        outputs=[download_full_btn]
    )
    # Remove temporary files left over from a previous run
    def cleanup():
        try:
            if os.path.exists("llms.txt"):
                os.remove("llms.txt")
            if os.path.exists("llms-full.txt"):
                os.remove("llms-full.txt")
        except OSError:
            pass

    urls_found = gr.Dataframe(
        headers=["URLs Found"],
        label="Discovered URLs",
        visible=True
    )
    def process_and_update(url, hyperbolic_key_val, groq_key_val,
                           markdowner_key_val, use_hyp, progress=gr.Progress()):
        # Declaring gr.Progress() on the event handler lets Gradio inject a
        # tracked progress object, which is then passed down to process_website
        result, summaries, urls, full_result = process_website(
            url, hyperbolic_key_val, groq_key_val, markdowner_key_val,
            use_hyp, progress=progress
        )
        urls_df = pd.DataFrame({
            "URLs Found": urls if urls else ["No URLs found"]
        })
        # Clean up any existing temporary files
        cleanup()
        return {
            llms_output: result,
            llms_full_output: full_result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None,
            download_full_btn: None
        }

    generate_btn.click(
        process_and_update,
        inputs=[url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic],
        outputs=[llms_output, llms_full_output, json_output, urls_found, download_btn, download_full_btn]
    )
if __name__ == "__main__":
    demo.launch()