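"""llms.txt Generator: a Gradio app that discovers a site's sitemap via robots.txt
or common sitemap locations, fetches each page as text through the Markdowner
service, summarizes every page with Hyperbolic or Groq, and assembles the results
into downloadable llms.txt and llms-full.txt files."""
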
import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd
import os

def normalize_url(url: str) -> str:
    """Ensure the URL has a scheme and strip any trailing slash."""
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')

def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")

def extract_urls_from_sitemap(content: str) -> List[str]:
    urls = []
    try:
        root = ET.fromstring(content)
        # Sitemaps are normally namespaced; build a prefix map only when one is present
        ns = {
            'ns': root.tag.split('}')[0].strip('{')
        } if '}' in root.tag else {}
        loc_path = './/ns:loc' if ns else './/loc'
        # Handle sitemap index
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall(loc_path, ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset
        else:
            for url in root.findall(loc_path, ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls

def get_common_sitemap_urls(base_url: str) -> List[str]:
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]

def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
    return [
        line.split(':', 1)[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]
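
# Illustrative robots.txt directive that the parser above picks up:
#   Sitemap: https://example.com/sitemap.xml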

def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')
        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json; charset=utf-8',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
                }],
                'max_tokens': 200,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        summary = result['choices'][0]['message']['content']
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_hyperbolic_summary: {str(e)}")
        return f"Error generating Hyperbolic summary: {str(e)}"

def generate_groq_summary(url: str, content: str, api_key: str) -> str:
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')
        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                'role': 'user',
                'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
            top_p=0.9,
            stream=False
        )
        summary = completion.choices[0].message.content
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_groq_summary: {str(e)}")
        return f"Error generating Groq summary: {str(e)}"

def generate_llms_txt(summaries: List[Dict[str, str]]) -> str:
    if not summaries:
        return ""
    return "\n".join([
        f"# {summary['url']}\n\n{summary['summary']}\n\n---\n"
        for summary in summaries
    ])
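
# Each llms.txt entry emitted above takes this shape (illustrative):
#
#   # https://example.com/page
#
#   One-sentence summary of the page.
#
#   ---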

def generate_llms_full_txt(summaries: List[Dict]) -> str:
    if not summaries:
        return "No content generated"
    content = ""
    for summary in summaries:
        content += f"# {summary['url']}\n\n"
        content += f"{summary.get('fullContent', 'No content available')}\n\n"
        content += "---\n\n"
    return content

def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
    try:
        headers = {
            "Accept": "text/plain",
            "Accept-Language": "en-US,en;q=0.9",
            "User-Agent": "Mozilla/5.0 (compatible; SitemapParser/1.0)",
            "Origin": "http://localhost:3000",
            "Referer": "http://localhost:3000/",
        }
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        # Build the Markdowner request URL directly
        encoded_url = requests.utils.quote(url)
        full_url = f"https://md.dhr.wtf/?url={encoded_url}"
        print(f"Requesting URL: {full_url}")  # Debug logging
        print(f"Headers: {headers}")  # Debug logging

        response = requests.get(
            full_url,
            headers=headers,
            timeout=30
        )
        response.encoding = 'utf-8'
        response.raise_for_status()

        if response.status_code == 200:
            return response.text
        else:
            print(f"Response status: {response.status_code}")  # Debug logging
            print(f"Response headers: {response.headers}")  # Debug logging
            print(f"Response text: {response.text[:500]}")  # Debug logging
            return f"Error fetching content: {response.status_code} {response.reason}"
    except Exception as e:
        print(f"Error fetching content for {url}: {str(e)}")
        return f"Error fetching content: {str(e)}"

def process_website(
    url: str,
    hyperbolic_key: str = "",
    groq_key: str = "",
    markdowner_key: str = "",
    use_hyperbolic: bool = True,
    progress=gr.Progress()
) -> Tuple[str, str, List[str], str]:
    try:
        # Require an API key for whichever provider is selected
        if (use_hyperbolic and not hyperbolic_key) or (not use_hyperbolic and not groq_key):
            return "Error: Please provide an API key for the selected AI provider", None, [], ""

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if no sitemaps found
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue

        if not sitemap_urls:
            return "Error: No sitemaps found", None, [], ""

        progress(0.4, desc="Processing sitemaps...")

        # Process sitemaps
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                urls = extract_urls_from_sitemap(content)
                all_urls.extend(urls)
            except Exception:
                continue

        if not all_urls:
            return "Error: No URLs found in sitemaps", None, [], ""

        progress(0.6, desc="Generating summaries...")

        # Generate summaries
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)

                # Store full content for llms-full.txt
                full_content = content

                # Generate summary with selected provider
                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content, groq_key)

                summaries.append({
                    "url": page_url,
                    "summary": summary,
                    "fullContent": full_content,
                    "provider": "hyperbolic" if use_hyperbolic else "groq"
                })

                # Rate limiting
                time.sleep(1)
                progress((0.6 + (0.4 * (i + 1) / len(all_urls))),
                         desc=f"Processing URL {i+1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {str(e)}")
                continue

        # Generate both formats
        llms_txt = generate_llms_txt(summaries)
        llms_full_txt = generate_llms_full_txt(summaries)
        return llms_txt, json.dumps(summaries, ensure_ascii=False, indent=2), all_urls, llms_full_txt
    except Exception as e:
        print(f"Error in process_website: {str(e)}")
        return f"Processing failed: {str(e)}", None, [], ""

# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# llms.txt Generator 🤖✨
Generate AI-powered llms.txt files for any website 🌐
""")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter website URL"
        )
        markdowner_key = gr.Textbox(
            label="Markdowner API Key (Optional)",
            placeholder="For higher rate limits",
            type="password",
            container=True,
            scale=2
        )

    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(
                label="Use Hyperbolic",
                value=True,
                interactive=True
            )
            hyperbolic_key = gr.Textbox(
                label="Hyperbolic API Key",
                type="password",
                visible=True,
                placeholder="Enter your Hyperbolic API key",
                container=False,
                scale=2
            )
        with gr.Column():
            use_groq = gr.Checkbox(
                label="Use Groq",
                value=False,
                interactive=True
            )
            groq_key = gr.Textbox(
                label="Groq API Key",
                type="password",
                visible=False,
                placeholder="Enter your Groq API key",
                container=False,
                scale=2
            )

    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Ensure only one provider is selected
        if use_hyp and use_grq:
            use_grq = False
        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )
    use_groq.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )

    generate_btn = gr.Button("Generate 🚀", variant="primary")

    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True
        )
        llms_full_output = gr.TextArea(
            label="Generated llms-full.txt",
            placeholder="Full content will appear here...",
            lines=10,
            show_copy_button=True
        )

    # Add JSON output for debugging
    json_output = gr.JSON(
        label="Debug Output (JSON)",
        visible=True
    )

    # Add download buttons for both files
    def download_txt(text: str, filename: str) -> str:
        """Convert text to downloadable format"""
        if not text:
            return None
        # Create a file with the proper name
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
        return filename

    download_btn = gr.File(
        label="Download llms.txt",
        visible=True,
        file_types=[".txt"]
    )
    download_full_btn = gr.File(
        label="Download llms-full.txt",
        visible=True,
        file_types=[".txt"]
    )

    download_trigger = gr.Button("Download llms.txt 📥")
    download_full_trigger = gr.Button("Download llms-full.txt 📥")

    download_trigger.click(
        fn=lambda x: download_txt(x, "llms.txt"),
        inputs=[llms_output],
        outputs=[download_btn]
    )
    download_full_trigger.click(
        fn=lambda x: download_txt(x, "llms-full.txt"),
        inputs=[llms_full_output],
        outputs=[download_full_btn]
    )

    # Clean up function to remove temporary files
    def cleanup():
        try:
            if os.path.exists("llms.txt"):
                os.remove("llms.txt")
            if os.path.exists("llms-full.txt"):
                os.remove("llms-full.txt")
        except OSError:
            pass

    urls_found = gr.Dataframe(
        headers=["URLs Found"],
        label="Discovered URLs",
        visible=True
    )

    def process_and_update(*args):
        result, summaries, urls, full_result = process_website(*args)
        urls_df = pd.DataFrame({
            "URLs Found": urls if urls else ["No URLs found"]
        })
        # Clean up any existing temporary files
        cleanup()
        return {
            llms_output: result,
            llms_full_output: full_result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None,
            download_full_btn: None
        }

    generate_btn.click(
        process_and_update,
        inputs=[url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic],
        outputs=[llms_output, llms_full_output, json_output, urls_found, download_btn, download_full_btn]
    )

if __name__ == "__main__":
    demo.launch()