Create app.py
app.py
ADDED
import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd


def normalize_url(url: str) -> str:
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')


def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")


def extract_urls_from_sitemap(content: str) -> List[str]:
    urls = []
    try:
        root = ET.fromstring(content)
        ns = {
            'ns': root.tag.split('}')[0].strip('{')
        } if '}' in root.tag else {}

        # Handle sitemap index
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall('.//ns:loc', ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset
        else:
            for url in root.findall('.//ns:loc', ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls


def get_common_sitemap_urls(base_url: str) -> List[str]:
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]


def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
    return [
        line.split(': ')[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]


def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
    try:
        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
                }],
                'max_tokens': 2048,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            })
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']
    except Exception as e:
        return f"Error generating Hyperbolic summary: {str(e)}"


def generate_groq_summary(url: str, content: str, api_key: str) -> str:
    try:
        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating Groq summary: {str(e)}"

def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
    try:
        headers = {"Accept": "text/plain"}
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        response = requests.post("https://md.dhr.wtf/",
                                 params={"url": url},
                                 headers=headers)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error fetching content: {str(e)}"


def generate_llms_txt(summaries: List[Dict]) -> str:
    if not summaries:
        return "No summaries generated"

    # Find homepage
    homepage = next(
        (s for s in summaries if urlparse(s['url']).path in ['', '/']),
        summaries[0])

    content = f"# {urlparse(homepage['url']).hostname}\n\n"
    content += f"> {homepage['summary']}\n\n"
    content += "## Main Pages\n\n"

    for summary in summaries:
        if summary != homepage:
            path = urlparse(summary['url']).path
            content += f"- [{path}]({summary['url']}): {summary['summary']}\n"

    return content

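# For reference, generate_llms_txt above produces a document shaped roughly
# like this (illustrative values only, not real output):
#
#   # example.com
#
#   > One-sentence summary of the homepage.
#
#   ## Main Pages
#
#   - [/about](https://example.com/about): One-sentence summary of that page.
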
def process_website(
        url: str,
        hyperbolic_key: str = "",
        groq_key: str = "",
        markdowner_key: str = "",
        use_hyperbolic: bool = True,
        progress=gr.Progress()
) -> Tuple[str, Optional[str], List[str]]:
    try:
        # Require an API key for whichever provider is selected
        if (use_hyperbolic and not hyperbolic_key) or (not use_hyperbolic and not groq_key):
            return "Error: Please provide an API key for the selected AI provider", None, []

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if no sitemaps found
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue

        if not sitemap_urls:
            return "Error: No sitemaps found", None, []

        progress(0.4, desc="Processing sitemaps...")

        # Process sitemaps
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                urls = extract_urls_from_sitemap(content)
                all_urls.extend(urls)
            except Exception:
                continue

        if not all_urls:
            return "Error: No URLs found in sitemaps", None, []

        progress(0.6, desc="Generating summaries...")

        # Generate summaries
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)

                # Generate summary with selected provider
                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content, groq_key)

                summaries.append({"url": page_url, "summary": summary})

                # Basic rate limiting between pages
                time.sleep(1)

                progress(0.6 + (0.4 * (i + 1) / len(all_urls)),
                         desc=f"Processing URL {i + 1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {e}")

        # Generate llms.txt
        llms_txt = generate_llms_txt(summaries)

        return llms_txt, json.dumps(summaries, indent=2), all_urls

    except Exception as e:
        return f"Error: {str(e)}", None, []

# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # llms.txt Generator
    Generate AI-powered llms.txt files for any website
    """)

    with gr.Row():
        url_input = gr.Textbox(label="Website URL",
                               placeholder="Enter website URL")
        markdowner_key = gr.Textbox(label="Markdowner API Key (optional)",
                                    placeholder="For higher rate limits",
                                    type="password")

    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(label="Use Hyperbolic",
                                         value=False,
                                         interactive=True)
            hyperbolic_key = gr.Textbox(label="Hyperbolic API Key",
                                        type="password",
                                        visible=True,
                                        placeholder="Enter your Hyperbolic API key")

        with gr.Column():
            use_groq = gr.Checkbox(label="Use Groq",
                                   value=False,
                                   interactive=True)
            groq_key = gr.Textbox(label="Groq API Key",
                                  type="password",
                                  visible=False,
                                  placeholder="Enter your Groq API key")

    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Ensure only one provider is selected
        if use_hyp and use_grq:
            use_grq = False

        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(fn=update_provider_visibility,
                          inputs=[use_hyperbolic, use_groq],
                          outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])

    use_groq.change(fn=update_provider_visibility,
                    inputs=[use_hyperbolic, use_groq],
                    outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])

    generate_btn = gr.Button("Generate", variant="primary")

    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True  # Enable built-in copy button
        )
        json_output = gr.JSON(label="Raw Summaries", visible=True)

    # Add download button
    def download_llms_txt(text):
        """Write the generated text to a local file so gr.File can offer it for download"""
        if not text:
            return gr.update(value=None, visible=False)
        output_path = "llms.txt"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        # gr.File expects a file path; also reveal the component once a file is ready
        return gr.update(value=output_path, visible=True)
    download_btn = gr.File(label="Download llms.txt",
                           visible=False,
                           file_types=[".txt", ".md"])

    download_trigger = gr.Button("Download llms.txt")
    download_trigger.click(fn=download_llms_txt,
                           inputs=[llms_output],
                           outputs=[download_btn])

    urls_found = gr.Dataframe(headers=["URLs Found"],
                              label="Discovered URLs",
                              visible=True)

    def process_and_update(*args):
        result, summaries, urls = process_website(*args)

        # Create DataFrame for URLs
        urls_df = pd.DataFrame(
            {"URLs Found": urls if urls else ["No URLs found"]})

        return {
            llms_output: result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None  # Reset download button
        }

    generate_btn.click(
        process_and_update,
        inputs=[
            url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic
        ],
        outputs=[llms_output, json_output, urls_found, download_btn])

if __name__ == "__main__":
    demo.launch()
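For a quick sanity check outside the Gradio UI, process_website can also be called directly from Python. A minimal sketch, assuming this file is saved as app.py, its dependencies (gradio, requests, groq, pandas) are installed, and a Groq key is exported in a GROQ_API_KEY environment variable (that variable name is an assumption for the example; the app itself only reads keys from the UI fields):

import os

from app import process_website  # importing app.py builds the UI but does not launch it

llms_txt, summaries_json, urls = process_website(
    "https://example.com",                        # any site with a reachable sitemap
    groq_key=os.environ.get("GROQ_API_KEY", ""),  # assumed env var, see note above
    use_hyperbolic=False,                         # route summaries through Groq
    progress=lambda *args, **kwargs: None,        # no-op progress callback outside the UI
)
print(llms_txt)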