import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd
import os


def normalize_url(url: str) -> str:
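    """Ensure the URL has an http(s) scheme and strip any trailing slash."""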
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')


def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")


def extract_urls_from_sitemap(content: str) -> List[str]:
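    """Parse sitemap XML and return all page URLs, recursing into nested sitemaps
    when the document is a sitemap index."""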
    urls = []
    try:
        root = ET.fromstring(content)
        # Sitemaps usually declare a default XML namespace; build a prefix map if so
        ns = {
            'ns': root.tag.split('}')[0].strip('{')
        } if '}' in root.tag else {}
        loc_path = './/ns:loc' if ns else './/loc'

        # Handle sitemap index
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall(loc_path, ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset
        else:
            for url in root.findall(loc_path, ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls


def get_common_sitemap_urls(base_url: str) -> List[str]:
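    """Return common sitemap locations to probe when robots.txt lists none."""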
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]


def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
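    """Extract the URLs listed on 'Sitemap:' lines of a robots.txt file."""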
    return [
        line.split(':', 1)[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]


def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
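    """Request a one-sentence summary of the page from the Hyperbolic chat completions API."""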
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')

        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json; charset=utf-8',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.

URL: {url}
Content: {content}

Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
                }],
                'max_tokens': 200,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        summary = result['choices'][0]['message']['content']
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_hyperbolic_summary: {str(e)}")
        return f"Error generating Hyperbolic summary: {str(e)}"


def generate_groq_summary(url: str, content: str, api_key: str) -> str:
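    """Request a one-sentence summary of the page from the Groq API."""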
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')

        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                'role': 'user',
                'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.

URL: {url}
Content: {content}

Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
            top_p=0.9,
            stream=False
        )
        summary = completion.choices[0].message.content
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_groq_summary: {str(e)}")
        return f"Error generating Groq summary: {str(e)}"


def generate_llms_txt(summaries: List[Dict[str, str]]) -> str:
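    """Render the llms.txt body: one heading and one-sentence summary per URL."""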
    if not summaries:
        return ""

    return "\n".join([
        f"# {summary['url']}\n\n{summary['summary']}\n\n---\n"
        for summary in summaries
    ])


def generate_llms_full_txt(summaries: List[Dict]) -> str:
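    """Render the llms-full.txt body: one heading and the full page content per URL."""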
    if not summaries:
        return "No content generated"

    content = ""
    for summary in summaries:
        content += f"# {summary['url']}\n\n"
        content += f"{summary.get('fullContent', 'No content available')}\n\n"
        content += "---\n\n"

    return content


def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
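    """Fetch a page converted to Markdown via the Markdowner service (md.dhr.wtf)."""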
    try:
        headers = {
            "Accept": "text/plain",
            "Accept-Language": "en-US,en;q=0.9",
            "User-Agent": "Mozilla/5.0 (compatible; SitemapParser/1.0)",
            "Origin": "http://localhost:3000",
            "Referer": "http://localhost:3000/",
        }
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        # Build the Markdowner request URL with the target page URL-encoded
        encoded_url = requests.utils.quote(url)
        full_url = f"https://md.dhr.wtf/?url={encoded_url}"

        print(f"Requesting URL: {full_url}")  # Debug logging

        response = requests.get(
            full_url,
            headers=headers,
            timeout=30
        )

        response.encoding = 'utf-8'
        response.raise_for_status()
        return response.text

    except Exception as e:
        print(f"Error fetching content for {url}: {str(e)}")
        return f"Error fetching content: {str(e)}"


def process_website(
    url: str,
    hyperbolic_key: str = "",
    groq_key: str = "",
    markdowner_key: str = "",
    use_hyperbolic: bool = True,
    progress=gr.Progress()
) -> Tuple[str, Optional[str], List[str], str]:
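    """Discover the site's sitemaps, summarize each discovered page, and return
    (llms.txt content, summaries JSON, discovered URLs, llms-full.txt content)."""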
    try:
        # Require an API key for whichever provider is selected
        if (use_hyperbolic and not hyperbolic_key) or (not use_hyperbolic and not groq_key):
            return "Error: Please provide an API key for the selected AI provider", None, [], ""

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if no sitemaps found
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue

        if not sitemap_urls:
            return "Error: No sitemaps found", None, [], ""

        progress(0.4, desc="Processing sitemaps...")

        # Process sitemaps
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                urls = extract_urls_from_sitemap(content)
                all_urls.extend(urls)
            except Exception:
                continue

        if not all_urls:
            return "Error: No URLs found in sitemaps", None, [], ""

        progress(0.6, desc="Generating summaries...")

        # Generate summaries
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)

                # Store full content for llms-full.txt
                full_content = content

                # Generate summary with selected provider
                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content, groq_key)

                summaries.append({
                    "url": page_url,
                    "summary": summary,
                    "fullContent": full_content,
                    "provider": "hyperbolic" if use_hyperbolic else "groq"
                })

                # Rate limiting
                time.sleep(1)

                progress((0.6 + (0.4 * (i + 1) / len(all_urls))),
                        desc=f"Processing URL {i+1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {str(e)}")
                continue

        # Generate both formats
        llms_txt = generate_llms_txt(summaries)
        llms_full_txt = generate_llms_full_txt(summaries)

        return llms_txt, json.dumps(summaries, ensure_ascii=False, indent=2), all_urls, llms_full_txt

    except Exception as e:
        print(f"Error in process_website: {str(e)}")
        return f"Processing failed: {str(e)}", None, [], ""


# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # llms.txt Generator πŸ€–βœ¨
    Generate AI-powered llms.txt files for any website 🌐
    """)

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter website URL"
        )
        markdowner_key = gr.Textbox(
            label="Markdowner API Key (Optional)",
            placeholder="For higher rate limits",
            type="password",
            container=True,
            scale=2
        )

    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(
                label="Use Hyperbolic",
                value=True,
                interactive=True
            )
            hyperbolic_key = gr.Textbox(
                label="Hyperbolic API Key",
                type="password",
                visible=True,
                placeholder="Enter your Hyperbolic API key",
                container=False,
                scale=2
            )

        with gr.Column():
            use_groq = gr.Checkbox(
                label="Use Groq",
                value=False,
                interactive=True
            )
            groq_key = gr.Textbox(
                label="Groq API Key",
                type="password",
                visible=False,
                placeholder="Enter your Groq API key",
                container=False,
                scale=2
            )

    # Keep the provider checkboxes mutually exclusive and show only the relevant key field
    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Ensure only one provider is selected
        if use_hyp and use_grq:
            use_grq = False

        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )

    use_groq.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )

    generate_btn = gr.Button("Generate πŸš€", variant="primary")

    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True
        )
        llms_full_output = gr.TextArea(
            label="Generated llms-full.txt",
            placeholder="Full content will appear here...",
            lines=10,
            show_copy_button=True
        )

    # Add JSON output for debugging
    json_output = gr.JSON(
        label="Debug Output (JSON)",
        visible=True
    )

    # Add download buttons for both files
    def download_txt(text: str, filename: str) -> str:
        """Convert text to downloadable format"""
        if not text:
            return None
        # Create a file with the proper name
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
        return filename

    download_btn = gr.File(
        label="Download llms.txt",
        visible=True,
        file_types=[".txt"]
    )

    download_full_btn = gr.File(
        label="Download llms-full.txt",
        visible=True,
        file_types=[".txt"]
    )

    download_trigger = gr.Button("Download llms.txt πŸ“₯")
    download_full_trigger = gr.Button("Download llms-full.txt πŸ“₯")

    download_trigger.click(
        fn=lambda x: download_txt(x, "llms.txt"),
        inputs=[llms_output],
        outputs=[download_btn]
    )

    download_full_trigger.click(
        fn=lambda x: download_txt(x, "llms-full.txt"),
        inputs=[llms_full_output],
        outputs=[download_full_btn]
    )

    # Clean up function to remove temporary files
    def cleanup():
        try:
            if os.path.exists("llms.txt"):
                os.remove("llms.txt")
            if os.path.exists("llms-full.txt"):
                os.remove("llms-full.txt")
        except OSError:
            pass

    urls_found = gr.Dataframe(
        headers=["URLs Found"],
        label="Discovered URLs",
        visible=True
    )

    def process_and_update(*args):
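        """Run the processing pipeline and map its outputs onto the UI components."""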
        result, summaries, urls, full_result = process_website(*args)

        urls_df = pd.DataFrame({
            "URLs Found": urls if urls else ["No URLs found"]
        })

        # Clean up any existing temporary files
        cleanup()

        return {
            llms_output: result,
            llms_full_output: full_result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None,
            download_full_btn: None
        }

    generate_btn.click(
        process_and_update,
        inputs=[url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic],
        outputs=[llms_output, llms_full_output, json_output, urls_found, download_btn, download_full_btn]
    )

if __name__ == "__main__":
    demo.launch()