import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd
import os


def normalize_url(url: str) -> str:
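    """Ensure the URL has an http(s) scheme and strip any trailing slash."""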
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')


def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")


def extract_urls_from_sitemap(content: str) -> List[str]:
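    """Parse sitemap XML and return all page URLs, recursing into nested sitemaps
    when the document is a sitemap index."""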
    urls = []
    try:
        root = ET.fromstring(content)
        # Sitemaps usually declare a default XML namespace; build a prefix map if so
        ns = {
            'ns': root.tag.split('}')[0].strip('{')
        } if '}' in root.tag else {}
        loc_path = './/ns:loc' if ns else './/loc'

        # Handle sitemap index
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall(loc_path, ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset
        else:
            for url in root.findall(loc_path, ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls


def get_common_sitemap_urls(base_url: str) -> List[str]:
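    """Return common sitemap locations to probe when robots.txt lists none."""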
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml", f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]


def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
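    """Extract the URLs listed on 'Sitemap:' lines of a robots.txt file."""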
    return [
        line.split(':', 1)[1].strip() for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]


def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
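    """Request a one-sentence summary of the page from the Hyperbolic chat completions API."""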
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')

        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json; charset=utf-8',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.

URL: {url}
Content: {content}

Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
                }],
                'max_tokens': 200,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        summary = result['choices'][0]['message']['content']
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_hyperbolic_summary: {str(e)}")
        return f"Error generating Hyperbolic summary: {str(e)}"


def generate_groq_summary(url: str, content: str, api_key: str) -> str:
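    """Request a one-sentence summary of the page from the Groq API."""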
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')

        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                'role': 'user',
                'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.

URL: {url}
Content: {content}

Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
            top_p=0.9,
            stream=False
        )
        summary = completion.choices[0].message.content
        # Extract summary from tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_groq_summary: {str(e)}")
        return f"Error generating Groq summary: {str(e)}"


def generate_llms_txt(summaries: List[Dict[str, str]]) -> str:
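    """Render the llms.txt body: one heading and one-sentence summary per URL."""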
    if not summaries:
        return ""

    return "\n".join([
        f"# {summary['url']}\n\n{summary['summary']}\n\n---\n"
        for summary in summaries
    ])


def generate_llms_full_txt(summaries: List[Dict]) -> str:
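    """Render the llms-full.txt body: one heading and the full page content per URL."""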
    if not summaries:
        return "No content generated"

    content = ""
    for summary in summaries:
        content += f"# {summary['url']}\n\n"
        content += f"{summary.get('fullContent', 'No content available')}\n\n"
        content += "---\n\n"

    return content


def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
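    """Fetch a page converted to Markdown via the Markdowner service (md.dhr.wtf)."""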
    try:
        headers = {
            "Accept": "text/plain",
            "Accept-Language": "en-US,en;q=0.9",
            "User-Agent": "Mozilla/5.0 (compatible; SitemapParser/1.0)",
            "Origin": "http://localhost:3000",
            "Referer": "http://localhost:3000/",
        }
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        # Build the Markdowner request URL with the target page URL-encoded
        encoded_url = requests.utils.quote(url)
        full_url = f"https://md.dhr.wtf/?url={encoded_url}"

        print(f"Requesting URL: {full_url}")  # Debug logging

        response = requests.get(
            full_url,
            headers=headers,
            timeout=30
        )

        response.encoding = 'utf-8'
        response.raise_for_status()
        return response.text

    except Exception as e:
        print(f"Error fetching content for {url}: {str(e)}")
        return f"Error fetching content: {str(e)}"


def process_website(
    url: str,
    hyperbolic_key: str = "",
    groq_key: str = "",
    markdowner_key: str = "",
    use_hyperbolic: bool = True,
    progress=gr.Progress()
) -> Tuple[str, Optional[str], List[str], str]:
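    """Discover the site's sitemaps, summarize each discovered page, and return
    (llms.txt content, summaries JSON, discovered URLs, llms-full.txt content)."""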
    try:
        # Require an API key for whichever provider is selected
        if (use_hyperbolic and not hyperbolic_key) or (not use_hyperbolic and not groq_key):
            return "Error: Please provide an API key for the selected AI provider", None, [], ""

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except Exception:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if no sitemaps found
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<?xml' in content or '<urlset' in content:
                        sitemap_urls.append(sitemap_url)
                        break
                except Exception:
                    continue

        if not sitemap_urls:
            return "Error: No sitemaps found", None, [], ""

        progress(0.4, desc="Processing sitemaps...")

        # Process sitemaps
        all_urls = []
        for sitemap_url in sitemap_urls:
            try:
                content = fetch_with_proxy(sitemap_url)
                urls = extract_urls_from_sitemap(content)
                all_urls.extend(urls)
            except Exception:
                continue

        if not all_urls:
            return "Error: No URLs found in sitemaps", None, [], ""

        progress(0.6, desc="Generating summaries...")

        # Generate summaries
        summaries = []
        for i, page_url in enumerate(all_urls):
            try:
                # Get content via Markdowner
                content = get_page_content(page_url, markdowner_key)

                # Store full content for llms-full.txt
                full_content = content

                # Generate summary with selected provider
                if use_hyperbolic:
                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                else:
                    summary = generate_groq_summary(page_url, content, groq_key)

                summaries.append({
                    "url": page_url,
                    "summary": summary,
                    "fullContent": full_content,
                    "provider": "hyperbolic" if use_hyperbolic else "groq"
                })

                # Rate limiting
                time.sleep(1)

                progress((0.6 + (0.4 * (i + 1) / len(all_urls))),
                        desc=f"Processing URL {i+1}/{len(all_urls)}")
            except Exception as e:
                print(f"Error processing {page_url}: {str(e)}")
                continue

        # Generate both formats
        llms_txt = generate_llms_txt(summaries)
        llms_full_txt = generate_llms_full_txt(summaries)

        return llms_txt, json.dumps(summaries, ensure_ascii=False, indent=2), all_urls, llms_full_txt

    except Exception as e:
        print(f"Error in process_website: {str(e)}")
        return f"Processing failed: {str(e)}", None, [], ""


# Gradio Interface
with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # llms.txt Generator πŸ€–βœ¨
    Generate AI-powered llms.txt files for any website 🌐
    """)

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter website URL"
        )
        markdowner_key = gr.Textbox(
            label="Markdowner API Key (Optional)",
            placeholder="For higher rate limits",
            type="password",
            container=True,
            scale=2
        )

    # AI Provider Selection
    with gr.Row():
        with gr.Column():
            use_hyperbolic = gr.Checkbox(
                label="Use Hyperbolic",
                value=True,
                interactive=True
            )
            hyperbolic_key = gr.Textbox(
                label="Hyperbolic API Key",
                type="password",
                visible=True,
                placeholder="Enter your Hyperbolic API key",
                container=False,
                scale=2
            )

        with gr.Column():
            use_groq = gr.Checkbox(
                label="Use Groq",
                value=False,
                interactive=True
            )
            groq_key = gr.Textbox(
                label="Groq API Key",
                type="password",
                visible=False,
                placeholder="Enter your Groq API key",
                container=False,
                scale=2
            )

    # Keep the provider checkboxes mutually exclusive and show only the relevant key field
    def update_provider_visibility(use_hyp: bool, use_grq: bool):
        # Ensure only one provider is selected
        if use_hyp and use_grq:
            use_grq = False

        return {
            hyperbolic_key: gr.update(visible=use_hyp),
            groq_key: gr.update(visible=use_grq),
            use_groq: gr.update(value=use_grq),
            use_hyperbolic: gr.update(value=use_hyp)
        }

    # Connect checkbox events
    use_hyperbolic.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )

    use_groq.change(
        fn=update_provider_visibility,
        inputs=[use_hyperbolic, use_groq],
        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
    )

    generate_btn = gr.Button("Generate πŸš€", variant="primary")

    with gr.Row():
        llms_output = gr.TextArea(
            label="Generated llms.txt",
            placeholder="Generated content will appear here...",
            lines=10,
            show_copy_button=True
        )
        llms_full_output = gr.TextArea(
            label="Generated llms-full.txt",
            placeholder="Full content will appear here...",
            lines=10,
            show_copy_button=True
        )

    # Add JSON output for debugging
    json_output = gr.JSON(
        label="Debug Output (JSON)",
        visible=True
    )

    # Add download buttons for both files
    def download_txt(text: str, filename: str) -> str:
        """Convert text to downloadable format"""
        if not text:
            return None
        # Create a file with the proper name
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
        return filename

    download_btn = gr.File(
        label="Download llms.txt",
        visible=True,
        file_types=[".txt"]
    )

    download_full_btn = gr.File(
        label="Download llms-full.txt",
        visible=True,
        file_types=[".txt"]
    )

    download_trigger = gr.Button("Download llms.txt πŸ“₯")
    download_full_trigger = gr.Button("Download llms-full.txt πŸ“₯")

    download_trigger.click(
        fn=lambda x: download_txt(x, "llms.txt"),
        inputs=[llms_output],
        outputs=[download_btn]
    )

    download_full_trigger.click(
        fn=lambda x: download_txt(x, "llms-full.txt"),
        inputs=[llms_full_output],
        outputs=[download_full_btn]
    )

    # Clean up function to remove temporary files
    def cleanup():
        try:
            if os.path.exists("llms.txt"):
                os.remove("llms.txt")
            if os.path.exists("llms-full.txt"):
                os.remove("llms-full.txt")
        except OSError:
            pass

    urls_found = gr.Dataframe(
        headers=["URLs Found"],
        label="Discovered URLs",
        visible=True
    )

    def process_and_update(*args):
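        """Run the processing pipeline and map its outputs onto the UI components."""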
        result, summaries, urls, full_result = process_website(*args)

        urls_df = pd.DataFrame({
            "URLs Found": urls if urls else ["No URLs found"]
        })

        # Clean up any existing temporary files
        cleanup()

        return {
            llms_output: result,
            llms_full_output: full_result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None,
            download_full_btn: None
        }

    generate_btn.click(
        process_and_update,
        inputs=[url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic],
        outputs=[llms_output, llms_full_output, json_output, urls_found, download_btn, download_full_btn]
    )

if __name__ == "__main__":
    demo.launch()