martinbowling committed
Commit d279120 · verified · 1 Parent(s): d22cdce

publication version

Files changed (1)
  1. app.py +232 -106
app.py CHANGED
@@ -8,6 +8,7 @@ import json
 from groq import Groq
 import time
 import pandas as pd
+import os
 
 
 def normalize_url(url: str) -> str:
@@ -69,89 +70,139 @@ def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
 
 def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
     try:
+        # Ensure content is properly encoded
+        content = content.encode('utf-8', errors='ignore').decode('utf-8')
+
         response = requests.post(
             'https://api.hyperbolic.xyz/v1/chat/completions',
             headers={
-                'Content-Type': 'application/json',
+                'Content-Type': 'application/json; charset=utf-8',
                 'Authorization': f'Bearer {api_key}',
             },
             json={
-                'model':
-                'meta-llama/Meta-Llama-3.1-8B-Instruct',
+                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                 'messages': [{
-                    'role':
-                    'user',
-                    'content':
-                    f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
+                    'role': 'user',
+                    'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
+
+URL: {url}
+Content: {content}
+
+Example response format:
+<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
                 }],
-                'max_tokens':
-                2048,
-                'temperature':
-                0.7,
-                'top_p':
-                0.9,
-                'stream':
-                False
-            })
+                'max_tokens': 200,
+                'temperature': 0.7,
+                'top_p': 0.9,
+                'stream': False
+            },
+            timeout=30
+        )
         response.raise_for_status()
-        return response.json()['choices'][0]['message']['content']
+        result = response.json()
+        summary = result['choices'][0]['message']['content']
+        # Extract summary from tags
+        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
+        return match.group(1).strip() if match else summary.strip()
     except Exception as e:
+        print(f"Error in generate_hyperbolic_summary: {str(e)}")
         return f"Error generating Hyperbolic summary: {str(e)}"
 
 
 def generate_groq_summary(url: str, content: str, api_key: str) -> str:
     try:
+        # Ensure content is properly encoded
+        content = content.encode('utf-8', errors='ignore').decode('utf-8')
+
         client = Groq(api_key=api_key)
         completion = client.chat.completions.create(
             messages=[{
-                "role":
-                "user",
-                "content":
-                f"Generate a concise 1-sentence summary of this webpage content:\n\nURL: {url}\n\nContent: {content}"
+                'role': 'user',
+                'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
+
+URL: {url}
+Content: {content}
+
+Example response format:
+<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
             }],
             model="llama-3.2-1b-preview",
             temperature=0.7,
             max_tokens=200,
+            top_p=0.9,
+            stream=False
         )
-        return completion.choices[0].message.content
+        summary = completion.choices[0].message.content
+        # Extract summary from tags
+        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
+        return match.group(1).strip() if match else summary.strip()
     except Exception as e:
+        print(f"Error in generate_groq_summary: {str(e)}")
         return f"Error generating Groq summary: {str(e)}"
 
 
+def generate_llms_txt(summaries: List[Dict[str, str]]) -> str:
+    if not summaries:
+        return ""
+
+    return "\n".join([
+        f"# {summary['url']}\n\n{summary['summary']}\n\n---\n"
+        for summary in summaries
+    ])
+
+
+def generate_llms_full_txt(summaries: List[Dict]) -> str:
+    if not summaries:
+        return "No content generated"
+
+    content = ""
+    for summary in summaries:
+        content += f"# {summary['url']}\n\n"
+        content += f"{summary.get('fullContent', 'No content available')}\n\n"
+        content += "---\n\n"
+
+    return content
+
+
 def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
     try:
-        headers = {"Accept": "text/plain"}
+        headers = {
+            "Accept": "text/plain",
+            "Accept-Language": "en-US,en;q=0.9",
+            "User-Agent": "Mozilla/5.0 (compatible; SitemapParser/1.0)",
+            "Origin": "http://localhost:3000",
+            "Referer": "http://localhost:3000/",
+        }
         if markdowner_key:
             headers["Authorization"] = f"Bearer {markdowner_key}"
 
-        response = requests.post("https://md.dhr.wtf/",
-                                 params={"url": url},
-                                 headers=headers)
-        response.raise_for_status()
-        return response.text
-    except Exception as e:
-        return f"Error fetching content: {str(e)}"
-
-
-def generate_llms_txt(summaries: List[Dict]) -> str:
-    if not summaries:
-        return "No summaries generated"
-
-    # Find homepage
-    homepage = next(
-        (s for s in summaries if urlparse(s['url']).path in ['', '/']),
-        summaries[0])
-
-    content = f"# {urlparse(homepage['url']).hostname}\n\n"
-    content += f"> {homepage['summary']}\n\n"
-    content += "## Main Pages\n\n"
-
-    for summary in summaries:
-        if summary != homepage:
-            path = urlparse(summary['url']).path
-            content += f"- [{path}]({summary['url']}): {summary['summary']}\n"
-
-    return content
+        # Use direct URL construction like the curl command
+        encoded_url = requests.utils.quote(url)
+        full_url = f"https://md.dhr.wtf/?url={encoded_url}"
+
+        print(f"Requesting URL: {full_url}")  # Debug logging
+        print(f"Headers: {headers}")  # Debug logging
+
+        response = requests.get(  # Changed to GET request
+            full_url,
+            headers=headers,
+            timeout=30
+        )
+
+        response.encoding = 'utf-8'
+        response.raise_for_status()
+
+        if response.status_code == 200:
+            return response.text
+        else:
+            print(f"Response status: {response.status_code}")  # Debug logging
+            print(f"Response headers: {response.headers}")  # Debug logging
+            print(f"Response text: {response.text[:500]}")  # Debug logging
+            return f"Error fetching content: {response.status_code} {response.reason}"
+
+    except Exception as e:
+        print(f"Error fetching content for {url}: {str(e)}")
+        return f"Error fetching content: {str(e)}"
 
 
 def process_website(
@@ -161,11 +212,10 @@ def process_website(
     markdowner_key: str = "",
     use_hyperbolic: bool = True,
     progress=gr.Progress()
-) -> Tuple[str, str, List[str]]:
+) -> Tuple[str, str, List[str], str]:
     try:
-        if not (use_hyperbolic and hyperbolic_key) and not (not use_hyperbolic
-                                                            and groq_key):
-            return "Error: Please provide an API key for the selected AI provider", None, []
+        if not (use_hyperbolic and hyperbolic_key) and not (not use_hyperbolic and groq_key):
+            return "Error: Please provide an API key for the selected AI provider", None, [], ""
 
         base_url = normalize_url(url)
         progress(0, desc="Initializing...")
@@ -194,7 +244,7 @@ def process_website(
                 continue
 
         if not sitemap_urls:
-            return "Error: No sitemaps found", None, []
+            return "Error: No sitemaps found", None, [], ""
 
         progress(0.4, desc="Processing sitemaps...")
 
@@ -209,7 +259,7 @@
                 continue
 
         if not all_urls:
-            return "Error: No URLs found in sitemaps", None, []
+            return "Error: No URLs found in sitemaps", None, [], ""
 
         progress(0.6, desc="Generating summaries...")
 
@@ -220,31 +270,40 @@
                 # Get content via Markdowner
                 content = get_page_content(page_url, markdowner_key)
 
+                # Store full content for llms-full.txt
+                full_content = content
+
                 # Generate summary with selected provider
                 if use_hyperbolic:
-                    summary = generate_hyperbolic_summary(
-                        page_url, content, hyperbolic_key)
+                    summary = generate_hyperbolic_summary(page_url, content, hyperbolic_key)
                 else:
-                    summary = generate_groq_summary(page_url, content,
-                                                    groq_key)
+                    summary = generate_groq_summary(page_url, content, groq_key)
 
-                summaries.append({"url": page_url, "summary": summary})
+                summaries.append({
+                    "url": page_url,
+                    "summary": summary,
+                    "fullContent": full_content,
+                    "provider": "hyperbolic" if use_hyperbolic else "groq"
+                })
 
                 # Rate limiting
-                time.sleep(1)  # Basic rate limiting
+                time.sleep(1)
 
                 progress((0.6 + (0.4 * (i + 1) / len(all_urls))),
                          desc=f"Processing URL {i+1}/{len(all_urls)}")
             except Exception as e:
-                print(f"Error processing {page_url}: {e}")
+                print(f"Error processing {page_url}: {str(e)}")
+                continue
 
-        # Generate llms.txt
+        # Generate both formats
         llms_txt = generate_llms_txt(summaries)
+        llms_full_txt = generate_llms_full_txt(summaries)
 
-        return llms_txt, json.dumps(summaries, indent=2), all_urls
+        return llms_txt, json.dumps(summaries, ensure_ascii=False, indent=2), all_urls, llms_full_txt
 
     except Exception as e:
-        return f"Error: {str(e)}", None, []
+        print(f"Error in process_website: {str(e)}")
+        return f"Processing failed: {str(e)}", None, [], ""
 
 
 # Gradio Interface
@@ -255,33 +314,51 @@ with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
     """)
 
     with gr.Row():
-        url_input = gr.Textbox(label="Website URL",
-                               placeholder="Enter website URL")
-        markdowner_key = gr.Textbox(label="Markdowner API Key (optional)",
-                                    placeholder="For higher rate limits",
-                                    type="password")
+        url_input = gr.Textbox(
+            label="Website URL",
+            placeholder="Enter website URL"
+        )
+        markdowner_key = gr.Textbox(
+            label="Markdowner API Key (Optional)",
+            placeholder="For higher rate limits",
+            type="password",
+            container=True,
+            scale=2
+        )
 
     # AI Provider Selection
     with gr.Row():
        with gr.Column():
-            use_hyperbolic = gr.Checkbox(label="Use Hyperbolic",
-                                         value=False,
-                                         interactive=True)
+            use_hyperbolic = gr.Checkbox(
+                label="Use Hyperbolic",
+                value=True,
+                interactive=True
+            )
             hyperbolic_key = gr.Textbox(
                 label="Hyperbolic API Key",
                 type="password",
                 visible=True,
-                placeholder="Enter your Hyperbolic API key")
+                placeholder="Enter your Hyperbolic API key",
+                container=False,
+                scale=2
+            )
 
         with gr.Column():
-            use_groq = gr.Checkbox(label="Use Groq",
-                                   value=False,
-                                   interactive=True)
-            groq_key = gr.Textbox(label="Groq API Key",
-                                  type="password",
-                                  visible=False,
-                                  placeholder="Enter your Groq API key")
+            use_groq = gr.Checkbox(
+                label="Use Groq",
+                value=False,
+                interactive=True
+            )
+            groq_key = gr.Textbox(
+                label="Groq API Key",
+                type="password",
+                visible=False,
+                placeholder="Enter your Groq API key",
+                container=False,
+                scale=2
+            )
 
+    # Connect checkbox events
     def update_provider_visibility(use_hyp: bool, use_grq: bool):
         # Ensure only one provider is selected
         if use_hyp and use_grq:
@@ -298,12 +375,14 @@ with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
     use_hyperbolic.change(
         fn=update_provider_visibility,
         inputs=[use_hyperbolic, use_groq],
-        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])
+        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
+    )
 
     use_groq.change(
         fn=update_provider_visibility,
         inputs=[use_hyperbolic, use_groq],
-        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic])
+        outputs=[hyperbolic_key, groq_key, use_groq, use_hyperbolic]
+    )
 
     generate_btn = gr.Button("Generate πŸš€", variant="primary")
 
@@ -312,51 +391,98 @@ with gr.Blocks(title="llms.txt Generator", theme=gr.themes.Soft()) as demo:
         label="Generated llms.txt",
         placeholder="Generated content will appear here...",
         lines=10,
-        show_copy_button=True  # Enable built-in copy button
+        show_copy_button=True
     )
-    json_output = gr.JSON(label="Raw Summaries", visible=True)
+    llms_full_output = gr.TextArea(
+        label="Generated llms-full.txt",
+        placeholder="Full content will appear here...",
+        lines=10,
+        show_copy_button=True
+    )
+
+    # Add JSON output for debugging
+    json_output = gr.JSON(
+        label="Debug Output (JSON)",
+        visible=True
+    )
 
-    # Add download button
-    def download_llms_txt(text):
+    # Add download buttons for both files
+    def download_txt(text: str, filename: str) -> str:
         """Convert text to downloadable format"""
         if not text:
             return None
-        return text.encode('utf-8')
-
-    download_btn = gr.File(label="Download llms.txt",
-                           visible=False,
-                           file_types=[".txt", ".md"])
+        # Create a file with the proper name
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(text)
+        return filename
+
+    download_btn = gr.File(
+        label="Download llms.txt",
+        visible=True,
+        file_types=[".txt"]
+    )
+
+    download_full_btn = gr.File(
+        label="Download llms-full.txt",
+        visible=True,
+        file_types=[".txt"]
+    )
 
     download_trigger = gr.Button("Download llms.txt πŸ“₯")
-    download_trigger.click(fn=download_llms_txt,
-                           inputs=[llms_output],
-                           outputs=[download_btn])
+    download_full_trigger = gr.Button("Download llms-full.txt πŸ“₯")
+
+    download_trigger.click(
+        fn=lambda x: download_txt(x, "llms.txt"),
+        inputs=[llms_output],
+        outputs=[download_btn]
+    )
+
+    download_full_trigger.click(
+        fn=lambda x: download_txt(x, "llms-full.txt"),
+        inputs=[llms_full_output],
+        outputs=[download_full_btn]
+    )
+
+    # Clean up function to remove temporary files
+    def cleanup():
+        try:
+            if os.path.exists("llms.txt"):
+                os.remove("llms.txt")
+            if os.path.exists("llms-full.txt"):
+                os.remove("llms-full.txt")
+        except:
+            pass
 
-    urls_found = gr.Dataframe(headers=["URLs Found"],
-                              label="Discovered URLs",
-                              visible=True)
+    urls_found = gr.Dataframe(
+        headers=["URLs Found"],
+        label="Discovered URLs",
+        visible=True
+    )
 
     def process_and_update(*args):
-        result, summaries, urls = process_website(*args)
+        result, summaries, urls, full_result = process_website(*args)
+
+        urls_df = pd.DataFrame({
+            "URLs Found": urls if urls else ["No URLs found"]
+        })
 
-        # Create DataFrame for URLs
-        urls_df = pd.DataFrame(
-            {"URLs Found": urls if urls else ["No URLs found"]})
+        # Clean up any existing temporary files
+        cleanup()
 
         return {
             llms_output: result,
+            llms_full_output: full_result,
             json_output: summaries if summaries else "",
             urls_found: urls_df,
-            download_btn: None  # Reset download button
+            download_btn: None,
+            download_full_btn: None
         }
 
     generate_btn.click(
         process_and_update,
-        inputs=[
-            url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic
-        ],
-        outputs=[llms_output, json_output, urls_found, download_btn])
+        inputs=[url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic],
+        outputs=[llms_output, llms_full_output, json_output, urls_found, download_btn, download_full_btn]
+    )
 
 if __name__ == "__main__":
     demo.launch()
-
 
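For reference, a minimal sketch of how the two new helpers compose, assuming app.py is importable and using made-up summary data (the dict keys mirror what process_website now appends):

# Sketch only: sample URLs, summaries, and content below are hypothetical.
from app import generate_llms_txt, generate_llms_full_txt

summaries = [
    {
        "url": "https://example.com/",
        "summary": "Landing page introducing the product.",
        "fullContent": "# Example\n\nFull Markdown for the homepage...",
        "provider": "groq",
    },
    {
        "url": "https://example.com/docs",
        "summary": "Documentation index for the API.",
        "fullContent": "# Docs\n\nFull Markdown for the docs page...",
        "provider": "groq",
    },
]

# llms.txt: a "# <url>" heading, the one-sentence summary, then a "---" divider per page
print(generate_llms_txt(summaries))

# llms-full.txt: the same headings, but with each page's full Markdowner content
print(generate_llms_full_txt(summaries))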
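And a rough standalone equivalent of the GET request that the reworked get_page_content sends to Markdowner (the target URL here is illustrative; the Authorization header is only needed when a Markdowner key is supplied):

# Sketch only: fetches a Markdown rendering of one page from md.dhr.wtf.
import requests

target = "https://example.com/docs"  # hypothetical page to convert
headers = {"Accept": "text/plain"}
# headers["Authorization"] = f"Bearer {markdowner_key}"  # optional, for higher rate limits

response = requests.get(
    f"https://md.dhr.wtf/?url={requests.utils.quote(target)}",
    headers=headers,
    timeout=30,
)
response.raise_for_status()
print(response.text[:500])  # first part of the Markdown for the fetched page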