patrickacraig committed
Commit 20b4c83 · 1 Parent(s): a24c8fd

renaming files

Files changed (3)
  1. app.py +75 -45
  2. core.py +87 -0
  3. web_ui.py +0 -117
app.py CHANGED
@@ -1,25 +1,16 @@
-from firecrawl import FirecrawlApp
 import os
 import time
 import asyncio
 from dotenv import load_dotenv
 from urllib.parse import urlparse
+from firecrawl import FirecrawlApp
+import gradio as gr

 load_dotenv()

-base_url = os.getenv('TARGET_URL')
-api_key = os.getenv('FIRECRAWL_API_KEY')
-limit_rate = os.getenv('LIMIT_RATE', 'False').lower() == 'true'
-
-print(f"base_url: {base_url}")
-print(f"api_key: {api_key}")
-print(f"limit_rate: {limit_rate}")
-
-# Get Firecrawl App instance
 def get_firecrawl_app(api_key):
     return FirecrawlApp(api_key=api_key)

-# Asynchronous scrape URL
 async def async_scrape_url(app, url):
     try:
         scrape_status = app.scrape_url(url)
@@ -33,11 +24,9 @@ async def async_scrape_url(app, url):
         print(f"Error scraping {url}: {e}")
         return ""

-# Synchronously map website URLs
 def map_website(app, url):
     try:
         map_status = app.map_url(url)
-        print(f"Map status for {url}: {map_status}")
         if isinstance(map_status, list):
             return map_status
         else:
@@ -47,41 +36,82 @@ def map_website(app, url):
         print(f"Error mapping website {url}: {e}")
         return []

-# Asynchronously scrape all URLs
-def scrape_all_urls(base_url, api_key, limit_rate):
-    async def scrape_process():
-        app = get_firecrawl_app(api_key)
-        urls = map_website(app, base_url)
-        if not urls:
-            print("No URLs found. Please check if the base URL is correct.")
-            return
+async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress()):
+    app = get_firecrawl_app(api_key)
+    urls = map_website(app, base_url)
+    if not urls:
+        return "No URLs found. Please check if the base URL is correct."

-        parsed_url = urlparse(base_url)
-        domain = parsed_url.netloc.replace("www.", "")
-        os.makedirs('scraped_documentation', exist_ok=True)
-        output_file = os.path.join('scraped_documentation', f"{domain}.md")
+    parsed_url = urlparse(base_url)
+    domain = parsed_url.netloc.replace("www.", "")
+    os.makedirs('scraped_documentation', exist_ok=True)
+    output_file = os.path.join('scraped_documentation', f"{domain}.md")

-        with open(output_file, 'w', encoding='utf-8') as md_file:
-            for i, url in enumerate(urls):
-                print(f"Scraping {url} ({i+1}/{len(urls)})")
-                markdown_content = await async_scrape_url(app, url)
-                md_file.write(f"# {url}\n\n")
-                md_file.write(markdown_content)
-                md_file.write("\n\n---\n\n")
-
-                # Rate limiting: 10 scrapes per minute
-                if limit_rate and (i + 1) % 10 == 0:
-                    print("Rate limit reached, waiting for 60 seconds...")
-                    time.sleep(60)
+    with open(output_file, 'w', encoding='utf-8') as md_file:
+        for i, url in enumerate(progress.tqdm(urls)):
+            progress(i / len(urls), f"Scraping {url}")
+            markdown_content = await async_scrape_url(app, url)
+            md_file.write(f"# {url}\n\n")
+            md_file.write(markdown_content)
+            md_file.write("\n\n---\n\n")
+            if limit_rate and (i + 1) % 10 == 0:
+                time.sleep(60)

-        print(f"Scraping completed. Output saved to {output_file}")
+    return f"Scraping completed. Output saved to {output_file}"

-    asyncio.run(scrape_process())
+def count_urls(base_url, api_key):
+    if not api_key:
+        return "Please enter your Firecrawl API key first."
+    app = get_firecrawl_app(api_key)
+    urls = map_website(app, base_url)
+    if urls:
+        return f"{len(urls)} URLs found. Do you want to proceed with scraping?"
+    else:
+        return "No URLs found. Please check the base URL or API key."

-if __name__ == "__main__":
+async def gradio_scrape(base_url, api_key, limit_rate):
+    if not api_key:
+        return "Please enter your Firecrawl API key."
     if not base_url:
-        print("Error: BASE_URL not specified in environment variables.")
-    elif not api_key:
-        print("Error: FIRECRAWL_API_KEY not specified in environment variables.")
-    else:
-        scrape_all_urls(base_url, api_key, limit_rate)
+        return "Please enter a base URL to scrape."
+    return await scrape_all_urls(base_url, api_key, limit_rate)
+
+with gr.Blocks() as iface:
+    gr.Markdown("# Docs Scraper")
+    gr.Markdown("""
+    ## Map and Scrape Website URLs with Firecrawl API
+    Enter a base URL, your Firecrawl API key, and choose whether to limit the scraping rate.
+    Scraped content will be saved as a markdown file named after the domain.
+    """)
+    gr.HTML('Don\'t have an API key? <a href="https://firecrawl.dev/" target="_blank" rel="noopener noreferrer">Get one from Firecrawl</a>')
+
+    with gr.Row():
+        base_url = gr.Textbox(label="Base URL", placeholder="Enter the base URL to scrape")
+        api_key = gr.Textbox(label="Firecrawl API Key", type="password")
+        limit_rate = gr.Checkbox(
+            label="Limit Rate",
+            value=True,
+            info="Enable to limit scraping to 10 URLs per minute. This adheres to Firecrawl API's free tier rate limit."
+        )
+
+    gr.Markdown("After entering your API key, click 'Count URLs' to determine the number of URLs to be scraped. Then, click 'Scrape URLs' to begin the process.")
+
+    with gr.Row():
+        count_button = gr.Button("Count URLs")
+        url_count = gr.Textbox(label="URL Count")
+
+    with gr.Row():
+        scrape_button = gr.Button("Scrape URLs")
+        output = gr.Textbox(label="Output", elem_id="output_textbox")
+
+    gr.Markdown("""
+    #### Note:
+    The free tier of the Firecrawl API allows for 500 credits per month.
+    If you need to scrape more, consider upgrading to a paid plan.
+    """)
+
+    count_button.click(count_urls, inputs=[base_url, api_key], outputs=[url_count])
+    scrape_button.click(gradio_scrape, inputs=[base_url, api_key, limit_rate], outputs=[output])
+
+if __name__ == "__main__":
+    iface.launch()
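
For orientation, the relocated scrape_all_urls still writes everything for one domain into a single file under scraped_documentation/, with each page's URL as a heading and a horizontal rule between pages. A sketch of the resulting {domain}.md (the URLs and page bodies below are placeholders, not real output):

# https://docs.example.com/getting-started

...scraped markdown for this page...

---

# https://docs.example.com/api/reference

...scraped markdown for the next page...

---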
core.py ADDED
@@ -0,0 +1,87 @@
+from firecrawl import FirecrawlApp
+import os
+import time
+import asyncio
+from dotenv import load_dotenv
+from urllib.parse import urlparse
+
+load_dotenv()
+
+base_url = os.getenv('TARGET_URL')
+api_key = os.getenv('FIRECRAWL_API_KEY')
+limit_rate = os.getenv('LIMIT_RATE', 'False').lower() == 'true'
+
+print(f"base_url: {base_url}")
+print(f"api_key: {api_key}")
+print(f"limit_rate: {limit_rate}")
+
+# Get Firecrawl App instance
+def get_firecrawl_app(api_key):
+    return FirecrawlApp(api_key=api_key)
+
+# Asynchronous scrape URL
+async def async_scrape_url(app, url):
+    try:
+        scrape_status = app.scrape_url(url)
+        print(f"Scrape status for {url}: {scrape_status}")
+        if 'markdown' in scrape_status:
+            return scrape_status['markdown']
+        else:
+            print(f"Failed to scrape {url}: {scrape_status}")
+            return ""
+    except Exception as e:
+        print(f"Error scraping {url}: {e}")
+        return ""
+
+# Synchronously map website URLs
+def map_website(app, url):
+    try:
+        map_status = app.map_url(url)
+        print(f"Map status for {url}: {map_status}")
+        if isinstance(map_status, list):
+            return map_status
+        else:
+            print("Failed to map the website:", map_status)
+            return []
+    except Exception as e:
+        print(f"Error mapping website {url}: {e}")
+        return []
+
+# Asynchronously scrape all URLs
+def scrape_all_urls(base_url, api_key, limit_rate):
+    async def scrape_process():
+        app = get_firecrawl_app(api_key)
+        urls = map_website(app, base_url)
+        if not urls:
+            print("No URLs found. Please check if the base URL is correct.")
+            return
+
+        parsed_url = urlparse(base_url)
+        domain = parsed_url.netloc.replace("www.", "")
+        os.makedirs('scraped_documentation', exist_ok=True)
+        output_file = os.path.join('scraped_documentation', f"{domain}.md")
+
+        with open(output_file, 'w', encoding='utf-8') as md_file:
+            for i, url in enumerate(urls):
+                print(f"Scraping {url} ({i+1}/{len(urls)})")
+                markdown_content = await async_scrape_url(app, url)
+                md_file.write(f"# {url}\n\n")
+                md_file.write(markdown_content)
+                md_file.write("\n\n---\n\n")
+
+                # Rate limiting: 10 scrapes per minute
+                if limit_rate and (i + 1) % 10 == 0:
+                    print("Rate limit reached, waiting for 60 seconds...")
+                    time.sleep(60)
+
+        print(f"Scraping completed. Output saved to {output_file}")
+
+    asyncio.run(scrape_process())
+
+if __name__ == "__main__":
+    if not base_url:
+        print("Error: BASE_URL not specified in environment variables.")
+    elif not api_key:
+        print("Error: FIRECRAWL_API_KEY not specified in environment variables.")
+    else:
+        scrape_all_urls(base_url, api_key, limit_rate)
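
core.py keeps the original environment-driven CLI behaviour, so it can also be driven directly from Python. A minimal sketch, assuming core.py is importable and a valid Firecrawl API key is available (the URL and key shown are placeholders):

from core import scrape_all_urls

# scrape_all_urls drives its inner async scrape_process() via asyncio.run(),
# so a plain synchronous call is enough here.
scrape_all_urls(
    base_url="https://docs.example.com",  # placeholder target site
    api_key="fc-your-api-key",            # placeholder Firecrawl key
    limit_rate=True,                      # pause 60s after every 10 scrapes
)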
web_ui.py DELETED
@@ -1,117 +0,0 @@
-import os
-import time
-import asyncio
-from dotenv import load_dotenv
-from urllib.parse import urlparse
-from firecrawl import FirecrawlApp
-import gradio as gr
-
-load_dotenv()
-
-def get_firecrawl_app(api_key):
-    return FirecrawlApp(api_key=api_key)
-
-async def async_scrape_url(app, url):
-    try:
-        scrape_status = app.scrape_url(url)
-        print(f"Scrape status for {url}: {scrape_status}")
-        if 'markdown' in scrape_status:
-            return scrape_status['markdown']
-        else:
-            print(f"Failed to scrape {url}: {scrape_status}")
-            return ""
-    except Exception as e:
-        print(f"Error scraping {url}: {e}")
-        return ""
-
-def map_website(app, url):
-    try:
-        map_status = app.map_url(url)
-        if isinstance(map_status, list):
-            return map_status
-        else:
-            print("Failed to map the website:", map_status)
-            return []
-    except Exception as e:
-        print(f"Error mapping website {url}: {e}")
-        return []
-
-async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress()):
-    app = get_firecrawl_app(api_key)
-    urls = map_website(app, base_url)
-    if not urls:
-        return "No URLs found. Please check if the base URL is correct."
-
-    parsed_url = urlparse(base_url)
-    domain = parsed_url.netloc.replace("www.", "")
-    os.makedirs('scraped_documentation', exist_ok=True)
-    output_file = os.path.join('scraped_documentation', f"{domain}.md")
-
-    with open(output_file, 'w', encoding='utf-8') as md_file:
-        for i, url in enumerate(progress.tqdm(urls)):
-            progress(i / len(urls), f"Scraping {url}")
-            markdown_content = await async_scrape_url(app, url)
-            md_file.write(f"# {url}\n\n")
-            md_file.write(markdown_content)
-            md_file.write("\n\n---\n\n")
-            if limit_rate and (i + 1) % 10 == 0:
-                time.sleep(60)
-
-    return f"Scraping completed. Output saved to {output_file}"
-
-def count_urls(base_url, api_key):
-    if not api_key:
-        return "Please enter your Firecrawl API key first."
-    app = get_firecrawl_app(api_key)
-    urls = map_website(app, base_url)
-    if urls:
-        return f"{len(urls)} URLs found. Do you want to proceed with scraping?"
-    else:
-        return "No URLs found. Please check the base URL or API key."
-
-async def gradio_scrape(base_url, api_key, limit_rate):
-    if not api_key:
-        return "Please enter your Firecrawl API key."
-    if not base_url:
-        return "Please enter a base URL to scrape."
-    return await scrape_all_urls(base_url, api_key, limit_rate)
-
-with gr.Blocks() as iface:
-    gr.Markdown("# Docs Scraper")
-    gr.Markdown("""
-    ## Map and Scrape Website URLs with Firecrawl API
-    Enter a base URL, your Firecrawl API key, and choose whether to limit the scraping rate.
-    Scraped content will be saved as a markdown file named after the domain.
-    """)
-    gr.HTML('Don\'t have an API key? <a href="https://firecrawl.dev/" target="_blank" rel="noopener noreferrer">Get one from Firecrawl</a>')
-
-    with gr.Row():
-        base_url = gr.Textbox(label="Base URL", placeholder="Enter the base URL to scrape")
-        api_key = gr.Textbox(label="Firecrawl API Key", type="password")
-        limit_rate = gr.Checkbox(
-            label="Limit Rate",
-            value=True,
-            info="Enable to limit scraping to 10 URLs per minute. This adheres to Firecrawl API's free tier rate limit."
-        )
-
-    gr.Markdown("After entering your API key, click 'Count URLs' to determine the number of URLs to be scraped. Then, click 'Scrape URLs' to begin the process.")
-
-    with gr.Row():
-        count_button = gr.Button("Count URLs")
-        url_count = gr.Textbox(label="URL Count")
-
-    with gr.Row():
-        scrape_button = gr.Button("Scrape URLs")
-        output = gr.Textbox(label="Output", elem_id="output_textbox")
-
-    gr.Markdown("""
-    #### Note:
-    The free tier of the Firecrawl API allows for 500 credits per month.
-    If you need to scrape more, consider upgrading to a paid plan.
-    """)
-
-    count_button.click(count_urls, inputs=[base_url, api_key], outputs=[url_count])
-    scrape_button.click(gradio_scrape, inputs=[base_url, api_key, limit_rate], outputs=[output])
-
-if __name__ == "__main__":
-    iface.launch()
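
With web_ui.py gone, app.py is the single Gradio entry point and launches with Gradio's defaults. If the interface needed to be reachable beyond localhost, gr.Blocks.launch() accepts the standard server_name and server_port options; a hedged variation of the __main__ block (these settings are not part of this commit):

if __name__ == "__main__":
    # Bind to all interfaces on a fixed port instead of Gradio's localhost default;
    # illustrative only, the commit itself calls iface.launch() with no arguments.
    iface.launch(server_name="0.0.0.0", server_port=7860)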