File size: 2,727 Bytes
03c0888 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import os, time
# append the path to the root of the project
import sys
import asyncio
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from firecrawl import FirecrawlApp
from crawl4ai import AsyncWebCrawler
__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data'
async def compare():
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
# Tet Firecrawl with a simple crawl
start = time.time()
scrape_status = app.scrape_url(
'https://www.nbcnews.com/business',
params={'formats': ['markdown', 'html']}
)
end = time.time()
print(f"Time taken: {end - start} seconds")
print(len(scrape_status['markdown']))
# save the markdown content with provider name
with open(f"{__data__}/firecrawl_simple.md", "w") as f:
f.write(scrape_status['markdown'])
# Count how many "cldnry.s-nbcnews.com" are in the markdown
print(scrape_status['markdown'].count("cldnry.s-nbcnews.com"))
async with AsyncWebCrawler() as crawler:
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
# js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
word_count_threshold=0,
bypass_cache=True,
verbose=False
)
end = time.time()
print(f"Time taken: {end - start} seconds")
print(len(result.markdown))
# save the markdown content with provider name
with open(f"{__data__}/crawl4ai_simple.md", "w") as f:
f.write(result.markdown)
# count how many "cldnry.s-nbcnews.com" are in the markdown
print(result.markdown.count("cldnry.s-nbcnews.com"))
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
word_count_threshold=0,
bypass_cache=True,
verbose=False
)
end = time.time()
print(f"Time taken: {end - start} seconds")
print(len(result.markdown))
# save the markdown content with provider name
with open(f"{__data__}/crawl4ai_js.md", "w") as f:
f.write(result.markdown)
# count how many "cldnry.s-nbcnews.com" are in the markdown
print(result.markdown.count("cldnry.s-nbcnews.com"))
if __name__ == "__main__":
asyncio.run(compare())
|