marks committed on
Commit
ec1f53c
·
1 Parent(s): 30225d1

Added ScrapingBee

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. scraper.py +40 -7
requirements.txt CHANGED
@@ -11,4 +11,5 @@ fastapi
11
  langchain_anthropic
12
  langchain_openai
13
  langchain_google_genai
14
- trafilatura
 
 
11
  langchain_anthropic
12
  langchain_openai
13
  langchain_google_genai
14
+ scrapingbee
15
+
scraper.py CHANGED
@@ -1,8 +1,41 @@
1
- import trafilatura
 
 
 
2
 
3
- def scrape_url(url):
4
- downloaded = trafilatura.fetch_url(url)
5
- if downloaded:
6
- text = trafilatura.extract(downloaded, include_links=False, include_formatting=False)
7
- return text if text else "No content could be extracted from the URL"
8
- return "Failed to download the URL"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from scrapingbee import ScrapingBeeClient
3
+ from logger import setup_logger
4
+ import json
5
 
6
# Module-wide logger used by the scraper.
logger = setup_logger("scraper")

# Shared ScrapingBee client. The API key is read from the SCRAPINGBEE_API_KEY
# environment variable and falls back to an empty string when it is unset.
client = ScrapingBeeClient(api_key=os.environ.get('SCRAPINGBEE_API_KEY', ''))
10
+
11
def scrape_url(url: str) -> str:
    """Scrape the main text content of a page through ScrapingBee.

    The request goes through the module-level ``client`` with the stealth
    proxy enabled, a US country code, and an AI query asking for the main
    text content of the page.

    Args:
        url: The URL to scrape.

    Returns:
        The response text when the request comes back with status 200 (or a
        placeholder message when that text is empty); otherwise a
        human-readable error message. Failures are reported through the
        returned string rather than raised.
    """
    # Reject a missing or blank URL up front: there is nothing to fetch, and
    # sending it to the API would only waste a request before failing.
    if not isinstance(url, str) or not url.strip():
        return "Failed to download the URL. No URL was provided"

    try:
        logger.info(f"Scraping URL: {url}")
        response = client.get(
            url,
            params={
                'stealth_proxy': True,
                'country_code': 'us',
                'ai_query': 'Extract the main text content from this page',
            },
        )

        # Guard clause: anything other than 200 is reported as a failure.
        if response.status_code != 200:
            logger.error(f"Failed to scrape URL: {url}, Status: {response.status_code}")
            return f"Failed to download the URL. Status code: {response.status_code}"

        logger.info(f"Successfully scraped URL: {url}")
        return response.text or "No content could be extracted from the URL"

    except Exception as e:
        # Deliberately broad: callers receive an error string, not an
        # exception, so every failure is logged with its traceback here.
        logger.error(f"Error scraping URL: {url}", exc_info=True)
        return f"Error scraping the URL: {str(e)}"