|
import os |
|
from scrapingbee import ScrapingBeeClient |
|
from logger import setup_logger |
|
import json |
|
|
|
# Module-level logger for all scraping events in this file.
logger = setup_logger("scraper")




# Shared ScrapingBee client, configured from the environment.
# NOTE(review): falls back to an empty API key if SCRAPINGBEE_API_KEY is
# unset — requests will then fail at call time rather than at import time.
client = ScrapingBeeClient(api_key=os.getenv('SCRAPINGBEE_API_KEY', ''))
|
|
|
def scrape_url(url: str) -> str:
    """
    Scrape content from URL using ScrapingBee with AI extraction.

    Uses the module-level ScrapingBee ``client`` with stealth proxying and a
    US exit node, asking the API's AI extraction to pull the page's main text.

    Args:
        url: The URL to scrape.

    Returns:
        str: Extracted text content on success, otherwise a human-readable
        error message (this function never raises; errors are returned as
        strings so callers can surface them directly).
    """
    try:
        # Lazy %-style args avoid formatting the message when the log
        # level is disabled (preferred over f-strings for logging).
        logger.info("Scraping URL: %s", url)
        response = client.get(
            url,
            params={
                'stealth_proxy': True,
                'country_code': 'us',
                'ai_query': 'Extract the main text content from this page'
            }
        )

        if response.status_code == 200:
            logger.info("Successfully scraped URL: %s", url)
            # Empty body is still a 200; report it explicitly.
            return response.text if response.text else "No content could be extracted from the URL"
        else:
            logger.error("Failed to scrape URL: %s, Status: %s", url, response.status_code)
            return f"Failed to download the URL. Status code: {response.status_code}"

    except Exception as e:
        # Broad catch is deliberate: this is a boundary function that must
        # return an error string instead of propagating. logger.exception
        # records the traceback (equivalent to error(..., exc_info=True)).
        logger.exception("Error scraping URL: %s", url)
        return f"Error scraping the URL: {str(e)}"