# Provenance: uploaded by Shreyas094 ("Upload 528 files", commit 372531f, verified).
from typing import List, Dict
from ..actions.utils import stream_output
from ..actions.web_scraping import scrape_urls
from ..scraper.utils import get_image_hash # Add this import
class BrowserManager:
    """Scrape URLs on behalf of a researcher and record the sources and images found."""

    def __init__(self, researcher):
        """
        Keep a reference to the owning researcher.

        Args:
            researcher: Object this manager reads ``verbose``, ``websocket`` and
                ``cfg`` from, and to which it reports scraped sources and images.
        """
        self.researcher = researcher

    async def browse_urls(self, urls: List[str]) -> List[Dict]:
        """
        Scrape content from a list of URLs.

        The scraped content is registered with the researcher via
        ``add_research_sources`` and a de-duplicated selection of images via
        ``add_research_images``. Progress is streamed to the researcher's
        websocket when ``verbose`` is set.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Dict]: List of scraped content results.
        """
        verbose = self.researcher.verbose

        if verbose:
            await stream_output(
                "logs",
                "scraping_urls",
                f"🌐 Scraping content from {len(urls)} URLs...",
                self.researcher.websocket,
            )

        scraped_content, images = scrape_urls(urls, self.researcher.cfg)
        self.researcher.add_research_sources(scraped_content)

        # Keep at most 4 images that the researcher does not already hold.
        new_images = self.select_top_images(images, k=4)
        self.researcher.add_research_images(new_images)

        if verbose:
            await stream_output(
                "logs",
                "scraping_content",
                f"📄 Scraped {len(scraped_content)} pages of content",
                self.researcher.websocket,
            )
            # The two trailing arguments are forwarded unchanged to
            # stream_output together with the selected image URLs.
            await stream_output(
                "logs",
                "scraping_images",
                f"🖼️ Selected {len(new_images)} new images from {len(images)} total images",
                self.researcher.websocket,
                True,
                new_images,
            )
            await stream_output(
                "logs",
                "scraping_complete",
                "🌐 Scraping complete",
                self.researcher.websocket,
            )

        return scraped_content

    def select_top_images(self, images: List[Dict], k: int = 2) -> List[str]:
        """
        Select the most relevant images, dropping duplicates by image hash.

        Images with a score of 2 or more are considered first, followed by the
        remaining images, each group in its original order. An image is skipped
        when its URL is already among the researcher's images, when
        ``get_image_hash`` returns a falsy value for it, or when another
        selected image has the same hash.

        Args:
            images (List[Dict]): List of image dictionaries with 'url' and 'score' keys.
            k (int): Maximum number of image URLs to return. A non-positive
                value yields an empty list.

        Returns:
            List[str]: List of selected image URLs (at most ``k``).
        """
        if k <= 0:
            return []

        known_urls = self.researcher.get_research_images()

        # Visit every image exactly once: high scorers first, then the rest.
        preferred = [img for img in images if img['score'] >= 2]
        remainder = [img for img in images if not img['score'] >= 2]

        selected: List[str] = []
        seen_hashes = set()
        for img in preferred + remainder:
            url = img['url']
            # Cheap membership test first so known URLs are never hashed.
            if url in known_urls:
                continue
            img_hash = get_image_hash(url)
            if not img_hash or img_hash in seen_hashes:
                continue
            seen_hashes.add(img_hash)
            selected.append(url)
            if len(selected) >= k:
                break

        return selected