import hashlib
import logging
from typing import TYPE_CHECKING, Optional
from urllib.parse import parse_qs, urljoin, urlparse

if TYPE_CHECKING:
    # Type-checking-only import: this module never constructs a
    # BeautifulSoup itself (callers pass one in), so bs4 is not a hard
    # runtime dependency here. NOTE(review): if any caller relied on
    # re-importing BeautifulSoup from this module, restore the top-level
    # import — confirm against callers.
    from bs4 import BeautifulSoup


def get_relevant_images(soup: "BeautifulSoup", url: str) -> list:
    """Extract and rank relevant images from a parsed page.

    Args:
        soup: Parsed HTML document (BeautifulSoup or any object with a
            compatible ``find_all('img', src=True)``).
        url: Base URL of the page, used to resolve relative image sources.

    Returns:
        Up to 10 dicts of the form ``{'url': str, 'score': int}``, highest
        score first; ``[]`` on any unexpected error.
    """
    image_urls = []
    try:
        for img in soup.find_all('img', src=True):
            img_src = urljoin(url, img['src'])
            # Only keep absolute http(s) URLs (drops data:, ftp:, etc.).
            if not img_src.startswith(('http://', 'https://')):
                continue

            score = 0
            # Images explicitly marked as prominent content rank highest.
            if any(cls in img.get('class', []) for cls in
                   ('header', 'featured', 'hero', 'thumbnail', 'main', 'content')):
                score = 4
            elif img.get('width') and img.get('height'):
                width = parse_dimension(img['width'])
                height = parse_dimension(img['height'])
                if width and height:
                    if width >= 2000 and height >= 1000:
                        score = 3  # Very large images
                    elif width >= 1600 or height >= 800:
                        score = 2
                    elif width >= 800 or height >= 500:
                        score = 1
                    elif width >= 500 or height >= 300:
                        score = 0
                    else:
                        continue  # Skip small images
            image_urls.append({'url': img_src, 'score': score})

        # Stable sort: ties keep document order. Highest score first.
        sorted_images = sorted(image_urls, key=lambda x: x['score'], reverse=True)

        # BUG FIX: the filter previously accepted only scores in [3, 2],
        # which silently dropped every score-4 (class-matched) image —
        # the very images ranked as most relevant.
        high_score_images = [img for img in sorted_images if img['score'] >= 2]
        low_score_images = [img for img in sorted_images if img['score'] == 1]

        # All high-score images, topped up with score-1 images to 10 total.
        result = high_score_images + low_score_images[:max(0, 10 - len(high_score_images))]
        return result[:10]  # Ensure we don't return more than 10 images in total
    except Exception as e:
        logging.error(f"Error in get_relevant_images: {e}")
        return []


def parse_dimension(value: str) -> Optional[int]:
    """Parse an HTML width/height attribute value into pixels.

    Handles an optional ``px`` suffix and decimal values (e.g. ``'100.5'``).

    Returns:
        The dimension as an int, or None if the value cannot be parsed.
    """
    if value.lower().endswith('px'):
        value = value[:-2]  # Remove 'px' suffix
    try:
        # BUG FIX: the original called int(value) directly, which raises on
        # decimal strings like '100.5' even though its comment claimed a
        # float-first conversion. Convert via float, then truncate.
        return int(float(value))
    except ValueError as e:
        # Consistency fix: was print(); every other error path here logs.
        logging.error(f"Error parsing dimension value {value}: {e}")
        return None


def extract_title(soup: "BeautifulSoup") -> str:
    """Extract the page title text, or "" when absent or empty."""
    # BUG FIX: soup.title.string is None for an empty <title>, and the
    # original returned that None despite the declared -> str.
    return soup.title.string if soup.title and soup.title.string else ""


def get_image_hash(image_url: str) -> Optional[str]:
    """Calculate a simple hash identifying an image by filename and CDN params.

    Two URLs serving the same underlying image (same filename and same
    ``url`` query parameter, as used by CDN proxies) hash identically even
    when other query parameters (e.g. cache busters) differ.

    Returns:
        Hex MD5 digest string, or None on failure.
    """
    try:
        parsed_url = urlparse(image_url)
        # Extract the filename
        filename = parsed_url.path.split('/')[-1]
        # Extract essential query parameters (e.g., 'url' for CDN-served images)
        query_params = parse_qs(parsed_url.query)
        essential_params = query_params.get('url', [])
        # Combine filename and essential parameters
        image_identifier = filename + ''.join(essential_params)
        # MD5 is used as a cheap content fingerprint, not for security.
        return hashlib.md5(image_identifier.encode()).hexdigest()
    except Exception as e:
        logging.error(f"Error calculating image hash for {image_url}: {e}")
        return None