Shreyas094's picture
Upload 528 files
372531f verified
import hashlib
import logging
from typing import Optional
from urllib.parse import parse_qs, urljoin, urlparse

from bs4 import BeautifulSoup
# CSS class names that mark an image as editorially important.
_RELEVANT_IMAGE_CLASSES = ('header', 'featured', 'hero', 'thumbnail', 'main', 'content')
# Upper bound on the number of images returned by get_relevant_images().
_MAX_RELEVANT_IMAGES = 10


def _score_image(img):
    """Return the relevance score (0-4) of an ``<img>`` tag, or None to skip it."""
    classes = img.get('class', [])
    if any(name in classes for name in _RELEVANT_IMAGE_CLASSES):
        return 4

    # Size-based scoring needs both attributes to be present and non-empty.
    if not (img.get('width') and img.get('height')):
        return 0

    width = parse_dimension(img['width'])
    height = parse_dimension(img['height'])
    if not (width and height):
        # Unparseable (or zero) dimensions: keep the neutral score.
        return 0

    if width >= 2000 and height >= 1000:
        return 3
    if width >= 1600 or height >= 800:
        return 2
    if width >= 800 or height >= 500:
        return 1
    if width >= 500 or height >= 300:
        return 0
    return None  # Declared size is below 500 wide and below 300 high.


def get_relevant_images(soup: BeautifulSoup, url: str) -> list:
    """Extract the most relevant images from a parsed page, best first.

    Every ``<img>`` tag with a ``src`` that resolves (relative to ``url``) to
    an absolute http(s) URL is scored:

    * 4 - has one of the classes in ``_RELEVANT_IMAGE_CLASSES``
    * 3 - width >= 2000 and height >= 1000
    * 2 - width >= 1600 or height >= 800
    * 1 - width >= 800 or height >= 500
    * 0 - anything else, unless the declared size is below 500 wide and
      below 300 high, in which case the image is skipped entirely

    Only images scoring 1 or more are returned. Class-matched images
    (score 4) used to be dropped by the final selection even though they
    carry the highest score; they are now returned first.

    Args:
        soup: Parsed document; only ``find_all('img', src=True)`` is called.
        url: Page URL used to resolve relative ``src`` values.

    Returns:
        A list of ``{'url': ..., 'score': ...}`` dicts ordered by descending
        score, at most 10 entries long. An empty list is returned if any
        exception is raised while processing (the error is logged).
    """
    try:
        scored = []
        for img in soup.find_all('img', src=True):
            img_src = urljoin(url, img['src'])
            # Ignore data:, javascript: and other non-web URLs.
            if not img_src.startswith(('http://', 'https://')):
                continue

            score = _score_image(img)
            if score is None:
                continue  # Skip small images
            scored.append({'url': img_src, 'score': score})

        # sorted() is stable, so document order is kept within each score.
        ranked = sorted(scored, key=lambda item: item['score'], reverse=True)
        relevant = [item for item in ranked if item['score'] >= 1]
        return relevant[:_MAX_RELEVANT_IMAGES]
    except Exception as e:
        # Deliberate best-effort: image extraction must never break the caller.
        logging.error(f"Error in get_relevant_images: {e}")
        return []
def parse_dimension(value: str) -> Optional[int]:
    """Parse an HTML dimension attribute into whole pixels.

    A trailing ``px`` unit (any letter case) is removed, and decimal values
    such as ``"100.5"`` are truncated towards zero.

    Args:
        value: Raw attribute text, e.g. ``"640"``, ``"640px"`` or ``"99.9"``.

    Returns:
        The dimension as an int, or None when the text is not a finite
        number (for example ``"50%"`` or ``"auto"``).
    """
    if value.lower().endswith('px'):
        value = value[:-2]  # Remove 'px' suffix
    try:
        # Go through float so decimal values are accepted; int() alone
        # rejects strings like "100.5".
        return int(float(value))
    except (ValueError, OverflowError) as e:
        # OverflowError covers "inf"; ValueError covers "nan" and non-numeric
        # text. Logged rather than printed, like the rest of this module.
        logging.warning(f"Error parsing dimension value {value}: {e}")
        return None
def extract_title(soup: BeautifulSoup) -> str:
    """Extract the page title from the BeautifulSoup object.

    Args:
        soup: Parsed document whose ``title`` attribute is inspected.

    Returns:
        The value of ``soup.title.string``, or ``""`` when the document has
        no title or the title's ``string`` is None.
    """
    title_tag = soup.title
    if not title_tag:
        return ""
    # .string can be None even when the tag exists; fall back to "" so the
    # declared str return type always holds.
    return title_tag.string or ""
def get_image_hash(image_url: str) -> str:
    """Build an MD5 fingerprint for an image URL.

    The fingerprint covers the last path segment of the URL followed by every
    value of the ``url`` query parameter, in order. Scheme, host, the rest of
    the path and all other query parameters are ignored.

    Args:
        image_url: URL of the image.

    Returns:
        The hexadecimal MD5 digest of the identifier, or None if any
        exception is raised while computing it (the error is logged).
    """
    try:
        url_parts = urlparse(image_url)

        # Everything after the final '/' in the path.
        basename = url_parts.path.rpartition('/')[2]

        # Values of the 'url' query parameter (e.g. CDN-served images).
        proxied_sources = parse_qs(url_parts.query).get('url', [])

        fingerprint = ''.join([basename, *proxied_sources])
        digest = hashlib.md5(fingerprint.encode())
        return digest.hexdigest()
    except Exception as e:
        logging.error(f"Error calculating image hash for {image_url}: {e}")
        return None