# benchmark.py (Final Version)
import json
import math
import statistics
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from geo_bot import GeoBot
from config import DATA_PATHS, MODELS_CONFIG, SUCCESS_THRESHOLD_KM


class MapGuesserBenchmark:
    """Runs GeoBot against the golden-label locations and scores its guesses."""

    def __init__(self, headless: bool = False):
        self.headless = headless
        self.golden_labels = self.load_golden_labels()
        print(f"šŸ“Š Loaded {len(self.golden_labels)} golden label samples")

    def load_golden_labels(self) -> List[Dict]:
        try:
            with open(DATA_PATHS["golden_labels"], "r") as f:
                return json.load(f).get("samples", [])
        except Exception:
            return []

    def get_model_class(self, model_name: str):
        config = MODELS_CONFIG.get(model_name)
        if not config:
            raise ValueError(f"Unknown model: {model_name}")

        class_name, model_class_name = config["class"], config["model_name"]
        # Import lazily so only the provider SDK that is actually used must be installed.
        if class_name == "ChatOpenAI":
            from langchain_openai import ChatOpenAI

            return ChatOpenAI, model_class_name
        if class_name == "ChatAnthropic":
            from langchain_anthropic import ChatAnthropic

            return ChatAnthropic, model_class_name
        if class_name == "ChatGoogleGenerativeAI":
            from langchain_google_genai import ChatGoogleGenerativeAI

            return ChatGoogleGenerativeAI, model_class_name
        raise ValueError(f"Unknown model class: {class_name}")

    def calculate_distance(
        self, true_coords: Dict, predicted_coords: Optional[Tuple[float, float]]
    ) -> Optional[float]:
        """Calculates the distance in km between true (lat, lng) and predicted (lat, lng)."""
        if not predicted_coords:
            return None
        try:
            true_lat, true_lng = true_coords["lat"], true_coords["lng"]
            pred_lat, pred_lng = predicted_coords

            # Haversine formula; R is the Earth's mean radius in km.
            R = 6371
            lat1, lon1, lat2, lon2 = map(
                math.radians, [true_lat, true_lng, pred_lat, pred_lng]
            )
            dlat = lat2 - lat1
            dlon = lon2 - lon1
            a = (
                math.sin(dlat / 2) ** 2
                + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
            )
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
            return R * c
        except (TypeError, KeyError, IndexError) as e:
            print(f"Error in distance calculation: {e}")
            return None

    def run_benchmark(
        self,
        models: Optional[List[str]] = None,
        max_samples: Optional[int] = None,
        **kwargs,
    ) -> Dict:
        if not self.golden_labels:
            raise ValueError("No golden labels available.")

        models_to_test = models or list(MODELS_CONFIG.keys())
        test_samples = self.golden_labels[:max_samples]

        print("šŸš€ Starting LIVE benchmark:")
        print(f"   Models: {models_to_test}")
        print(f"   Samples: {len(test_samples)}")

        all_results = []
        for model_name in models_to_test:
            print(f"\nšŸ¤– Testing model: {model_name}")
            model_class, model_class_name = self.get_model_class(model_name)
            try:
                with GeoBot(
                    model=model_class,
                    model_name=model_class_name,
                    use_selenium=True,
                    headless=self.headless,
                ) as bot:
                    for i, sample in enumerate(test_samples):
                        print(f"   šŸ“ Sample {i + 1}/{len(test_samples)}")
                        try:
                            result = self.run_single_test_with_bot(bot, sample)
                            all_results.append(result)
                            status = (
                                "āœ… Success" if result.get("success") else "āŒ Failed"
                            )
                            distance = result.get("distance_km")
                            dist_str = (
                                f"{distance:.1f} km" if distance is not None else "N/A"
                            )
                            print(f"   {status} (Distance: {dist_str})")
                        except KeyboardInterrupt:
                            print("\nā¹ļø Benchmark inner loop interrupted.")
                            raise
                        except Exception as e:
                            print(f"   āŒ Test failed with unhandled exception: {e}")
                            all_results.append(
                                {
                                    "model": model_name,
                                    "sample_id": sample["id"],
                                    "success": False,
                                    "error": str(e),
                                }
                            )
            except KeyboardInterrupt:
                print("\nā¹ļø Benchmark outer loop interrupted.")
                break

        self.save_results(all_results)
        return self.generate_summary(all_results)

    def run_single_test_with_bot(self, bot: GeoBot, location_data: Dict) -> Dict:
        start_time = time.time()
        assert bot.controller is not None

        if not bot.controller.load_location_from_data(location_data):
            return {
                "success": False,
                "error": "Failed to load location",
                "model": bot.model_name,
                "sample_id": location_data["id"],
            }

        screenshot = bot.take_screenshot()
        if not screenshot:
            return {
                "success": False,
                "error": "Failed to take screenshot",
                "model": bot.model_name,
                "sample_id": location_data["id"],
            }

        predicted_lat_lon = bot.analyze_image(screenshot)
        inference_time = time.time() - start_time

        true_coords = location_data["coordinates"]
        distance_km = self.calculate_distance(true_coords, predicted_lat_lon)
        is_success = distance_km is not None and distance_km <= SUCCESS_THRESHOLD_KM

        return {
            "sample_id": location_data["id"],
            "model": bot.model_name,
            "true_coordinates": true_coords,
            "predicted_coordinates": predicted_lat_lon,
            "distance_km": distance_km,
            "inference_time": inference_time,
            "success": is_success,
        }

    def save_results(self, results: List[Dict]):
        if not results:
            return
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            results_dir = Path(DATA_PATHS["results"])
            results_dir.mkdir(parents=True, exist_ok=True)
            results_file = results_dir / f"benchmark_results_{timestamp}.json"

            output_data = {
                "metadata": {"timestamp": datetime.now().isoformat()},
                "results": results,
            }
            with open(results_file, "w") as f:
                json.dump(output_data, f, indent=2, default=str)
            print(f"šŸ’¾ Results saved to {results_file}")
        except Exception as e:
            print(f"āŒ Error saving results: {e}")

    def generate_summary(self, results: List[Dict]) -> Dict:
        summary = {}
        by_model: Dict[str, List[Dict]] = {}
        for r in results:
            by_model.setdefault(r.get("model", "unknown"), []).append(r)

        for model, model_results in by_model.items():
            if not model_results:
                continue
            successful_runs = [r for r in model_results if r.get("success")]
            distances = [
                r["distance_km"]
                for r in model_results
                if r.get("distance_km") is not None
            ]
            summary[model] = {
                "success_rate": len(successful_runs) / len(model_results),
                "average_distance_km": sum(distances) / len(distances)
                if distances
                else None,
                "median_distance_km": statistics.median(distances)
                if distances
                else None,
                "min_distance_km": min(distances) if distances else None,
                "max_distance_km": max(distances) if distances else None,
            }
        return summary
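

# --- Example entry point (a minimal usage sketch, not part of the original module) ---
# Assumes config.MODELS_CONFIG defines a key such as "gpt-4o"; substitute whatever
# model names your config.py actually provides.
if __name__ == "__main__":
    benchmark = MapGuesserBenchmark(headless=True)
    # Smoke test: one (assumed) model against the first three golden-label samples.
    summary = benchmark.run_benchmark(models=["gpt-4o"], max_samples=3)
    print(json.dumps(summary, indent=2, default=str))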