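"""Streamlit front end for the MapCrunch AI agent benchmark.

Runs a vision-language model as a Street View agent: for each dataset
sample the bot looks around, moves, and finally guesses coordinates,
which are scored against the golden labels.
"""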
import streamlit as st
import json
import os
import time
from io import BytesIO
from PIL import Image
from pathlib import Path
from geo_bot import GeoBot, AGENT_PROMPT_TEMPLATE
from benchmark import MapGuesserBenchmark
from config import MODELS_CONFIG, get_data_paths, SUCCESS_THRESHOLD_KM
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from hf_chat import HuggingFaceChat
# Simple API key setup
if "OPENAI_API_KEY" in st.secrets:
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
if "ANTHROPIC_API_KEY" in st.secrets:
os.environ["ANTHROPIC_API_KEY"] = st.secrets["ANTHROPIC_API_KEY"]
if "GOOGLE_API_KEY" in st.secrets:
os.environ["GOOGLE_API_KEY"] = st.secrets["GOOGLE_API_KEY"]
if "HF_TOKEN" in st.secrets:
os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
def get_available_datasets():
    datasets_dir = Path("datasets")
    if not datasets_dir.exists():
        return ["default"]
    datasets = []
    for dataset_dir in datasets_dir.iterdir():
        if dataset_dir.is_dir():
            data_paths = get_data_paths(dataset_dir.name)
            if os.path.exists(data_paths["golden_labels"]):
                datasets.append(dataset_dir.name)
    return datasets if datasets else ["default"]
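
# Map the class name stored in MODELS_CONFIG to the actual chat-model class,
# e.g. get_model_class("ChatOpenAI") -> langchain_openai.ChatOpenAI.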
def get_model_class(class_name):
    if class_name == "ChatOpenAI":
        return ChatOpenAI
    elif class_name == "ChatAnthropic":
        return ChatAnthropic
    elif class_name == "ChatGoogleGenerativeAI":
        return ChatGoogleGenerativeAI
    elif class_name == "HuggingFaceChat":
        return HuggingFaceChat
    else:
        raise ValueError(f"Unknown model class: {class_name}")
# UI Setup
st.set_page_config(page_title="MapCrunch AI Agent", layout="wide")
st.title("🗺️ MapCrunch AI Agent")
# Sidebar
with st.sidebar:
st.header("Configuration")
dataset_choice = st.selectbox("Dataset", get_available_datasets())
model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
steps_per_sample = st.slider("Max Steps", 3, 20, 10)
# Load dataset
data_paths = get_data_paths(dataset_choice)
with open(data_paths["golden_labels"], "r") as f:
golden_labels = json.load(f).get("samples", [])
st.info(f"Dataset has {len(golden_labels)} samples")
num_samples = st.slider(
"Samples to Test", 1, len(golden_labels), min(3, len(golden_labels))
)
start_button = st.button("πŸš€ Start", type="primary")
# Main Logic
if start_button:
    test_samples = golden_labels[:num_samples]
    config = MODELS_CONFIG[model_choice]
    model_class = get_model_class(config["class"])
    benchmark_helper = MapGuesserBenchmark(dataset_name=dataset_choice)
    all_results = []
    progress_bar = st.progress(0)

    with GeoBot(
        model=model_class, model_name=config["model_name"], headless=True
    ) as bot:
        for i, sample in enumerate(test_samples):
            st.divider()
            st.header(f"Sample {i + 1}/{num_samples} - ID: {sample.get('id', 'N/A')}")
            bot.controller.load_location_from_data(sample)
            bot.controller.setup_clean_environment()

            # Create scrollable container for this sample
            sample_container = st.container()
            with sample_container:
                # Initialize step tracking
                history = []
                final_guess = None
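
                # Agent loop: screenshot -> prompt the model with the step
                # history -> execute its chosen action.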
                for step in range(steps_per_sample):
                    step_num = step + 1

                    # Create step container
                    with st.container():
                        st.subheader(f"Step {step_num}/{steps_per_sample}")

                        # Take screenshot and show
                        bot.controller.label_arrows_on_screen()
                        screenshot_bytes = bot.controller.take_street_view_screenshot()

                        col1, col2 = st.columns([1, 2])
                        with col1:
                            st.image(
                                screenshot_bytes,
                                caption="What the AI sees",
                                use_column_width=True,
                            )
                        with col2:
                            # Build history for AI
                            current_step = {
                                "image_b64": bot.pil_to_base64(
                                    Image.open(BytesIO(screenshot_bytes))
                                ),
                                "action": "N/A",
                            }
                            history.append(current_step)

                            available_actions = bot.controller.get_available_actions()
                            history_text = "\n".join(
                                f"Step {j + 1}: {h['action']}"
                                for j, h in enumerate(history[:-1])
                            )
                            if not history_text:
                                history_text = "First step."

                            prompt = AGENT_PROMPT_TEMPLATE.format(
                                remaining_steps=steps_per_sample - step,
                                history_text=history_text,
                                available_actions=json.dumps(available_actions),
                            )

                            # Show AI context
                            st.write("**Available Actions:**")
                            st.code(json.dumps(available_actions, indent=2))
                            st.write("**AI Context:**")
                            st.text_area(
                                "History",
                                history_text,
                                height=100,
                                disabled=True,
                                key=f"history_{i}_{step}",
                            )

                            # Force guess on last step or get AI decision
                            if step_num == steps_per_sample:
                                action = "GUESS"
                                st.warning("Max steps reached. Forcing GUESS.")
                            else:
                                # Get AI response
                                message = bot._create_message_with_history(
                                    prompt, [h["image_b64"] for h in history]
                                )
                                response = bot.model.invoke(message)
                                decision = bot._parse_agent_response(response)
                                if decision is None:
                                    raise ValueError(
                                        f"Failed to parse AI response: {response.content}"
                                    )
                                action = decision["action_details"]["action"]
                                history[-1]["action"] = action

                                # Show AI decision
                                st.write("**AI Reasoning:**")
                                st.info(decision.get("reasoning", "N/A"))
                                st.write("**AI Action:**")
                                st.success(f"`{action}`")

                                # Show raw response
                                with st.expander("Raw AI Response"):
                                    st.text(response.content)
                        # Execute action
                        if action == "GUESS":
                            if step_num == steps_per_sample:
                                # Forced guess - use fallback coordinates
                                lat, lon = 0.0, 0.0
                                st.error("Forced guess with fallback coordinates")
                            else:
                                lat = decision.get("action_details", {}).get("lat")
                                lon = decision.get("action_details", {}).get("lon")
                            if lat is not None and lon is not None:
                                final_guess = (lat, lon)
                                st.success(f"Final Guess: {lat:.4f}, {lon:.4f}")
                            break
                        elif action == "MOVE_FORWARD":
                            bot.controller.move("forward")
                        elif action == "MOVE_BACKWARD":
                            bot.controller.move("backward")
                        elif action == "PAN_LEFT":
                            bot.controller.pan_view("left")
                        elif action == "PAN_RIGHT":
                            bot.controller.pan_view("right")
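                        # Any other action string is silently ignored.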
                        # Brief pause so this step's output renders before the next iteration
                        st.empty()
                        time.sleep(1)
                # Sample Results
                st.subheader("Sample Result")
                true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
                distance_km = None
                is_success = False
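
                # Score the guess: success means it falls within
                # SUCCESS_THRESHOLD_KM of the true location.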
                if final_guess:
                    distance_km = benchmark_helper.calculate_distance(
                        true_coords, final_guess
                    )
                    if distance_km is not None:
                        is_success = distance_km <= SUCCESS_THRESHOLD_KM

                    col1, col2, col3 = st.columns(3)
                    col1.metric(
                        "Final Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}"
                    )
                    col2.metric(
                        "Ground Truth",
                        f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}",
                    )
                    col3.metric(
                        "Distance",
                        f"{distance_km:.1f} km" if distance_km is not None else "N/A",
                        delta="Success" if is_success else "Failed",
                    )
                else:
                    st.error("No final guess made")
                all_results.append(
                    {
                        "sample_id": sample.get("id"),
                        "model": model_choice,
                        "true_coordinates": true_coords,
                        "predicted_coordinates": final_guess,
                        "distance_km": distance_km,
                        "success": is_success,
                    }
                )

            progress_bar.progress((i + 1) / num_samples)
    # Final Summary
    st.divider()
    st.header("🏁 Final Results")
    summary = benchmark_helper.generate_summary(all_results)
    if summary and model_choice in summary:
        stats = summary[model_choice]

        # Overall metrics
        col1, col2, col3 = st.columns(3)
        col1.metric("Success Rate", f"{stats.get('success_rate', 0) * 100:.1f}%")
        col2.metric("Average Distance", f"{stats.get('average_distance_km', 0):.1f} km")
        col3.metric("Total Samples", len(all_results))

        # Detailed results table
        st.subheader("Detailed Results")
        st.dataframe(all_results, use_container_width=True)

        # Success breakdown
        successes = [r for r in all_results if r["success"]]
        failures = [r for r in all_results if not r["success"]]
        if successes:
            st.subheader("Successful Samples")
            st.dataframe(successes, use_container_width=True)
        if failures:
            st.subheader("Failed Samples")
            st.dataframe(failures, use_container_width=True)
    else:
        st.error("Could not generate summary")
        st.dataframe(all_results, use_container_width=True)