File size: 14,090 Bytes
6fda968
233b170
 
 
 
 
6fda968
bf35ece
233b170
f2b6ded
6fda968
 
 
 
f2b6ded
233b170
bf35ece
233b170
 
 
 
bf35ece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2b6ded
6fda968
 
f2b6ded
 
 
233b170
f2b6ded
6fda968
f2b6ded
233b170
f2b6ded
6fda968
 
 
233b170
bf35ece
 
 
 
f2b6ded
6fda968
f2b6ded
6fda968
233b170
bf35ece
 
6fda968
bf35ece
6fda968
 
bf35ece
 
 
6fda968
bf35ece
 
 
 
6fda968
 
f2b6ded
bf35ece
f2b6ded
6fda968
 
233b170
6fda968
f2b6ded
6fda968
233b170
f2b6ded
6fda968
f2b6ded
6fda968
233b170
6fda968
 
 
233b170
f2b6ded
bf35ece
6fda968
233b170
6fda968
bf35ece
6fda968
233b170
f2b6ded
6fda968
f2b6ded
 
 
6fda968
 
f2b6ded
6fda968
 
 
f2b6ded
6fda968
 
f2b6ded
6fda968
 
 
 
f2b6ded
6fda968
 
 
 
 
 
 
f2b6ded
6fda968
 
4d37e51
6fda968
 
 
 
4d37e51
233b170
6fda968
233b170
f2b6ded
6fda968
 
4d37e51
 
6fda968
4d37e51
 
 
6fda968
 
f2b6ded
4d37e51
 
 
 
 
 
 
233b170
f2b6ded
4d37e51
 
 
 
 
 
 
6fda968
 
4d37e51
 
6fda968
4d37e51
 
 
 
 
 
 
 
 
 
 
6fda968
 
 
4d37e51
 
6fda968
 
233b170
6fda968
 
 
4d37e51
6fda968
233b170
6fda968
 
4d37e51
 
 
 
 
233b170
4d37e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fda968
f2b6ded
6fda968
4d37e51
6fda968
4d37e51
 
 
 
 
 
 
 
6fda968
f2b6ded
6fda968
 
 
 
 
 
 
 
f2b6ded
4d37e51
f2b6ded
 
6fda968
 
4d37e51
 
6fda968
4d37e51
 
6fda968
4d37e51
 
6fda968
4d37e51
 
6fda968
f2b6ded
6fda968
f2b6ded
6fda968
 
 
 
 
 
 
 
 
f2b6ded
6fda968
 
f2b6ded
233b170
6fda968
f2b6ded
6fda968
233b170
6fda968
f2b6ded
6fda968
f2b6ded
6fda968
 
 
f2b6ded
6fda968
 
 
 
 
 
 
 
 
 
 
233b170
f2b6ded
6fda968
f2b6ded
 
6fda968
 
f2b6ded
 
6fda968
f2b6ded
6fda968
 
 
 
 
f2b6ded
 
 
 
 
 
 
6fda968
f2b6ded
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
import streamlit as st
import json
import os
import time
from io import BytesIO
from PIL import Image
from typing import Dict, List, Any
from pathlib import Path

# Import core logic and configurations from the project
from geo_bot import (
    GeoBot,
    AGENT_PROMPT_TEMPLATE,
    BENCHMARK_PROMPT,
)
from benchmark import MapGuesserBenchmark
from config import MODELS_CONFIG, get_data_paths, SUCCESS_THRESHOLD_KM
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI


# --- Helper function ---
def get_available_datasets():
    """Return the names of the datasets that can be benchmarked.

    A dataset is a sub-directory of ``datasets/`` (relative to the current
    working directory) whose golden-labels file -- the path reported by
    ``get_data_paths(name)["golden_labels"]`` -- exists on disk.

    Returns:
        A sorted list of dataset names, or ``["default"]`` when the
        ``datasets`` directory is missing, is not a directory, or contains
        no usable dataset.
    """
    datasets_dir = Path("datasets")
    # is_dir() (rather than exists()) also covers a stray *file* named
    # "datasets", which would otherwise make iterdir() raise.
    if not datasets_dir.is_dir():
        return ["default"]

    # iterdir() yields entries in arbitrary order; sort so the list shown in
    # the UI (and its default selection) is stable across runs and platforms.
    datasets = sorted(
        entry.name
        for entry in datasets_dir.iterdir()
        if entry.is_dir()
        and os.path.exists(get_data_paths(entry.name)["golden_labels"])
    )
    return datasets or ["default"]


# --- Page UI Setup ---
# Configure the page, then render the main title and a one-line description.
st.set_page_config(page_title="MapCrunch AI Agent", layout="wide")
# The title emoji had been corrupted into mojibake; restored to the map emoji.
st.title("🗺️ MapCrunch AI Agent")
st.caption(
    "An AI agent that explores and identifies geographic locations through multi-step interaction."
)

# --- Sidebar for Configuration ---
# Collects every run parameter. The names defined here (dataset_choice,
# model_choice, steps_per_sample, golden_labels, num_samples_to_run and
# start_button) are read by the execution section further down the script.
with st.sidebar:
    st.header("⚙️ Agent Configuration")

    # Get API keys from HF Secrets (must be set in Space settings when deploying).
    # A missing secret must not wipe a key that is already exported in the
    # environment, so fall back to the current value ("" when there is none,
    # which keeps the variable defined exactly as before).
    for key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
        try:
            secret_value = st.secrets.get(key_name, "")
        except FileNotFoundError:
            # NOTE(review): Streamlit is expected to raise a
            # FileNotFoundError-derived error when no secrets file exists
            # (e.g. a local run) -- confirm for the pinned version.
            secret_value = ""
        os.environ[key_name] = secret_value or os.environ.get(key_name, "")
    # os.environ['GOOGLE_API_KEY'] = st.secrets.get("GOOGLE_API_KEY", "")

    # Dataset selection
    available_datasets = get_available_datasets()
    dataset_choice = st.selectbox("Select Dataset", available_datasets)

    model_choice = st.selectbox("Select AI Model", list(MODELS_CONFIG.keys()))
    steps_per_sample = st.slider(
        "Max Exploration Steps per Sample", min_value=3, max_value=20, value=10
    )

    # Load golden labels for selected dataset. Both names default to the
    # "nothing to run" state so every failure path leaves them defined.
    data_paths = get_data_paths(dataset_choice)
    golden_labels = []
    num_samples_to_run = 0
    try:
        with open(data_paths["golden_labels"], "r", encoding="utf-8") as f:
            golden_labels = json.load(f).get("samples", []) or []
    except FileNotFoundError:
        st.error(
            f"Dataset '{dataset_choice}' not found at {data_paths['golden_labels']}. Please create the dataset first."
        )
    except json.JSONDecodeError as exc:
        st.error(
            f"Dataset '{dataset_choice}' at {data_paths['golden_labels']} is not valid JSON: {exc}"
        )
    else:
        total_samples = len(golden_labels)

        st.info(f"Dataset '{dataset_choice}' has {total_samples} samples")

        if total_samples > 1:
            num_samples_to_run = st.slider(
                "Number of Samples to Test",
                min_value=1,
                max_value=total_samples,
                value=min(3, total_samples),
            )
        elif total_samples == 1:
            # A slider needs a real range to choose from; with a single
            # sample there is nothing to pick, so skip the widget.
            num_samples_to_run = 1
        else:
            # An empty dataset would otherwise ask the slider for
            # max_value=0 < min_value=1.
            st.warning(f"Dataset '{dataset_choice}' contains no samples.")

    start_button = st.button(
        "🚀 Start Agent Benchmark", disabled=(num_samples_to_run == 0), type="primary"
    )

# --- Agent Execution Logic ---
def _guess_coordinates(action_details):
    """Return ``(lat, lon)`` as floats from a GUESS payload, or ``None``.

    ``None`` is returned when the payload is not a mapping, when either
    coordinate is missing, or when a coordinate cannot be converted to
    ``float``. A legitimate ``0.0`` coordinate is preserved.
    """
    try:
        return float(action_details["lat"]), float(action_details["lon"])
    except (KeyError, TypeError, ValueError):
        return None


# Runs once per click on the start button: for each selected sample the agent
# explores for up to `steps_per_sample` steps, makes (or is forced to make) a
# guess, and the guess is scored against the ground-truth coordinates.
if start_button:
    # Prepare the environment
    test_samples = golden_labels[:num_samples_to_run]

    config = MODELS_CONFIG[model_choice]
    # The config stores the chat-model class by *name*; the classes are
    # imported at the top of this file, so resolve the name via globals().
    model_class = globals()[config["class"]]
    model_instance_name = config["model_name"]

    # Initialize helpers and result lists
    benchmark_helper = MapGuesserBenchmark(dataset_name=dataset_choice)
    all_results = []

    st.info(
        f"Starting Agent Benchmark... Dataset: {dataset_choice}, Model: {model_choice}, Steps: {steps_per_sample}, Samples: {num_samples_to_run}"
    )

    overall_progress_bar = st.progress(0, text="Overall Progress")

    # Initialize the bot outside the loop to reuse the browser instance for efficiency
    with st.spinner("Initializing browser and AI model..."):
        # Note: Must run in headless mode on HF Spaces
        bot = GeoBot(model=model_class, model_name=model_instance_name, headless=True)

    # try/finally guarantees the browser is closed even if a step raises.
    try:
        # Main loop to iterate through all selected test samples
        for i, sample in enumerate(test_samples):
            sample_id = sample.get("id", "N/A")
            st.divider()
            st.header(
                f"▶️ Running Sample {i + 1}/{num_samples_to_run} (ID: {sample_id})"
            )

            if not bot.controller.load_location_from_data(sample):
                st.error(f"Failed to load location for sample {sample_id}. Skipping.")
                # Keep the progress bar in step with skipped samples too.
                overall_progress_bar.progress(
                    (i + 1) / num_samples_to_run,
                    text=f"Overall Progress: {i + 1}/{num_samples_to_run}",
                )
                continue

            bot.controller.setup_clean_environment()

            # Create the visualization layout for the current sample
            col1, col2 = st.columns([2, 3])
            with col1:
                image_placeholder = st.empty()
            with col2:
                reasoning_placeholder = st.empty()
                action_placeholder = st.empty()

            # --- Inner agent exploration loop ---
            history = []
            final_guess = None
            step_history_container = st.container()

            for step in range(steps_per_sample):
                step_num = step + 1
                reasoning_placeholder.info(
                    f"🤔 Thinking... (Step {step_num}/{steps_per_sample})"
                )
                action_placeholder.empty()

                # Observe and label arrows
                bot.controller.label_arrows_on_screen()
                screenshot_bytes = bot.controller.take_street_view_screenshot()

                # Current view
                image_placeholder.image(
                    screenshot_bytes,
                    caption=f"🔍 Step {step_num} - What AI Sees Now",
                    use_column_width=True,
                )

                # Update history
                current_step_data = {
                    "image_b64": bot.pil_to_base64(
                        Image.open(BytesIO(screenshot_bytes))
                    ),
                    "action": "N/A",
                    "screenshot_bytes": screenshot_bytes,
                    "step_num": step_num,
                }
                history.append(current_step_data)

                # Think
                available_actions = bot.controller.get_available_actions()
                # Only *previous* steps go into the text history; the current
                # step has no action yet.
                history_text = "\n".join(
                    f"Step {j + 1}: {h['action']}"
                    for j, h in enumerate(history[:-1])
                )
                if not history_text:
                    history_text = "No history yet. This is the first step."

                prompt = AGENT_PROMPT_TEMPLATE.format(
                    remaining_steps=steps_per_sample - step,
                    history_text=history_text,
                    available_actions=json.dumps(available_actions),
                )

                # Show what AI is considering. An st.empty() slot holds a
                # single element, so group the elements in a container to
                # keep all of them visible.
                with reasoning_placeholder.container():
                    st.info("🧠 **AI is analyzing the situation...**")
                    with st.expander("🔍 Available Actions", expanded=False):
                        st.json(available_actions)
                    with st.expander("📝 Context Being Considered", expanded=False):
                        st.text_area(
                            "History Context:",
                            history_text,
                            height=100,
                            disabled=True,
                            # Explicit key: step 1 of every sample shows the
                            # same text, which would otherwise produce
                            # identical auto-generated widget IDs.
                            key=f"history_context_{i}_{step_num}",
                        )

                message = bot._create_message_with_history(
                    prompt, [h["image_b64"] for h in history]
                )

                # Get AI response
                response = bot.model.invoke(message)
                decision = bot._parse_agent_response(response)

                if not decision:  # Fallback
                    decision = {
                        "action_details": {"action": "PAN_RIGHT"},
                        "reasoning": "⚠️ Response parsing failed. Using default recovery action.",
                    }

                details = decision.get("action_details") or {}
                action = details.get("action")
                guess_coords = _guess_coordinates(details)
                reasoning = str(decision.get("reasoning", "N/A"))
                # Coerce non-string content to text so it can be truncated
                # and displayed.
                raw_text = (
                    response.content
                    if isinstance(response.content, str)
                    else str(response.content)
                )

                history[-1]["action"] = action
                history[-1]["reasoning"] = reasoning
                history[-1]["raw_response"] = (
                    raw_text[:500] + "..." if len(raw_text) > 500 else raw_text
                )

                # Display AI's decision process
                reasoning_placeholder.success("✅ **AI Decision Made!**")

                with action_placeholder.container():
                    st.success(f"🎯 **AI Action:** `{action}`")

                    # Detailed reasoning display
                    with st.expander("🧠 AI's Detailed Thinking Process", expanded=True):
                        col_reason, col_raw = st.columns([2, 1])

                        with col_reason:
                            st.markdown("**🤔 AI's Reasoning:**")
                            st.info(reasoning)

                            # `is not None` so that a 0.0 coordinate is shown.
                            if action == "GUESS" and guess_coords is not None:
                                st.success(
                                    f"📍 **Final Guess:** {guess_coords[0]:.4f}, {guess_coords[1]:.4f}"
                                )

                        with col_raw:
                            st.markdown("**🔤 Raw AI Response:**")
                            st.text_area(
                                "Full Response:",
                                history[-1]["raw_response"],
                                height=200,
                                disabled=True,
                                # Include the sample index: step numbers
                                # repeat for every sample in the same run.
                                key=f"raw_response_{i}_{step_num}",
                            )

                # Store step in history display
                with step_history_container:
                    with st.expander(f"📚 Step {step_num} History", expanded=False):
                        hist_col1, hist_col2 = st.columns([1, 2])
                        with hist_col1:
                            st.image(
                                screenshot_bytes,
                                caption=f"Step {step_num} View",
                                width=200,
                            )
                        with hist_col2:
                            st.write(f"**Action:** {action}")
                            st.write(f"**Reasoning:** {reasoning[:150]}...")

                # Force a GUESS on the last step
                if step_num == steps_per_sample and action != "GUESS":
                    st.warning("⏰ Max steps reached. Forcing a GUESS action.")
                    action = "GUESS"
                    # Force coordinates if missing
                    if guess_coords is None:
                        st.error(
                            "❌ AI didn't provide coordinates. Using fallback guess."
                        )
                        guess_coords = (0.0, 0.0)

                # Act
                if action == "GUESS":
                    if guess_coords is not None:
                        final_guess = guess_coords
                    else:
                        st.error(
                            "❌ GUESS action was missing coordinates. Guess failed for this sample."
                        )
                    break  # End exploration for the current sample

                elif action == "MOVE_FORWARD":
                    with st.spinner("🚶 Moving forward..."):
                        bot.controller.move("forward")
                elif action == "MOVE_BACKWARD":
                    with st.spinner("🔄 Moving backward..."):
                        bot.controller.move("backward")
                elif action == "PAN_LEFT":
                    with st.spinner("⬅️ Panning left..."):
                        bot.controller.pan_view("left")
                elif action == "PAN_RIGHT":
                    with st.spinner("➡️ Panning right..."):
                        bot.controller.pan_view("right")

                time.sleep(1)  # A brief pause between steps for better visualization

            # --- End of single sample run, calculate and display results ---
            true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
            distance_km = None
            is_success = False

            if final_guess is not None:
                distance_km = benchmark_helper.calculate_distance(
                    true_coords, final_guess
                )
                if distance_km is not None:
                    is_success = distance_km <= SUCCESS_THRESHOLD_KM

                st.subheader("🎯 Round Result")
                res_col1, res_col2, res_col3 = st.columns(3)
                res_col1.metric(
                    "Final Guess (Lat, Lon)",
                    f"{final_guess[0]:.3f}, {final_guess[1]:.3f}",
                )
                res_col2.metric(
                    "Ground Truth (Lat, Lon)",
                    f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}",
                )
                res_col3.metric(
                    "Distance Error",
                    f"{distance_km:.1f} km" if distance_km is not None else "N/A",
                    delta=f"{'Success' if is_success else 'Failure'}",
                    delta_color=("inverse" if is_success else "off"),
                )
            else:
                st.error("Agent failed to make a final guess.")

            all_results.append(
                {
                    "sample_id": sample_id,
                    "model": model_choice,
                    "true_coordinates": true_coords,
                    "predicted_coordinates": final_guess,
                    "distance_km": distance_km,
                    "success": is_success,
                }
            )

            # Update overall progress bar
            overall_progress_bar.progress(
                (i + 1) / num_samples_to_run,
                text=f"Overall Progress: {i + 1}/{num_samples_to_run}",
            )
    finally:
        bot.close()  # Close the browser

    # --- End of all samples, display final summary ---
    st.divider()
    st.header("🏁 Benchmark Summary")

    summary = benchmark_helper.generate_summary(all_results)
    if summary and model_choice in summary:
        stats = summary[model_choice]
        sum_col1, sum_col2 = st.columns(2)
        sum_col1.metric(
            "Overall Success Rate", f"{stats.get('success_rate', 0) * 100:.1f} %"
        )
        sum_col2.metric(
            "Average Distance Error", f"{stats.get('average_distance_km', 0):.1f} km"
        )
        st.dataframe(all_results)  # Display the detailed results table
    else:
        st.warning("Not enough results to generate a summary.")