Alovestocode committed on
Commit e5713dc · verified · Parent: bf05e9e

Initial scaffold

Files changed (4)
  1. __pycache__/app.cpython-313.pyc +0 -0
  2. app.py +720 -0
  3. requirements.txt +4 -0
  4. space_config.json +13 -0
__pycache__/app.cpython-313.pyc ADDED
Binary file (32.2 kB).
 
app.py ADDED
@@ -0,0 +1,720 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ import sys
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import importlib.util
+ import re
+
+ import gradio as gr
+
+ # Ensure Milestone 5 evaluation utilities are importable when running inside the Space.
+ REPO_ROOT = Path(__file__).resolve().parents[3]
+ EVAL_DIR = REPO_ROOT / "Milestone-5" / "router-agent"
+ if EVAL_DIR.exists():
+     sys.path.insert(0, str(EVAL_DIR))
+
+ try:
+     from schema_score import (  # type: ignore
+         run_schema_evaluation,
+         tool_sequence,
+         todo_covers_all_tools,
+         todo_tool_alignment,
+     )
+ except Exception as exc:  # pragma: no cover - handled gracefully in UI.
+     run_schema_evaluation = None
+     tool_sequence = None
+     todo_covers_all_tools = None
+     todo_tool_alignment = None
+     SCHEMA_IMPORT_ERROR = str(exc)
+ else:
+     SCHEMA_IMPORT_ERROR = ""
+
+ try:
+     from router_benchmark_runner import (  # type: ignore
+         load_thresholds,
+         evaluate_thresholds,
+     )
+ except Exception as exc:  # pragma: no cover
+     load_thresholds = None
+     evaluate_thresholds = None
+     THRESHOLD_IMPORT_ERROR = str(exc)
+ else:
+     THRESHOLD_IMPORT_ERROR = ""
+
+ try:
+     from huggingface_hub import InferenceClient
+ except Exception:  # pragma: no cover
+     InferenceClient = None  # type: ignore
+
+
+ HF_ROUTER_REPO = os.environ.get("HF_ROUTER_REPO", "")
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ BENCH_GOLD_PATH = EVAL_DIR / "benchmarks" / "router_benchmark_hard.jsonl"
+ THRESHOLDS_PATH = EVAL_DIR / "router_benchmark_thresholds.json"
+
+ client = None
+ if HF_ROUTER_REPO and InferenceClient is not None:
+     try:
+         client = InferenceClient(model=HF_ROUTER_REPO, token=HF_TOKEN)
+     except Exception as exc:  # pragma: no cover
+         client = None
+         ROUTER_LOAD_ERROR = str(exc)
+     else:
+         ROUTER_LOAD_ERROR = ""
+ else:
+     ROUTER_LOAD_ERROR = "InferenceClient unavailable or HF_ROUTER_REPO unset."
+
+
+ SYSTEM_PROMPT = (
+     "You are the Router Agent coordinating Math, Code, and General-Search specialists.\n"
+     "Emit ONLY strict JSON with keys route_plan, route_rationale, expected_artifacts,\n"
+     "thinking_outline, handoff_plan, todo_list, difficulty, tags, acceptance_criteria, metrics."
+ )
+
+ AGENT_LOAD_LOG: List[str] = []
+
+
+ def _load_module(module_name: str, file_path: Path):
+     if not file_path.exists():
+         AGENT_LOAD_LOG.append(f"Missing module: {file_path}")
+         return None
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None or spec.loader is None:
+         AGENT_LOAD_LOG.append(f"Unable to load spec for {file_path}")
+         return None
+     module = importlib.util.module_from_spec(spec)
+     try:
+         spec.loader.exec_module(module)  # type: ignore[attr-defined]
+     except Exception as exc:
+         AGENT_LOAD_LOG.append(f"Failed to import {file_path.name}: {exc}")
+         return None
+     return module
+
+
+ M6_ROOT = REPO_ROOT / "Milestone-6"
+ AGENT_BASE_PATH = M6_ROOT / "agents" / "base.py"
+ BASE_MODULE = _load_module("router_agents_base", AGENT_BASE_PATH)
+
+ if BASE_MODULE:
+     AgentRequest = getattr(BASE_MODULE, "AgentRequest", None)
+     AgentResult = getattr(BASE_MODULE, "AgentResult", None)
+ else:
+     AgentRequest = None
+     AgentResult = None
+     AGENT_LOAD_LOG.append("Agent base definitions unavailable; agent execution disabled.")
+
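+ # The scaffold relies on a narrow surface of the Milestone-6 base types:
+ # AgentRequest(user_query=..., context=..., plan_metadata=...) for dispatch, and
+ # AgentResult fields content/citations/artifacts/metrics (read via getattr later,
+ # so richer dataclasses remain compatible).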
+
+ class GeminiFallbackManager:
+     """Fallback generator powered by Gemini 2.5 Pro (if configured)."""
+
+     def __init__(self) -> None:
+         self.available = False
+         self.error: Optional[str] = None
+         self.model = None
+         self.model_name = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro-exp-0801")
+         api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
+         try:
+             import google.generativeai as genai  # type: ignore
+         except Exception as exc:  # pragma: no cover
+             self.error = f"google-generativeai import failed: {exc}"
+             AGENT_LOAD_LOG.append(f"Gemini fallback disabled: {self.error}")
+             return
+         if not api_key:
+             self.error = "GOOGLE_API_KEY (or GEMINI_API_KEY) not set."
+             AGENT_LOAD_LOG.append(f"Gemini fallback disabled: {self.error}")
+             return
+         try:
+             genai.configure(api_key=api_key)
+             self.model = genai.GenerativeModel(self.model_name)
+         except Exception as exc:  # pragma: no cover
+             self.error = f"Failed to initialise Gemini model: {exc}"
+             AGENT_LOAD_LOG.append(f"Gemini fallback disabled: {self.error}")
+             return
+         self.available = True
+         AGENT_LOAD_LOG.append(f"Gemini fallback ready (model={self.model_name}).")
+
+     def generate(self, tool_name: str, request: Any, error: Optional[str] = None) -> Any:
+         if not self.available or self.model is None or AgentResult is None:
+             raise RuntimeError("Gemini fallback not available.")
+         if isinstance(request, dict):
+             context = request.get("context") or {}
+             step_instruction = request.get("user_query", "")
+         else:
+             context = getattr(request, "context", {}) or {}
+             step_instruction = getattr(request, "user_query", "")
+         original_query = context.get("original_query", "")
+
+         prompt = (
+             f"You are the fallback specialist for router tool `{tool_name}`.\n"
+             "Provide a thoughtful, self-contained response even when primary agents fail.\n"
+             "Instructions:\n"
+             "- Derive or explain any mathematics rigorously with step-by-step reasoning.\n"
+             "- When code is required, output Python snippets and describe expected outputs; "
+             "assume execution in a safe environment but do not fabricate results without caveats.\n"
+             "- When internet search is needed, hypothesise likely high-quality sources and cite them "
+             "as inline references (e.g., [search:keyword] or known publications).\n"
+             "- Make assumptions explicit, and flag any gaps that require real execution or live search.\n"
+             "- Return the final answer in Markdown.\n"
+         )
+         prompt += f"\nOriginal user query:\n{original_query or 'N/A'}\n"
+         prompt += f"\nCurrent routed instruction:\n{step_instruction}\n"
+         if error:
+             prompt += f"\nPrevious agent error: {error}\n"
+         try:
+             response = self.model.generate_content(
+                 prompt,
+                 generation_config={"temperature": 0.2, "top_p": 0.8},
+             )
+             text = getattr(response, "text", None)
+             if text is None and hasattr(response, "candidates"):
+                 text = response.candidates[0].content.parts[0].text  # type: ignore
+         except Exception as exc:  # pragma: no cover
+             raise RuntimeError(f"Gemini fallback generation failed: {exc}") from exc
+         if not text:
+             text = "Fallback model did not return content."
+         metrics = {"status": "fallback", "model": self.model_name}
+         if error:
+             metrics["upstream_error"] = error
+         return AgentResult(content=text, metrics=metrics)
+
+
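+ # Constructed at import time so AGENT_LOAD_LOG records Gemini availability (or the
+ # reason it is disabled) before the Gradio UI renders the agent load summary.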
+ fallback_manager = GeminiFallbackManager()
+
+
+ def _load_agent_class(
+     agent_name: str,
+     primary_path: Path,
+     primary_class: str,
+     fallback_path: Optional[Path] = None,
+     fallback_class: Optional[str] = None,
+ ):
+     module = _load_module(f"{agent_name}_primary", primary_path)
+     if module and hasattr(module, primary_class):
+         AGENT_LOAD_LOG.append(f"Loaded {primary_class} from {primary_path}")
+         return getattr(module, primary_class)
+     if fallback_path and fallback_class:
+         fallback_module = _load_module(f"{agent_name}_fallback", fallback_path)
+         if fallback_module and hasattr(fallback_module, fallback_class):
+             AGENT_LOAD_LOG.append(f"Using fallback {fallback_class} for {agent_name}")
+             return getattr(fallback_module, fallback_class)
+     AGENT_LOAD_LOG.append(f"No implementation available for {agent_name}")
+     return None
+
+
+ AGENT_REGISTRY: Dict[str, Any] = {}
+
+
+ def _register_agent(name: str, agent_obj: Any) -> None:
+     AGENT_REGISTRY[name] = agent_obj
+     if name.startswith("/"):
+         AGENT_REGISTRY[name.lstrip("/")] = agent_obj
+     else:
+         AGENT_REGISTRY[f"/{name}"] = agent_obj
+
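+ # Agents below are registered under both "/name" and "name"; execute_plan() can then
+ # resolve a parsed tool token whether or not it retains the leading slash.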
+
+ if AgentRequest is not None and AgentResult is not None:
+     # Math agent
+     math_class = _load_agent_class(
+         "math_agent",
+         M6_ROOT / "math-agent" / "handler.py",
+         "MathAgent",
+         fallback_path=M6_ROOT / "math-agent" / "math_agent_template.py",
+         fallback_class="TemplateMathAgent",
+     )
+     # Code agent
+     code_class = _load_agent_class(
+         "code_agent",
+         M6_ROOT / "code-agent" / "handler.py",
+         "CodeAgent",
+     )
+     # General-search agent
+     general_class = _load_agent_class(
+         "general_agent",
+         M6_ROOT / "general-agent" / "handler.py",
+         "GeneralSearchAgent",
+     )
+
+     class _StubAgent:
+         def __init__(self, tool_name: str, message: str):
+             self.name = tool_name
+             self._message = message
+
+         def invoke(self, request: Any) -> Any:
+             if fallback_manager.available:
+                 try:
+                     return fallback_manager.generate(self.name, request)
+                 except Exception as exc:  # pragma: no cover
+                     AGENT_LOAD_LOG.append(f"Gemini fallback failed for {self.name}: {exc}")
+             return AgentResult(
+                 content=self._message,
+                 metrics={"status": "stub", "tool": self.name},
+             )
+
+     if math_class is None:
+         math_agent = _StubAgent("/math", "Math agent not yet implemented.")
+     else:
+         try:
+             math_agent = math_class()
+         except Exception as exc:
+             AGENT_LOAD_LOG.append(f"MathAgent instantiation failed: {exc}")
+             math_agent = _StubAgent("/math", f"Math agent load error: {exc}")
+     _register_agent("/math", math_agent)
+
+     if code_class is None:
+         code_agent = _StubAgent("/code", "Code agent not yet implemented.")
+     else:
+         try:
+             code_agent = code_class()
+         except Exception as exc:
+             AGENT_LOAD_LOG.append(f"CodeAgent instantiation failed: {exc}")
+             code_agent = _StubAgent("/code", f"Code agent load error: {exc}")
+     _register_agent("/code", code_agent)
+
+     if general_class is None:
+         general_agent = _StubAgent("/general-search", "General-search agent not yet implemented.")
+     else:
+         try:
+             general_agent = general_class()
+         except Exception as exc:
+             AGENT_LOAD_LOG.append(f"GeneralSearchAgent instantiation failed: {exc}")
+             general_agent = _StubAgent("/general-search", f"General agent load error: {exc}")
+     _register_agent("/general-search", general_agent)
+ else:
+     AGENT_LOAD_LOG.append("AgentRequest/AgentResult undefined; skipping agent registry.")
+
+
+ AGENT_STATUS_MARKDOWN = (
+     "\n".join(f"- {line}" for line in AGENT_LOAD_LOG) if AGENT_LOAD_LOG else "- Agent stubs loaded successfully."
+ )
+
+
+ def load_sample_plan() -> Dict[str, Any]:
+     try:
+         if BENCH_GOLD_PATH.exists():
+             first_line = BENCH_GOLD_PATH.read_text().splitlines()[0]
+             record = json.loads(first_line)
+             completion = json.loads(record["completion"])
+             return completion
+     except Exception:
+         pass
+     # Fallback minimal example.
+     return {
+         "route_plan": [
+             "/general-search(query=\"site:arxiv.org meta-learning survey\", mode=web)",
+             "/math(Outline a theoretical summary of Model-Agnostic Meta-Learning (MAML) and explain the inner/outer-loop updates.)",
+             "/code(Implement a minimal MAML pseudo-code example to clarify the algorithm flow., using Python)",
+         ],
+         "route_rationale": (
+             "Search surfaces authoritative meta-learning references; "
+             "math distills the theory; code converts the derivation into an executable sketch."
+         ),
+         "expected_artifacts": [
+             "Three bullet summary of seminal MAML papers.",
+             "Equation block describing the meta-gradient.",
+             "`maml_pseudocode.py` script with comments.",
+         ],
+         "thinking_outline": [
+             "1. Gather citations describing MAML.",
+             "2. Express the loss formulation and gradient steps.",
+             "3. Provide annotated pseudo-code for the inner/outer loop.",
+         ],
+         "handoff_plan": "/general-search -> /math -> /code -> router QA",
+         "todo_list": [
+             "- [ ] /general-search: Collect recent survey or benchmark sources for MAML.",
+             "- [ ] /math: Write the meta-objective and gradient derivation.",
+             "- [ ] /code: Produce pseudo-code and comment on hyperparameters.",
+             "- [ ] router QA: Ensure JSON schema compliance and cite sources.",
+         ],
+         "difficulty": "intermediate",
+         "tags": ["meta-learning", "few-shot-learning"],
+         "acceptance_criteria": [
+             "- Includes at least two citations to reputable sources.",
+             "- Meta-gradient expression matches the pseudo-code implementation.",
+             "- JSON validates against the router schema.",
+         ],
+         "metrics": {
+             "primary": ["Route accuracy >= 0.8 on benchmark."],
+             "secondary": ["Report token count and inference latency."],
+         },
+     }
+
+
+ SAMPLE_PLAN = load_sample_plan()
+
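+ # Extracts the leading tool token from a route step, e.g.
+ # TOOL_REGEX.match('/math(Derive the meta-gradient.)').group(1) == "/math".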
+ TOOL_REGEX = re.compile(r"^\s*(/[a-zA-Z0-9_-]+)")
+
+
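+ # extract_json_from_text takes the outermost "{...}" span of the generation, so it
+ # tolerates prose before/after the JSON but not multiple top-level objects.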
+ def extract_json_from_text(raw_text: str) -> Dict[str, Any]:
+     try:
+         start = raw_text.index("{")
+         end = raw_text.rfind("}")
+         candidate = raw_text[start : end + 1]
+         return json.loads(candidate)
+     except Exception as exc:
+         raise ValueError(f"Router output is not valid JSON: {exc}") from exc
+
+
+ def call_router_model(user_query: str) -> Dict[str, Any]:
+     if client is None:
+         return SAMPLE_PLAN
+
+     prompt = f"{SYSTEM_PROMPT}\n\nUser query:\n{user_query.strip()}\n"
+     try:
+         raw = client.text_generation(
+             prompt,
+             max_new_tokens=900,
+             temperature=0.2,
+             top_p=0.9,
+             repetition_penalty=1.05,
+         )
+         return extract_json_from_text(raw)
+     except Exception as exc:  # pragma: no cover
+         return {
+             "error": f"Router call failed ({exc}). Falling back to sample plan.",
+             "sample_plan": SAMPLE_PLAN,
+         }
+
+
+ def generate_plan(user_query: str) -> Dict[str, Any]:
+     if not user_query.strip():
+         raise gr.Error("Please provide a user query to route.")
+     plan = call_router_model(user_query)
+     return plan
+
+
+ def generate_plan_and_store(user_query: str) -> tuple[Dict[str, Any], str]:
+     plan = generate_plan(user_query)
+     return plan, user_query
+
+
+ def _resolve_plan_object(plan_input: Any) -> Optional[Dict[str, Any]]:
+     plan_obj: Optional[Dict[str, Any]]
+     if isinstance(plan_input, str):
+         try:
+             plan_obj = json.loads(plan_input)
+         except json.JSONDecodeError:
+             return None
+     elif isinstance(plan_input, dict):
+         plan_obj = plan_input
+     else:
+         return None
+     if "route_plan" not in plan_obj and isinstance(plan_obj.get("sample_plan"), dict):
+         plan_obj = plan_obj["sample_plan"]
+     return plan_obj if isinstance(plan_obj, dict) else None
+
+
+ def execute_plan(plan_input: Any, original_query: str) -> Dict[str, Any]:
+     if AgentRequest is None or AgentResult is None:
+         return {"success": False, "error": "Agent interfaces unavailable; cannot execute plan."}
+     plan_obj = _resolve_plan_object(plan_input)
+     if not plan_obj:
+         return {"success": False, "error": "Plan must be valid JSON with a route_plan field."}
+     route_plan = plan_obj.get("route_plan")
+     if not isinstance(route_plan, list):
+         return {"success": False, "error": "Plan is missing a route_plan list."}
+
+     results: List[Dict[str, Any]] = []
+     for step_index, step in enumerate(route_plan):
+         if not isinstance(step, str):
+             results.append(
+                 {
+                     "step_index": step_index,
+                     "status": "invalid_step",
+                     "message": "Route step must be a string.",
+                 }
+             )
+             continue
+         match = TOOL_REGEX.match(step)
+         tool_name = match.group(1) if match else "unknown"
+         agent = AGENT_REGISTRY.get(tool_name) or AGENT_REGISTRY.get(tool_name.lstrip("/"))
+         if agent is None:
+             results.append(
+                 {
+                     "step_index": step_index,
+                     "tool": tool_name,
+                     "status": "skipped",
+                     "message": "No agent registered for this tool.",
+                 }
+             )
+             continue
+
+         request = AgentRequest(
+             user_query=step,
+             context={"original_query": original_query},
+             plan_metadata={"step_index": step_index, "raw_step": step},
+         )
+         # Failures stay isolated per step: an agent exception triggers the Gemini
+         # fallback when available; otherwise an error entry is recorded and the
+         # loop moves on to the next route step.
+         try:
+             agent_result = agent.invoke(request)
+         except Exception as exc:
+             if fallback_manager.available:
+                 try:
+                     agent_result = fallback_manager.generate(tool_name, request, error=str(exc))
+                 except Exception as fallback_exc:  # pragma: no cover
+                     results.append(
+                         {
+                             "step_index": step_index,
+                             "tool": tool_name,
+                             "status": "error",
+                             "message": f"{exc}; fallback failed: {fallback_exc}",
+                         }
+                     )
+                     continue
+             else:
+                 results.append(
+                     {
+                         "step_index": step_index,
+                         "tool": tool_name,
+                         "status": "error",
+                         "message": str(exc),
+                     }
+                 )
+                 continue
+         results.append(
+             {
+                 "step_index": step_index,
+                 "tool": tool_name,
+                 "content": getattr(agent_result, "content", ""),
+                 "citations": getattr(agent_result, "citations", []),
+                 "artifacts": getattr(agent_result, "artifacts", []),
+                 "metrics": getattr(agent_result, "metrics", {}),
+             }
+         )
+     return {"success": True, "results": results}
+
+
+ def run_startup_benchmark() -> Dict[str, Any]:
+     if run_schema_evaluation is None or load_thresholds is None or evaluate_thresholds is None:
+         return {"status": "unavailable", "message": "Benchmark utilities not available in this environment."}
+     prediction_path = os.environ.get("ROUTER_BENCHMARK_PREDICTIONS")
+     if not prediction_path:
+         return {"status": "skipped", "message": "Set ROUTER_BENCHMARK_PREDICTIONS to auto-run benchmarks."}
+     pred_path = Path(prediction_path)
+     if not pred_path.exists():
+         return {"status": "error", "message": f"Predictions file not found: {pred_path}"}
+     if not BENCH_GOLD_PATH.exists() or not THRESHOLDS_PATH.exists():
+         return {"status": "error", "message": "Benchmark gold or thresholds file missing."}
+     try:
+         schema_report = run_schema_evaluation(
+             str(BENCH_GOLD_PATH),
+             str(pred_path),
+             max_error_examples=5,
+         )
+         thresholds = load_thresholds(THRESHOLDS_PATH)
+         threshold_results = evaluate_thresholds(schema_report["metrics"], thresholds)
+     except Exception as exc:
+         return {"status": "error", "message": f"Benchmark run failed: {exc}"}
+     status = "pass" if threshold_results.get("overall_pass") else "fail"
+     return {
+         "status": status,
+         "message": f"Benchmark {status.upper()} on startup.",
+         "report": {
+             "schema_report": schema_report,
+             "threshold_results": threshold_results,
+         },
+         "predictions_path": str(pred_path),
+     }
+
+
+ # Run the startup benchmark once at import time; the call must follow the
+ # definition above so the module loads without a NameError.
+ STARTUP_BENCHMARK_RESULT = run_startup_benchmark()
+
+
+ def compute_structural_metrics(plan: Dict[str, Any]) -> Dict[str, Any]:
+     metrics: Dict[str, Any] = {}
+     route_plan = plan.get("route_plan", [])
+     if tool_sequence is not None and isinstance(route_plan, list):
+         tools = tool_sequence(route_plan)
+         todo_list = plan.get("todo_list", []) if isinstance(plan.get("todo_list"), list) else []
+         if todo_tool_alignment is not None:
+             metrics["todo_tool_alignment"] = todo_tool_alignment(todo_list, tools)
+         if todo_covers_all_tools is not None:
+             metrics["todo_covers_all_tools"] = todo_covers_all_tools(todo_list, tools)
+         handoff = plan.get("handoff_plan", "")
+         metrics["handoff_mentions_all_tools"] = all(
+             tool.lower() in (handoff or "").lower() for tool in tools
+         )
+     metrics["expected_artifacts_count"] = len(plan.get("expected_artifacts", []) or [])
+     metrics["acceptance_criteria_count"] = len(plan.get("acceptance_criteria", []) or [])
+     return metrics
+
+
+ def validate_plan(plan_input: Any) -> Dict[str, Any]:
+     if isinstance(plan_input, str):
+         try:
+             plan = json.loads(plan_input)
+         except json.JSONDecodeError as exc:
+             return {"valid": False, "errors": [f"Invalid JSON: {exc}"]}
+     else:
+         plan = plan_input or {}
+     errors = []
+     required_keys = [
+         "route_plan",
+         "route_rationale",
+         "expected_artifacts",
+         "thinking_outline",
+         "handoff_plan",
+         "todo_list",
+         "difficulty",
+         "tags",
+         "acceptance_criteria",
+         "metrics",
+     ]
+     for key in required_keys:
+         if key not in plan:
+             errors.append(f"Missing required field: {key}")
+     route_plan = plan.get("route_plan")
+     if not isinstance(route_plan, list) or not route_plan:
+         errors.append("route_plan must be a non-empty list of tool invocations.")
+     else:
+         for step in route_plan:
+             if not isinstance(step, str):
+                 errors.append("Each route_plan entry must be a string.")
+                 break
+     todo_list = plan.get("todo_list")
+     if todo_list is not None and not isinstance(todo_list, list):
+         errors.append("todo_list must be a list of strings.")
+     metrics_block = plan.get("metrics")
+     if metrics_block is not None and not isinstance(metrics_block, dict):
+         errors.append("metrics must be a dictionary with primary/secondary lists.")
+
+     structural = compute_structural_metrics(plan)
+
+     return {
+         "valid": len(errors) == 0,
+         "errors": errors,
+         "structural_metrics": structural,
+         "tool_count": len(route_plan) if isinstance(route_plan, list) else 0,
+     }
+
+
+ def benchmark_predictions(pred_file: Any) -> Dict[str, Any]:
+     if run_schema_evaluation is None or load_thresholds is None or evaluate_thresholds is None:
+         return {
+             "success": False,
+             "error": "Benchmark utilities are unavailable.",
+             "schema_import_error": SCHEMA_IMPORT_ERROR,
+             "threshold_import_error": THRESHOLD_IMPORT_ERROR,
+         }
+     if not BENCH_GOLD_PATH.exists():
+         return {
+             "success": False,
+             "error": f"Benchmark gold file missing: {BENCH_GOLD_PATH}",
+         }
+     if not THRESHOLDS_PATH.exists():
+         return {
+             "success": False,
+             "error": f"Thresholds file missing: {THRESHOLDS_PATH}",
+         }
+
+     if pred_file is None:
+         return {"success": False, "error": "Upload a .jsonl predictions file first."}
+
+     if hasattr(pred_file, "name"):
+         pred_path = Path(pred_file.name)
+     elif isinstance(pred_file, str):
+         pred_path = Path(pred_file)
+     else:
+         # Save uploaded bytes to a temp file.
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl") as tmp:
+             tmp.write(pred_file.read())
+         pred_path = Path(tmp.name)
+
+     try:
+         schema_report = run_schema_evaluation(
+             str(BENCH_GOLD_PATH),
+             str(pred_path),
+             max_error_examples=10,
+         )
+     except Exception as exc:
+         return {"success": False, "error": f"Schema evaluation failed: {exc}"}
+
+     try:
+         thresholds = load_thresholds(THRESHOLDS_PATH)
+         threshold_results = evaluate_thresholds(schema_report["metrics"], thresholds)
+     except Exception as exc:
+         return {"success": False, "error": f"Threshold comparison failed: {exc}"}
+
+     return {
+         "success": True,
+         "overall_pass": threshold_results.get("overall_pass"),
+         "schema_metrics": schema_report["metrics"],
+         "threshold_results": threshold_results,
+         "error_samples": schema_report.get("error_samples", []),
+     }
+
+
+ def describe_router_backend() -> str:
+     if client is None:
+         return f"Router backend not initialised. {ROUTER_LOAD_ERROR}"
+     return f"Using Hugging Face Inference endpoint: `{HF_ROUTER_REPO}`"
+
+
+ with gr.Blocks(title="CourseGPT Router Control Room") as demo:
+     gr.Markdown(
+         "## CourseGPT Router Control Room\n"
+         "Milestone 6 deployment scaffold for the router agent. Populate the router model "
+         "environment variables to enable live inference, or rely on the bundled sample plan."
+     )
+
+     gr.Markdown(f"**Backend status:** {describe_router_backend()}")
+
+     with gr.Tab("Router Planner"):
+         user_query_state = gr.State("")
+         user_query = gr.Textbox(
+             label="User query",
+             lines=8,
+             placeholder="Describe the task that needs routing...",
+         )
+         generate_btn = gr.Button("Generate plan", variant="primary")
+         plan_output = gr.JSON(label="Router plan")
+         generate_btn.click(
+             fn=generate_plan_and_store,
+             inputs=user_query,
+             outputs=[plan_output, user_query_state],
+         )
+
+         validate_btn = gr.Button("Run structural checks")
+         validation_output = gr.JSON(label="Validation summary")
+         validate_btn.click(fn=validate_plan, inputs=plan_output, outputs=validation_output)
+
+         execute_btn = gr.Button("Simulate agent execution")
+         execution_output = gr.JSON(label="Agent execution log")
+         execute_btn.click(
+             fn=execute_plan,
+             inputs=[plan_output, user_query_state],
+             outputs=execution_output,
+         )
+
+     with gr.Tab("Benchmark"):
+         gr.Markdown(
+             "Upload a JSONL file of router predictions (one JSON object per line). "
+             "The file must align with the `router_benchmark_hard.jsonl` gold split."
+         )
+         startup_status = STARTUP_BENCHMARK_RESULT.get("message", "Benchmark not run.")
+         gr.Markdown(f"**Startup benchmark status:** {startup_status}")
+         if STARTUP_BENCHMARK_RESULT.get("report"):
+             gr.JSON(
+                 value=STARTUP_BENCHMARK_RESULT["report"],
+                 label="Startup benchmark report",
+             )
+         predictions_file = gr.File(label="Predictions (.jsonl)", file_types=[".jsonl"])
+         benchmark_btn = gr.Button("Evaluate against thresholds", variant="primary")
+         benchmark_output = gr.JSON(label="Benchmark report")
+         benchmark_btn.click(fn=benchmark_predictions, inputs=predictions_file, outputs=benchmark_output)
+
+     with gr.Tab("Docs & TODO"):
+         gr.Markdown(
+             "- Populate `/math`, `/code`, `/general-search` agent hooks for live orchestration.\n"
+             "- Add citations and latency logging once the production router is connected.\n"
+             "- Link to Milestone 5 benchmark reports and final project documentation."
+         )
+         gr.Markdown("**Agent load summary:**\n" + AGENT_STATUS_MARKDOWN)
+
+ demo.queue()
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio>=4.36.1
+ huggingface_hub>=0.24.5
+ orjson>=3.10.7
+ google-generativeai>=0.5.2
space_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "title": "CourseGPT Router Control Room",
+   "emoji": "🧭",
+   "colorFrom": "blue",
+   "colorTo": "purple",
+   "sdk": "gradio",
+   "sdk_version": "4.36",
+   "python_version": "3.11",
+   "app_file": "app.py",
+   "pinned": false,
+   "license": "apache-2.0",
+   "short_description": "Milestone 6 router deployment scaffold with built-in benchmarking."
+ }