Update app.py

app.py CHANGED

@@ -127,6 +127,47 @@ def index_from_url(url: str) -> Tuple[str, str]:
     return status, local_path
 
 
+def query_gpt(query: str, retrieved_images: list[tuple[Image.Image, str]]) -> str:
+    """Calls OpenAI's GPT model with the query and image data."""
+    if api_key and api_key.startswith("sk"):
+        try:
+            from openai import OpenAI
+
+            base64_images = [encode_image_to_base64(im_caption[0]) for im_caption in retrieved_images]
+            client = OpenAI(api_key=api_key.strip())
+            PROMPT = """
+You are a smart assistant designed to answer questions about a PDF document.
+You are given relevant information in the form of PDF pages. Use them to construct a short response to the question, and cite your sources (page numbers, etc).
+If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
+Give detailed and extensive answers, only containing info in the pages you are given.
+You can answer using information contained in plots and figures if necessary.
+Answer in the same language as the query.
+Query: {query}
+PDF pages:
+""".strip()
+
+            response = client.responses.create(
+                model="gpt-5",
+                input=[
+                    {
+                        "role": "user",
+                        "content": (
+                            [{"type": "input_text", "text": PROMPT.format(query=query)}] +
+                            [{"type": "input_image",
+                              "image_url": f"data:image/jpeg;base64,{im}"}
+                             for im in base64_images]
+                        )
+                    }
+                ],
+                # max_tokens=500,
+            )
+            return response.output_text
+        except Exception as e:
+            print(e)
+            return "OpenAI API connection failure. Verify that OPENAI_API_KEY is set and valid (sk-***)."
+    return "Set OPENAI_API_KEY in your environment to get a custom response."
+
+
 # =============================
 # Local Search (ColPali)
 # =============================
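Note: encode_image_to_base64 is called above but defined elsewhere in app.py, outside this diff. A minimal sketch of what such a helper presumably looks like (hypothetical reconstruction, assuming PIL page images and matching the data:image/jpeg;base64 URLs built in query_gpt):

import base64
import io

from PIL import Image


def encode_image_to_base64(image: Image.Image) -> str:
    # Serialize a PIL page image to raw base64 (no data-URL prefix);
    # query_gpt prepends "data:image/jpeg;base64," itself.
    buffer = io.BytesIO()
    image.convert("RGB").save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
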
@@ -169,6 +210,45 @@ def search(query: str, k: int = 5) -> List[int]:
     return top_k_indices
 
 
+def search_synthetize(query: str, k: int = 5) -> str:
+    """
+    Searches a PDF document for the pages most relevant to a query and synthesizes a short grounded answer using only those pages.
+    MCP tool description:
+    - name: mcp_test_search_synthetize
+    - description: Searches a PDF document for the pages most relevant to a query and synthesizes a short grounded answer using only those pages.
+    - input_schema:
+        type: object
+        properties:
+            query: {type: string, description: "User query in natural language."}
+            k: {type: integer, minimum: 1, maximum: 20, default: 5, description: "Number of top pages to retrieve."}
+        required: ["query"]
+    Args:
+        query (str): Natural-language question to search for.
+        k (int): Number of top results to return (1–20).
+    Returns:
+        ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
+    """
+    top_k_indices = search(query, k)
+
+    expanded = set(top_k_indices)
+    for i in top_k_indices:
+        expanded.add(i - 1)
+        expanded.add(i + 1)
+    expanded = {i for i in expanded if 0 <= i < len(images)}
+    expanded = sorted(expanded)
+
+    # Build gallery results with 1-based page numbering
+    results = []
+    for idx in expanded:
+        page_num = idx + 1
+        results.append((images[idx], f"Page {page_num}"))
+
+    # Generate grounded response
+    ai_response = query_gpt(query, results)
+    print("[search_synthetize]", ai_response)
+    return ai_response
+
+
 def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
     """Turn page indices into OpenAI vision content parts."""
     parts: List[Dict[str, Any]] = []
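The mcp_test_* tool names above correspond to functions this Space publishes through Gradio's MCP server at /gradio_api/mcp/. A minimal sketch of how a function like search_synthetize could be exposed (an assumption for illustration; the actual app wiring sits outside this diff and requires a Gradio version with MCP support):

import gradio as gr

# The function's type hints and docstring (including the schema block above)
# become the MCP tool's input schema and description.
demo = gr.Interface(
    fn=search_synthetize,
    inputs=[gr.Textbox(label="query"), gr.Number(label="k", value=5, precision=0)],
    outputs=gr.Textbox(label="ai_response"),
)

# mcp_server=True serves the MCP endpoint alongside the normal web UI.
demo.launch(mcp_server=True)
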
@@ -186,7 +266,9 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
 # Agent System Prompt
 # =============================
 
-SYSTEM = (
+
+
+SYSTEM1 = (
 """
 You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
 Act iteratively:
@@ -205,12 +287,31 @@ Deliverable:
 ).strip()
 
 
+SYSTEM2 = """
+You are a PDF research agent with a single tool: mcp_test_search_synthetize(query: string, k: int).
+Act iteratively:
+1) Split the user question into 1–4 focused sub-queries, phrased as natural-language English questions rather than bare keywords.
+2) For each sub-query, call mcp_test_search_synthetize (k=5 by default; increase up to 20 if you need to go deep).
+3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If information is missing, keep searching with new keywords and queries.
+
+Grounding & citations:
+• Use ONLY information from retrieved pages.
+• After any claim, cite the page as (p.<page>).
+• If an answer is not present, say “Not found in the provided pages.”
+
+Final deliverable (must be clear and standalone):
+• Write a detailed answer in Markdown that directly addresses the user request, in the language of the request.
+• If dates or items are requested, include a concise table with the requested fields.
+• Do not refer to “the above” or “previous messages”.
+"""
+
+
 # =============================
 # MCP config (search-only)
 # =============================
+VISUAL_REASONING = True
 DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
 DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
-DEFAULT_ALLOWED_TOOLS = "mcp_test_search"  # search-only; no get_pages
 
 
 # =============================
@@ -222,8 +323,7 @@ def stream_agent(question: str,
                  model_name: str,
                  server_url: str,
                  server_label: str,
-                 require_approval: str,
-                 allowed_tools: str):
+                 visual_reasoning: str):
     """
     Multi-round streaming:
       • Seed: optional local ColPali search on the user question to attach initial pages.
@@ -231,6 +331,10 @@ def stream_agent(question: str,
       • If the model calls mcp_test_search and returns indices, we end the stream and
         start a NEW API call with previous_response_id + the requested pages attached.
     """
+    visual_reasoning = True if visual_reasoning == "Visual Reasoning" else False
+    allowed_tools = "mcp_test_search" if visual_reasoning else "mcp_test_search_synthetize"
+    SYSTEM = SYSTEM1 if visual_reasoning else SYSTEM2
+
     if not api_key:
         yield "⚠️ **Please provide your OpenAI API key.**", "", ""
         return
@@ -243,7 +347,7 @@ def stream_agent(question: str,
 
     # Optional seeding: attach some likely pages on round 1
     try:
-        seed_indices = search(question, k=5)
+        seed_indices = [] if visual_reasoning is False else search(question, k=5)
     except Exception as e:
         yield f"❌ Search failed: {e}", "", ""
         return
@@ -256,8 +360,8 @@ def stream_agent(question: str,
         "type": "mcp",
         "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
         "server_url": server_url or DEFAULT_MCP_SERVER_URL,
-        "allowed_tools": [
-        "require_approval":
+        "allowed_tools": [allowed_tools],
+        "require_approval": "never",
     }]
 
     # Shared mutable state for each round
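For reference, the tools block above is handed to the OpenAI Responses API, which lets the model invoke the remote MCP tool on its own. A minimal standalone sketch of that call shape (hedged: the question is a placeholder, and the server values are the defaults from this file):

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.responses.create(
    model="gpt-5",
    input="What does the document say about ColPali?",
    tools=[{
        "type": "mcp",
        "server_label": "colpali_rag",
        "server_url": "https://manu-mcp-test.hf.space/gradio_api/mcp/",
        "allowed_tools": ["mcp_test_search"],
        "require_approval": "never",  # no per-call human approval
    }],
)
print(response.output_text)
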
@@ -282,7 +386,7 @@ def stream_agent(question: str,
             if round_idx == 1:
                 parts.append({"type": "input_text", "text": question})
             else:
-                parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you
+                parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})
 
             parts += _build_image_parts_from_indices(attached_indices)
             if attached_indices:
@@ -392,7 +496,7 @@ def stream_agent(question: str,
                     expanded.add(i - 1)
                     expanded.add(i + 1)
                 expanded = {i for i in expanded if 0 <= i < len(images)}
-                pending_indices = sorted(expanded)
+                pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
                 round_idx += 1
                 continue
 
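A small worked example of the expansion-with-cap logic in this hunk (standalone, hypothetical values): each hit is padded with its neighbors, clipped to the document bounds, and the padded set is used only while it stays under 15 pages; otherwise the raw hits are kept.

base = [3, 7]      # raw top-k hits (0-based page indices)
num_pages = 10     # stand-in for len(images)

expanded = set(base)
for i in base:
    expanded.add(i - 1)
    expanded.add(i + 1)
expanded = {i for i in expanded if 0 <= i < num_pages}

pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
print(pending_indices)  # [2, 3, 4, 6, 7, 8]
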
@@ -565,14 +669,10 @@ def build_ui():
                     value=DEFAULT_MCP_SERVER_LABEL,
                 )
                 with gr.Row():
-                    allowed_tools_box = gr.Textbox(
-                        label="
-
-
-                    require_approval_box = gr.Dropdown(
-                        label="Require Approval",
-                        choices=["never", "auto", "always"],
-                        value="never",
+                    visual_reasoning_box = gr.Dropdown(
+                        label="Visual Reasoning",
+                        choices=["Visual Reasoning", "Vision Summary"],
+                        value="Visual Reasoning",
                     )
 
             with gr.Column(scale=3):
@@ -593,6 +693,5 @@ def build_ui():
                 server_label_box,
-                require_approval_box,
-                allowed_tools_box,
+                visual_reasoning_box,
             ],
             outputs=[final_md, summary_md, log_md],
         )
|
|
| 127 |
return status, local_path
|
| 128 |
|
| 129 |
|
| 130 |
+
def query_gpt(query: str, retrieved_images: list[tuple[Image.Image, str]]) -> str:
|
| 131 |
+
"""Calls OpenAI's GPT model with the query and image data."""
|
| 132 |
+
if api_key and api_key.startswith("sk"):
|
| 133 |
+
try:
|
| 134 |
+
from openai import OpenAI
|
| 135 |
+
|
| 136 |
+
base64_images = [encode_image_to_base64(im_caption[0]) for im_caption in retrieved_images]
|
| 137 |
+
client = OpenAI(api_key=api_key.strip())
|
| 138 |
+
PROMPT = """
|
| 139 |
+
You are a smart assistant designed to answer questions about a PDF document.
|
| 140 |
+
You are given relevant information in the form of PDF pages. Use them to construct a short response to the question, and cite your sources (page numbers, etc).
|
| 141 |
+
If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
|
| 142 |
+
Give detailed and extensive answers, only containing info in the pages you are given.
|
| 143 |
+
You can answer using information contained in plots and figures if necessary.
|
| 144 |
+
Answer in the same language as the query.
|
| 145 |
+
Query: {query}
|
| 146 |
+
PDF pages:
|
| 147 |
+
""".strip()
|
| 148 |
+
|
| 149 |
+
response = client.responses.create(
|
| 150 |
+
model="gpt-5",
|
| 151 |
+
input=[
|
| 152 |
+
{
|
| 153 |
+
"role": "user",
|
| 154 |
+
"content": (
|
| 155 |
+
[{"type": "input_text", "text": PROMPT.format(query=query)}] +
|
| 156 |
+
[{"type": "input_image",
|
| 157 |
+
"image_url": f"data:image/jpeg;base64,{im}"}
|
| 158 |
+
for im in base64_images]
|
| 159 |
+
)
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
# max_tokens=500,
|
| 163 |
+
)
|
| 164 |
+
return response.output_text
|
| 165 |
+
except Exception as e:
|
| 166 |
+
print(e)
|
| 167 |
+
return "OpenAI API connection failure. Verify that OPENAI_API_KEY is set and valid (sk-***)."
|
| 168 |
+
return "Set OPENAI_API_KEY in your environment to get a custom response."
|
| 169 |
+
|
| 170 |
+
|
| 171 |
# =============================
|
| 172 |
# Local Search (ColPali)
|
| 173 |
# =============================
|
|
|
|
| 210 |
return top_k_indices
|
| 211 |
|
| 212 |
|
| 213 |
+
def search_synthetize(query: str, k: int = 5) -> List[int]:
|
| 214 |
+
"""
|
| 215 |
+
Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
|
| 216 |
+
MCP tool description:
|
| 217 |
+
- name: mcp_test_search
|
| 218 |
+
- description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
|
| 219 |
+
- input_schema:
|
| 220 |
+
type: object
|
| 221 |
+
properties:
|
| 222 |
+
query: {type: string, description: "User query in natural language."}
|
| 223 |
+
k: {type: integer, minimum: 1, maximum: 20, default: 5. description: "Number of top pages to retrieve."}
|
| 224 |
+
required: ["query"]
|
| 225 |
+
Args:
|
| 226 |
+
query (str): Natural-language question to search for.
|
| 227 |
+
k (int): Number of top results to return (1–10).
|
| 228 |
+
Returns:
|
| 229 |
+
ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
|
| 230 |
+
"""
|
| 231 |
+
top_k_indices = search(query, k)
|
| 232 |
+
|
| 233 |
+
expanded = set(top_k_indices)
|
| 234 |
+
for i in base:
|
| 235 |
+
expanded.add(i - 1)
|
| 236 |
+
expanded.add(i + 1)
|
| 237 |
+
expanded = {i for i in expanded if 0 <= i < len(images)}
|
| 238 |
+
expanded = sorted(expanded)
|
| 239 |
+
|
| 240 |
+
# Build gallery results with 1-based page numbering
|
| 241 |
+
results = []
|
| 242 |
+
for idx in expanded:
|
| 243 |
+
page_num = idx + 1
|
| 244 |
+
results.append((images[idx], f"Page {page_num}"))
|
| 245 |
+
|
| 246 |
+
# Generate grounded response
|
| 247 |
+
ai_response = query_gpt(query, results)
|
| 248 |
+
print("[search_synthetize]", ai_response)
|
| 249 |
+
return ai_response
|
| 250 |
+
|
| 251 |
+
|
| 252 |
def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
|
| 253 |
"""Turn page indices into OpenAI vision content parts."""
|
| 254 |
parts: List[Dict[str, Any]] = []
|
|
|
|
| 266 |
# Agent System Prompt
|
| 267 |
# =============================
|
| 268 |
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
SYSTEM1 = (
|
| 272 |
"""
|
| 273 |
You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
|
| 274 |
Act iteratively:
|
|
|
|
| 287 |
).strip()
|
| 288 |
|
| 289 |
|
| 290 |
+
SYSTEM2 = """
|
| 291 |
+
You are a PDF research agent with a single tool: mcp_test_search_synthetize(query: string, k: int).
|
| 292 |
+
Act iteratively:
|
| 293 |
+
1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions in the english language, not just keywords.
|
| 294 |
+
2) For each sub-query, call mcp_test_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
|
| 295 |
+
3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
|
| 296 |
+
|
| 297 |
+
Grounding & citations:
|
| 298 |
+
• Use ONLY information from retrieved pages.
|
| 299 |
+
• After any claim, cite the page as (p.<page>).
|
| 300 |
+
• If an answer is not present, say “Not found in the provided pages.”
|
| 301 |
+
|
| 302 |
+
Final deliverable (must be clear and standalone):
|
| 303 |
+
• Write a detailed answer in Markdown that directly addresses the user request in the request language.
|
| 304 |
+
• If dates or items are requested, include a concise table with the requested fields.
|
| 305 |
+
• Do not refer to “the above” or “previous messages”.
|
| 306 |
+
"""
|
| 307 |
+
|
| 308 |
+
|
| 309 |
# =============================
|
| 310 |
# MCP config (search-only)
|
| 311 |
# =============================
|
| 312 |
+
VISUAL_REASONING = True
|
| 313 |
DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
|
| 314 |
DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
|
|
|
|
| 315 |
|
| 316 |
|
| 317 |
# =============================
|
|
|
|
| 323 |
model_name: str,
|
| 324 |
server_url: str,
|
| 325 |
server_label: str,
|
| 326 |
+
visual_reasoning: str):
|
|
|
|
| 327 |
"""
|
| 328 |
Multi-round streaming:
|
| 329 |
• Seed: optional local ColPali search on the user question to attach initial pages.
|
|
|
|
| 331 |
• If the model calls mcp_test_search and returns indices, we end the stream and
|
| 332 |
start a NEW API call with previous_response_id + the requested pages attached.
|
| 333 |
"""
|
| 334 |
+
visual_reasoning = True if visual_reasoning=="Visual Reasoning" else False
|
| 335 |
+
allowed_tools = "mcp_test_search" if visual_reasoning else "mcp_test_search_synthetize"
|
| 336 |
+
SYSTEM= SYSTEM1 if visual_reasoning else SYSTEM2
|
| 337 |
+
|
| 338 |
if not api_key:
|
| 339 |
yield "⚠️ **Please provide your OpenAI API key.**", "", ""
|
| 340 |
return
|
|
|
|
| 347 |
|
| 348 |
# Optional seeding: attach some likely pages on round 1
|
| 349 |
try:
|
| 350 |
+
seed_indices = [] if visual_reasoning is False else search(question, k=5)
|
| 351 |
except Exception as e:
|
| 352 |
yield f"❌ Search failed: {e}", "", ""
|
| 353 |
return
|
|
|
|
| 360 |
"type": "mcp",
|
| 361 |
"server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
|
| 362 |
"server_url": server_url or DEFAULT_MCP_SERVER_URL,
|
| 363 |
+
"allowed_tools": [allowed_tools],
|
| 364 |
+
"require_approval": "never",
|
| 365 |
}]
|
| 366 |
|
| 367 |
# Shared mutable state for each round
|
|
|
|
| 386 |
if round_idx == 1:
|
| 387 |
parts.append({"type": "input_text", "text": question})
|
| 388 |
else:
|
| 389 |
+
parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})
|
| 390 |
|
| 391 |
parts += _build_image_parts_from_indices(attached_indices)
|
| 392 |
if attached_indices:
|
|
|
|
| 496 |
expanded.add(i - 1)
|
| 497 |
expanded.add(i + 1)
|
| 498 |
expanded = {i for i in expanded if 0 <= i < len(images)}
|
| 499 |
+
pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
|
| 500 |
round_idx += 1
|
| 501 |
continue
|
| 502 |
|
|
|
|
| 669 |
value=DEFAULT_MCP_SERVER_LABEL,
|
| 670 |
)
|
| 671 |
with gr.Row():
|
| 672 |
+
visual_reasoning_box = gr.Dropdown(
|
| 673 |
+
label="Visual Reasoning",
|
| 674 |
+
choices=["Visual Reasoning", "Vision Summary"],
|
| 675 |
+
value="Visual Reasoning",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 676 |
)
|
| 677 |
|
| 678 |
with gr.Column(scale=3):
|
|
|
|
| 693 |
server_label_box,
|
| 694 |
require_approval_box,
|
| 695 |
allowed_tools_box,
|
| 696 |
+
visual_reasoning_box
|
| 697 |
],
|
| 698 |
outputs=[final_md, summary_md, log_md],
|
| 699 |
)
|