ysharma HF Staff committed on
Commit d9e41f8 · verified · 1 Parent(s): c11381b

Update app.py

Files changed (1)
  1. app.py +838 -1
app.py CHANGED
@@ -1,3 +1,609 @@
1
  def convert_hf_space_to_url(space_name: str) -> str:
2
  """
3
  Convert HuggingFace space name to proper URL format.
@@ -102,4 +708,235 @@ def add_custom_server(name: str, space_name: str) -> tuple[str, str]:
102
  error_msg = f"❌ Failed to add server: {str(e)}"
103
  logger.error(error_msg)
104
  logger.error(traceback.format_exc())
105
- return error_msg, ""
1
+ import gradio as gr
2
+ import asyncio
3
+ import json
4
+ import os
5
+ import re
6
+ import base64
7
+ from typing import List, Dict, Any, Optional
8
+ from dataclasses import dataclass
9
+ import anthropic
10
+ from datetime import datetime
11
+ import logging
12
+ import traceback
13
+
14
+ # Import the proper MCP client components
15
+ from mcp import ClientSession
16
+ from mcp.client.sse import sse_client
17
+
18
+ # Optional import for file upload functionality
19
+ try:
20
+ import httpx
21
+ HTTPX_AVAILABLE = True
22
+ except ImportError:
23
+ HTTPX_AVAILABLE = False
24
+ logging.warning("httpx not available - file upload functionality limited")
25
+
26
+ # Set up enhanced logging
27
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
28
+ logger = logging.getLogger(__name__)
29
+
30
+ @dataclass
31
+ class MCPServerConfig:
32
+ name: str
33
+ url: str
34
+ description: str
35
+ space_id: Optional[str] = None
36
+
37
+ class UniversalMCPClient:
38
+ def __init__(self):
39
+ self.servers: Dict[str, MCPServerConfig] = {}
40
+ self.anthropic_client = None
41
+
42
+ # Initialize Anthropic client if API key is available
43
+ if os.getenv("ANTHROPIC_API_KEY"):
44
+ self.anthropic_client = anthropic.Anthropic(
45
+ api_key=os.getenv("ANTHROPIC_API_KEY")
46
+ )
47
+ logger.info("βœ… Anthropic client initialized")
48
+ else:
49
+ logger.warning("⚠️ ANTHROPIC_API_KEY not found")
50
+
51
+ async def add_server_async(self, config: MCPServerConfig) -> tuple[bool, str]:
52
+ """Add an MCP server using pure MCP protocol"""
53
+ try:
54
+ logger.info(f"πŸ”§ Adding MCP server: {config.name} at {config.url}")
55
+
56
+ # Clean and validate URL - handle various input formats
57
+ original_url = config.url.strip()
58
+
59
+ # Remove common MCP endpoint variations
60
+ base_url = original_url
61
+ for endpoint in ["/gradio_api/mcp/sse", "/gradio_api/mcp/", "/gradio_api/mcp"]:
62
+ if base_url.endswith(endpoint):
63
+ base_url = base_url[:-len(endpoint)]
64
+ break
65
+
66
+ # Remove trailing slashes
67
+ base_url = base_url.rstrip("/")
68
+
69
+ # Construct proper MCP URL
70
+ mcp_url = f"{base_url}/gradio_api/mcp/sse"
71
+
72
+ logger.info(f"πŸ”§ Original URL: {original_url}")
73
+ logger.info(f"πŸ”§ Base URL: {base_url}")
74
+ logger.info(f"πŸ”§ MCP URL: {mcp_url}")
75
+
76
+ # Extract space ID if it's a HuggingFace space
77
+ if "hf.space" in base_url:
78
+ space_parts = base_url.split("/")
79
+ if len(space_parts) >= 1:
80
+ space_id = space_parts[-1].replace('.hf.space', '').replace('https://', '').replace('http://', '')
81
+ if '-' in space_id:
82
+ # Format: username-spacename.hf.space
83
+ config.space_id = space_id.replace('-', '/', 1)
84
+ else:
85
+ config.space_id = space_id
86
+ logger.info(f"πŸ“ Detected HF Space ID: {config.space_id}")
87
+
88
+ # Update config with proper MCP URL
89
+ config.url = mcp_url
90
+
91
+ # Test MCP connection
92
+ success, message = await self._test_mcp_connection(config)
93
+
94
+ if success:
95
+ self.servers[config.name] = config
96
+ logger.info(f"βœ… MCP Server {config.name} added successfully")
97
+ return True, f"βœ… Successfully added MCP server: {config.name}\n{message}"
98
+ else:
99
+ logger.error(f"❌ Failed to connect to MCP server {config.name}: {message}")
100
+ return False, f"❌ Failed to add server: {config.name}\n{message}"
101
+
102
+ except Exception as e:
103
+ error_msg = f"Failed to add server {config.name}: {str(e)}"
104
+ logger.error(error_msg)
105
+ logger.error(traceback.format_exc())
106
+ return False, f"❌ {error_msg}"
107
+
108
+ async def _test_mcp_connection(self, config: MCPServerConfig) -> tuple[bool, str]:
109
+ """Test MCP server connection with detailed debugging"""
110
+ try:
111
+ logger.info(f"πŸ” Testing MCP connection to {config.url}")
112
+
113
+ timeout_seconds = 20.0
114
+
115
+ async with sse_client(config.url, timeout=timeout_seconds) as (read_stream, write_stream):
116
+ async with ClientSession(read_stream, write_stream) as session:
117
+ # Initialize MCP session
118
+ logger.info("πŸ”§ Initializing MCP session...")
119
+ await session.initialize()
120
+
121
+ # List available tools
122
+ logger.info("πŸ“‹ Listing available tools...")
123
+ tools = await session.list_tools()
124
+
125
+ tool_info = []
126
+ for tool in tools.tools:
127
+ tool_info.append(f" - {tool.name}: {tool.description}")
128
+ logger.info(f" πŸ“ Tool: {tool.name}")
129
+ logger.info(f" Description: {tool.description}")
130
+ if hasattr(tool, 'inputSchema') and tool.inputSchema:
131
+ logger.info(f" Input Schema: {tool.inputSchema}")
132
+
133
+ if len(tools.tools) == 0:
134
+ return False, "No tools found on MCP server"
135
+
136
+ message = f"Connected successfully!\nFound {len(tools.tools)} tools:\n" + "\n".join(tool_info)
137
+ return True, message
138
+
139
+ except asyncio.TimeoutError:
140
+ return False, "Connection timeout - server may be sleeping or unreachable"
141
+ except Exception as e:
142
+ logger.error(f"MCP connection failed: {e}")
143
+ logger.error(traceback.format_exc())
144
+ return False, f"Connection failed: {str(e)}"
145
+
146
+ def _extract_media_from_mcp_response(self, result_text: str, config: MCPServerConfig) -> Optional[str]:
147
+ """Enhanced media extraction from MCP responses"""
148
+ if not isinstance(result_text, str):
149
+ logger.info(f"πŸ” Non-string result: {type(result_text)}")
150
+ return None
151
+
152
+ base_url = config.url.replace("/gradio_api/mcp/sse", "")
153
+ logger.info(f"πŸ” Processing MCP result for media: {result_text[:300]}...")
154
+ logger.info(f"πŸ” Base URL: {base_url}")
155
+
156
+ # 1. Try to parse as JSON (most Gradio MCP servers return structured data)
157
+ try:
158
+ if result_text.strip().startswith('[') or result_text.strip().startswith('{'):
159
+ logger.info("πŸ” Attempting JSON parse...")
160
+ data = json.loads(result_text.strip())
161
+ logger.info(f"πŸ” Parsed JSON structure: {data}")
162
+
163
+ # Handle array format: [{'image': {'url': '...'}}] or [{'url': '...'}]
164
+ if isinstance(data, list) and len(data) > 0:
165
+ item = data[0]
166
+ logger.info(f"πŸ” First array item: {item}")
167
+
168
+ if isinstance(item, dict):
169
+ # Check for nested media structure
170
+ for media_type in ['image', 'audio', 'video']:
171
+ if media_type in item and isinstance(item[media_type], dict):
172
+ media_data = item[media_type]
173
+ if 'url' in media_data:
174
+ url = media_data['url']
175
+ logger.info(f"🎯 Found {media_type} URL: {url}")
176
+ return self._resolve_media_url(url, base_url)
177
+
178
+ # Check for direct URL
179
+ if 'url' in item:
180
+ url = item['url']
181
+ logger.info(f"🎯 Found direct URL: {url}")
182
+ return self._resolve_media_url(url, base_url)
183
+
184
+ # Handle object format: {'image': {'url': '...'}} or {'url': '...'}
185
+ elif isinstance(data, dict):
186
+ logger.info(f"πŸ” Processing dict: {data}")
187
+
188
+ # Check for nested media structure
189
+ for media_type in ['image', 'audio', 'video']:
190
+ if media_type in data and isinstance(data[media_type], dict):
191
+ media_data = data[media_type]
192
+ if 'url' in media_data:
193
+ url = media_data['url']
194
+ logger.info(f"🎯 Found {media_type} URL: {url}")
195
+ return self._resolve_media_url(url, base_url)
196
+
197
+ # Check for direct URL
198
+ if 'url' in data:
199
+ url = data['url']
200
+ logger.info(f"🎯 Found direct URL: {url}")
201
+ return self._resolve_media_url(url, base_url)
202
+
203
+ except json.JSONDecodeError:
204
+ logger.info("πŸ” Not valid JSON, trying other formats...")
205
+ except Exception as e:
206
+ logger.warning(f"πŸ” JSON parsing error: {e}")
207
+
208
+ # 2. Check for data URLs (base64 encoded media)
209
+ if result_text.startswith('data:'):
210
+ logger.info("🎯 Found data URL")
211
+ return result_text
212
+
213
+ # 3. Check for base64 image patterns
214
+ if any(result_text.startswith(pattern) for pattern in ['iVBORw0KGgoAAAANSUhEU', '/9j/', 'UklGR']):
215
+ logger.info("🎯 Found base64 image data")
216
+ return f"data:image/png;base64,{result_text}"
217
+
218
+ # 4. Check for file paths and convert to URLs
219
+ media_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.mp3', '.wav', '.ogg', '.m4a', '.flac', '.mp4', '.avi', '.mov']
220
+
221
+ if any(ext in result_text.lower() for ext in media_extensions):
222
+ # Extract just the filename if it's a path
223
+ if '/' in result_text:
224
+ filename = result_text.split('/')[-1]
225
+ else:
226
+ filename = result_text.strip()
227
+
228
+ # Create Gradio file URL
229
+ if filename.startswith('http'):
230
+ media_url = filename
231
+ else:
232
+ media_url = f"{base_url}/file={filename}"
233
+
234
+ logger.info(f"🎯 Found media file: {media_url}")
235
+ return media_url
236
+
237
+ # 5. Check for HTTP URLs that look like media
238
+ if result_text.startswith('http') and any(ext in result_text.lower() for ext in media_extensions):
239
+ logger.info(f"🎯 Found HTTP media URL: {result_text}")
240
+ return result_text
241
+
242
+ logger.info("❌ No media detected in result")
243
+ return None
244
+
245
+ def _resolve_media_url(self, url: str, base_url: str) -> str:
246
+ """Resolve relative URLs to absolute URLs"""
247
+ if url.startswith('http') or url.startswith('data:'):
248
+ return url
249
+ elif url.startswith('/'):
250
+ return f"{base_url}/file={url}"
251
+ else:
252
+ return f"{base_url}/file={url}"
253
+
254
+ def _convert_file_to_accessible_url(self, file_path: str, base_url: str) -> str:
255
+ """Convert local file path to accessible URL for MCP servers"""
256
+ try:
257
+ # Extract filename
258
+ filename = file_path.split('/')[-1] if '/' in file_path else file_path
259
+
260
+ # For Gradio MCP servers, we can use the /file= endpoint
261
+ # This assumes the MCP server can access the same file system or we upload it
262
+ accessible_url = f"{base_url}/file={filename}"
263
+
264
+ logger.info(f"πŸ”— Converted file path to accessible URL: {accessible_url}")
265
+ return accessible_url
266
+ except Exception as e:
267
+ logger.error(f"Failed to convert file to accessible URL: {e}")
268
+ return file_path # Fallback to original path
269
+
270
+ async def upload_file_to_gradio_server(self, file_path: str, target_server_url: str) -> Optional[str]:
271
+ """Upload a local file to a Gradio server and return the accessible URL"""
272
+ if not HTTPX_AVAILABLE:
273
+ logger.error("httpx not available for file upload")
274
+ return None
275
+
276
+ try:
277
+ import httpx
278
+
279
+ # Remove MCP endpoint to get base URL
280
+ base_url = target_server_url.replace("/gradio_api/mcp/sse", "")
281
+ upload_url = f"{base_url}/upload"
282
+
283
+ # Read the file
284
+ with open(file_path, "rb") as f:
285
+ file_content = f.read()
286
+
287
+ # Get filename
288
+ filename = file_path.split('/')[-1] if '/' in file_path else file_path
289
+
290
+ # Upload file to Gradio server
291
+ files = {"file": (filename, file_content)}
292
+
293
+ async with httpx.AsyncClient() as client:
294
+ response = await client.post(upload_url, files=files, timeout=30.0)
295
+
296
+ if response.status_code == 200:
297
+ # Gradio usually returns the file path/URL in the response
298
+ result = response.json()
299
+ if isinstance(result, list) and len(result) > 0:
300
+ uploaded_path = result[0]
301
+ # Convert to accessible URL
302
+ accessible_url = f"{base_url}/file={uploaded_path}"
303
+ logger.info(f"πŸ“€ Successfully uploaded file: {accessible_url}")
304
+ return accessible_url
305
+
306
+ logger.warning(f"File upload failed with status {response.status_code}")
307
+ return None
308
+
309
+ except Exception as e:
310
+ logger.error(f"Failed to upload file to Gradio server: {e}")
311
+ return None
312
+
313
+ def _check_file_upload_compatibility(self, config: MCPServerConfig) -> str:
314
+ """Check if a server likely supports file uploads"""
315
+ if "hf.space" in config.url:
316
+ return "🟑 Hugging Face Space (usually compatible)"
317
+ elif "gradio" in config.url.lower():
318
+ return "🟒 Gradio server (likely compatible)"
319
+ elif "localhost" in config.url or "127.0.0.1" in config.url:
320
+ return "🟒 Local server (file access available)"
321
+ else:
322
+ return "πŸ”΄ Remote server (may need public URLs)"
323
+
324
+ def get_server_status(self) -> Dict[str, str]:
325
+ """Get status of all configured servers"""
326
+ status = {}
327
+ for name in self.servers:
328
+ compatibility = self._check_file_upload_compatibility(self.servers[name])
329
+ status[name] = f"βœ… Connected (MCP Protocol) - {compatibility}"
330
+ return status
331
+
332
+ # Global MCP client instance
333
+ mcp_client = UniversalMCPClient()
334
+
335
+ def chat_with_mcp(message: Dict[str, Any], history: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
336
+ """Enhanced MCP chat function with multimodal input support"""
337
+
338
+ if not mcp_client.anthropic_client:
339
+ error_msg = "❌ Anthropic API key not configured. Please set ANTHROPIC_API_KEY environment variable."
340
+ history.append({"role": "user", "content": error_msg})
341
+ history.append({"role": "assistant", "content": error_msg})
342
+ return history, gr.MultimodalTextbox(value=None, interactive=False)
343
+
344
+ # Initialize variables for error handling
345
+ user_text = ""
346
+ user_files = []
347
+
348
+ try:
349
+ # Handle multimodal input - message is a dict with 'text' and 'files'
350
+ user_text = message.get("text", "") if message else ""
351
+ user_files = message.get("files", []) if message else []
352
+
353
+ # Handle case where message might be a string (backward compatibility)
354
+ if isinstance(message, str):
355
+ user_text = message
356
+ user_files = []
357
+
358
+ logger.info(f"πŸ’¬ Processing multimodal message:")
359
+ logger.info(f" πŸ“ Text: {user_text}")
360
+ logger.info(f" πŸ“ Files: {len(user_files)} files uploaded")
361
+
362
+ # Add uploaded files to chat history first
363
+ for file_path in user_files:
364
+ logger.info(f" πŸ“„ File: {file_path}")
365
+ history.append({"role": "user", "content": {"path": file_path}})
366
+
367
+ # Add text message if provided
368
+ if user_text and user_text.strip():
369
+ history.append({"role": "user", "content": user_text})
370
+
371
+ # If no text and no files, return early
372
+ if not user_text.strip() and not user_files:
373
+ return history, gr.MultimodalTextbox(value=None, interactive=False)
374
+
375
+ # Create messages for Claude API
376
+ messages = []
377
+
378
+ # Convert history to Claude API format (text only for context)
379
+ recent_history = history[-16:] if len(history) > 16 else history
380
+ for msg in recent_history:
381
+ if msg.get("role") in ["user", "assistant"]:
382
+ content = msg.get("content", "")
383
+
384
+ # Convert any non-string content to string description for context
385
+ if isinstance(content, dict):
386
+ if "path" in content:
387
+ file_path = content.get('path', 'unknown')
388
+ # Determine file type for context
389
+ if any(ext in file_path.lower() for ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp']):
390
+ content = f"[User uploaded an image: {file_path}]"
391
+ elif any(ext in file_path.lower() for ext in ['.mp3', '.wav', '.ogg', '.m4a', '.flac']):
392
+ content = f"[User uploaded an audio file: {file_path}]"
393
+ elif any(ext in file_path.lower() for ext in ['.mp4', '.avi', '.mov']):
394
+ content = f"[User uploaded a video file: {file_path}]"
395
+ else:
396
+ content = f"[User uploaded a file: {file_path}]"
397
+ else:
398
+ content = f"[Object: {str(content)[:50]}...]"
399
+ elif isinstance(content, (list, tuple)):
400
+ content = f"[List: {str(content)[:50]}...]"
401
+ elif content is None:
402
+ content = "[Empty]"
403
+ else:
404
+ content = str(content)
405
+
406
+ messages.append({
407
+ "role": msg["role"],
408
+ "content": content
409
+ })
410
+
411
+ # Check if we have MCP servers to use
412
+ if not mcp_client.servers:
413
+ # No MCP servers - use regular Claude API for simple chat
414
+ logger.info("πŸ’¬ No MCP servers available, using regular Claude chat")
415
+
416
+ system_prompt = f"""You are Claude Sonnet 4, a helpful AI assistant with native multimodal capabilities. You can have conversations, answer questions, help with various tasks, and provide information on a wide range of topics.
417
+ YOUR NATIVE CAPABILITIES (Available right now):
418
+ - **Image Understanding**: You can directly see and describe images, analyze their content, read text in images, identify objects, people, scenes, etc.
419
+ - **Text Processing**: You can analyze, summarize, translate, and process text directly
420
+ - **General Knowledge**: You can answer questions, explain concepts, and have conversations
421
+ - **Code Analysis**: You can read, analyze, and explain code
422
+ Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
423
+ IMPORTANT: You DO NOT need MCP servers for:
424
+ - Describing or analyzing uploaded images
425
+ - Reading text in images
426
+ - Identifying objects, people, or scenes in images
427
+ - General conversation and knowledge questions
428
+ You DO need MCP servers for:
429
+ - Creating new images, audio, or video
430
+ - Editing or transforming existing media files
431
+ - Transcribing audio files
432
+ - Processing non-image files (audio, video, documents)
433
+ If users upload images and ask you to describe or analyze them, use your native vision capabilities immediately. Only mention MCP servers if they ask for creation or editing tasks."""
434
+
435
+ # Use regular messages API
436
+ response = mcp_client.anthropic_client.messages.create(
437
+ model="claude-sonnet-4-20250514",
438
+ max_tokens=2048,
439
+ system=system_prompt,
440
+ messages=messages
441
+ )
442
+
443
+ else:
444
+ # We have MCP servers - use the MCP connector API
445
+ mcp_servers = []
446
+ for server_name, config in mcp_client.servers.items():
447
+ mcp_servers.append({
448
+ "type": "url",
449
+ "url": config.url,
450
+ "name": server_name.replace(" ", "_").lower()
451
+ })
452
+
453
+ # Enhanced system prompt with multimodal and MCP instructions
454
+ uploaded_files_context = ""
455
+ if user_files:
456
+ uploaded_files_context = f"\n\nFILES UPLOADED BY USER:\n"
457
+ for i, file_path in enumerate(user_files, 1):
458
+ file_name = file_path.split('/')[-1] if '/' in file_path else file_path
459
+ if any(ext in file_path.lower() for ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp']):
460
+ file_type = "Image"
461
+ elif any(ext in file_path.lower() for ext in ['.mp3', '.wav', '.ogg', '.m4a', '.flac']):
462
+ file_type = "Audio"
463
+ elif any(ext in file_path.lower() for ext in ['.mp4', '.avi', '.mov']):
464
+ file_type = "Video"
465
+ else:
466
+ file_type = "File"
467
+ uploaded_files_context += f"{i}. {file_type}: {file_name} (path: {file_path})\n"
468
+
469
+ # Enhanced system prompt with Claude's native capabilities and MCP usage
470
+ system_prompt = f"""You are Claude Sonnet 4, a helpful AI assistant with both native multimodal capabilities and access to various MCP tools.
471
+ YOUR NATIVE CAPABILITIES (No MCP tools needed):
472
+ - **Image Understanding**: You can directly see and describe images, analyze their content, read text in images, etc.
473
+ - **Text Processing**: You can analyze, summarize, translate, and process text directly
474
+ - **General Knowledge**: You can answer questions, explain concepts, and have conversations
475
+ - **Code Analysis**: You can read, analyze, and explain code
476
+ WHEN TO USE MCP TOOLS:
477
+ - **Image Generation**: Creating new images from text prompts
478
+ - **Image Editing**: Modifying, enhancing, or transforming existing images
479
+ - **Audio Processing**: Transcribing audio, generating speech, audio enhancement
480
+ - **Video Processing**: Creating or editing videos
481
+ - **Specialized Analysis**: Tasks requiring specific models or APIs
482
+ UPLOADED FILES HANDLING:
483
+ {uploaded_files_context}
484
+ IMPORTANT - For uploaded images:
485
+ - **Image Description/Analysis**: Use your NATIVE vision capabilities - you can see and describe images directly
486
+ - **Image Editing/Enhancement**: Use MCP image processing tools
487
+ - **Image Generation**: Use MCP image generation tools
488
+ IMPORTANT - File URL Conversion for MCP Tools:
489
+ When using MCP tools that require file inputs, you need to be aware that uploaded files have local paths that remote MCP servers cannot access.
490
+ For uploaded files in MCP tool calls:
491
+ - If an MCP tool fails with "Invalid file data format" or similar errors about file paths
492
+ - The issue is that remote MCP servers cannot access local file paths like '/tmp/gradio/...'
493
+ - In such cases, inform the user that the MCP server requires files to be accessible via public URLs
494
+ - Suggest that they need a "File Upload" MCP server or that the specific MCP server may need configuration for file handling
495
+ Current uploaded files that may need URL conversion:
496
+ {uploaded_files_context}
497
+ IMPORTANT - GRADIO MEDIA DISPLAY:
498
+ When MCP tools return media, end your response with "MEDIA_GENERATED: [URL]" where [URL] is the actual media URL.
499
+ Examples:
500
+ - User uploads image + "What's in this image?" β†’ Use NATIVE vision (no MCP needed)
501
+ - User uploads image + "Make this vintage" β†’ Use MCP image editing tool
502
+ - User says "Generate a sunset image" β†’ Use MCP image generation tool
503
+ - User uploads audio + "Transcribe this" β†’ Use MCP transcription tool
504
+ Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
505
+ Available MCP servers: {list(mcp_client.servers.keys())}"""
506
+
507
+ # Debug logging
508
+ logger.info(f"πŸ“€ Sending {len(messages)} messages to Claude API")
509
+ logger.info(f"πŸ”§ Using {len(mcp_servers)} MCP servers")
510
+
511
+ # Call Claude with MCP connector using the correct beta API
512
+ response = mcp_client.anthropic_client.beta.messages.create(
513
+ model="claude-sonnet-4-20250514",
514
+ max_tokens=2048,
515
+ system=system_prompt,
516
+ messages=messages,
517
+ mcp_servers=mcp_servers,
518
+ betas=["mcp-client-2025-04-04"]
519
+ )
520
+
521
+ response_text = ""
522
+ media_url = None
523
+ current_server_name = None # Track the current server for tool results
524
+
525
+ # Process Claude's response
526
+ for content in response.content:
527
+ if content.type == "text":
528
+ response_text += content.text
529
+ # Check if Claude indicated media was generated
530
+ if "MEDIA_GENERATED:" in content.text:
531
+ media_match = re.search(r"MEDIA_GENERATED:\s*([^\s]+)", content.text)
532
+ if media_match:
533
+ media_url = media_match.group(1)
534
+ # Clean up the response text
535
+ response_text = re.sub(r"MEDIA_GENERATED:\s*[^\s]+", "", response_text).strip()
536
+ logger.info(f"🎯 Claude indicated media generated: {media_url}")
537
+
538
+ elif hasattr(content, 'type') and content.type == "mcp_tool_use":
539
+ tool_name = content.name
540
+ server_name = content.server_name
541
+ current_server_name = server_name # Remember for the result
542
+
543
+ logger.info(f"πŸ”§ Claude used MCP tool: {tool_name} on server: {server_name}")
544
+
545
+ response_text += f"\n\nπŸ”§ Used {tool_name} successfully!"
546
+
547
+ elif hasattr(content, 'type') and content.type == "mcp_tool_result":
548
+ # mcp_tool_result blocks don't have server_name, but we can use the last one
549
+ tool_use_id = getattr(content, 'tool_use_id', 'unknown')
550
+
551
+ logger.info(f"πŸ“ Processing MCP tool result (tool_use_id: {tool_use_id})")
552
+
553
+ if content.content:
554
+ result_content = content.content[0]
555
+ result_text = result_content.text if hasattr(result_content, 'text') else str(result_content)
556
+
557
+ logger.info(f"πŸ“ MCP tool result: {result_text[:200]}...")
558
+
559
+ response_text += f"\n\n**Result**: {result_text}"
560
+
561
+ # Try to extract media from the result using the current server
562
+ if current_server_name and current_server_name in mcp_client.servers:
563
+ config = mcp_client.servers[current_server_name]
564
+ extracted_media = mcp_client._extract_media_from_mcp_response(result_text, config)
565
+ if extracted_media:
566
+ media_url = extracted_media
567
+ logger.info(f"🎯 Extracted media from MCP result: {media_url}")
568
+ else:
569
+ # Fallback: try all servers to find media
570
+ for server_name, config in mcp_client.servers.items():
571
+ extracted_media = mcp_client._extract_media_from_mcp_response(result_text, config)
572
+ if extracted_media:
573
+ media_url = extracted_media
574
+ logger.info(f"🎯 Extracted media from MCP result (fallback): {media_url}")
575
+ break
576
+ else:
577
+ response_text += f"\n\n❌ Tool call failed: No content returned"
578
+
579
+ if not response_text:
580
+ response_text = "I understand your request and I'm here to help."
581
+
582
+ # Add assistant response to history
583
+ history.append({"role": "assistant", "content": response_text})
584
+
585
+ # Add media as separate message if we have it
586
+ if media_url:
587
+ logger.info(f"🎨 Adding media to chat: {media_url}")
588
+ history.append({"role": "assistant", "content": {"path": media_url}})
589
+
590
+ return history, gr.MultimodalTextbox(value=None, interactive=False)
591
+
592
+ except Exception as e:
593
+ error_msg = f"❌ Error: {str(e)}"
594
+ logger.error(f"Chat error: {e}")
595
+ logger.error(traceback.format_exc())
596
+
597
+ # Add user input to history if it exists
598
+ if user_text and user_text.strip():
599
+ history.append({"role": "user", "content": user_text})
600
+ if user_files:
601
+ for file_path in user_files:
602
+ history.append({"role": "user", "content": {"path": file_path}})
603
+
604
+ history.append({"role": "assistant", "content": error_msg})
605
+ return history, gr.MultimodalTextbox(value=None, interactive=False)
606
+
607
  def convert_hf_space_to_url(space_name: str) -> str:
608
  """
609
  Convert HuggingFace space name to proper URL format.
 
708
  error_msg = f"❌ Failed to add server: {str(e)}"
709
  logger.error(error_msg)
710
  logger.error(traceback.format_exc())
711
+ return error_msg, ""
712
+
713
+ def get_server_status() -> tuple[str, str]:
714
+ """Get status of all servers in accordion format"""
715
+ try:
716
+ status = mcp_client.get_server_status()
717
+ server_count = f"**Total MCP Servers**: {len(status)}"
718
+
719
+ if not status:
720
+ return server_count, "<p><em>No MCP servers configured yet.</em></p>"
721
+
722
+ accordion_html = ""
723
+
724
+ for name, state in status.items():
725
+ server_config = mcp_client.servers[name]
726
+ base_url = server_config.url.replace("/gradio_api/mcp/sse", "")
727
+
728
+ # Determine health status
729
+ health = "🟒 Healthy" if "βœ… Connected" in state else "πŸ”΄ Unhealthy"
730
+
731
+ accordion_html += f"""
732
+ <details style="margin-bottom: 10px;">
733
+ <summary style="cursor: pointer; padding: 8px; background: #e9ecef; border-radius: 4px;"><strong>πŸ”§ {name}</strong></summary>
734
+ <div style="padding: 10px; border-left: 3px solid #007bff; margin-left: 10px; margin-top: 5px;">
735
+ <p><strong>Title:</strong> {name}</p>
736
+ <p><strong>Status:</strong> Connected (MCP Protocol)</p>
737
+ <p><strong>Health:</strong> {health}</p>
738
+ <p><strong>Base URL:</strong> {base_url}</p>
739
+ </div>
740
+ </details>
741
+ """
742
+
743
+ return server_count, accordion_html
744
+
745
+ except Exception as e:
746
+ return "**Total MCP Servers**: 0", f"<p style='color: red;'>❌ Error getting status: {str(e)}</p>"
747
+
748
+ # Create Gradio Interface
749
+ def create_interface():
750
+ # Custom CSS for better layout
751
+ custom_css = """
752
+ /* Hide Gradio footer */
753
+ footer {
754
+ display: none !important;
755
+ }
756
+
757
+ /* Make chatbot expand to fill available space */
758
+ .gradio-container {
759
+ height: 100vh !important;
760
+ }
761
+
762
+ /* Ensure proper flex layout */
763
+ .main-content {
764
+ display: flex;
765
+ flex-direction: column;
766
+ height: 100%;
767
+ }
768
+
769
+ /* Input area stays at bottom with minimal padding */
770
+ .input-area {
771
+ margin-top: auto;
772
+ padding-top: 0.25rem !important;
773
+ padding-bottom: 0 !important;
774
+ margin-bottom: 0 !important;
775
+ }
776
+
777
+ /* Reduce padding around chatbot */
778
+ .chatbot {
779
+ margin-bottom: 0 !important;
780
+ padding-bottom: 0 !important;
781
+ }
782
+ """
783
+
784
+ with gr.Blocks(
785
+ title="Universal MCP Client",
786
+ theme=gr.themes.Citrus(),
787
+ fill_height=True,
788
+ css=custom_css
789
+ ) as demo:
790
+
791
+ # Sidebar with relevant information
792
+ with gr.Sidebar():
793
+ gr.Markdown("# Gradio.chat.app")
794
+
795
+ # Collapsible information section
796
+ with gr.Accordion("πŸ“š Guide & Info", open=True):
797
+ gr.Markdown("""
798
+ ## βœ… Quick Start
799
+
800
+ **Native Capabilities:**
801
+ - πŸ‘οΈ **Image Understanding**: Upload & ask "What's in this?"
802
+ - πŸ’¬ **Chat**: All conversation capabilities
803
+ - 🧠 **Analysis**: Code, text, documents
804
+
805
+ **MCP Servers:**
806
+ - 🎨 **Generate**: Images, audio, content
807
+ - ⚑ **Process**: Files via connected servers
808
+ - πŸ”§ **Edit**: Transform existing media
809
+ """)
810
+
811
+ gr.Markdown("""
812
+ ## 🎯 How It Works
813
+
814
+ 1. **Direct Tasks**: Claude handles image analysis instantly
815
+ 2. **Generation**: MCP servers create new content
816
+ 3. **File Processing**: Server-dependent compatibility
817
+
818
+ ## πŸ“ File Support
819
+ - **Images**: PNG, JPG, GIF, WebP
820
+ - **Audio**: MP3, WAV, M4A, FLAC
821
+ - **Video**: MP4, AVI, MOV
822
+ - **Documents**: PDF, TXT, DOCX
823
+ """)
824
+
825
+ # Server status (not in accordion) - make it reactive
826
+ gr.Markdown("## πŸ”§ Server Status")
827
+ server_count_display = gr.Markdown(f"**Connected Servers**: {len(mcp_client.servers)}")
828
+
829
+ if mcp_client.servers:
830
+ server_list = "\n".join([f"β€’ **{name}**" for name in mcp_client.servers.keys()])
831
+ server_list_display = gr.Markdown(server_list)
832
+ else:
833
+ server_list_display = gr.Markdown("*No servers connected*\n\nAdd servers below.")
834
+
835
+ # Server management in accordion
836
+ with gr.Accordion("βš™οΈ Manage Servers", open=False):
837
+ gr.Markdown("### Add MCP Server")
838
+
839
+ server_name = gr.Textbox(
840
+ label="Server Title",
841
+ placeholder="Text to Image Generator"
842
+ )
843
+ space_name = gr.Textbox(
844
+ label="HuggingFace Space Name",
845
+ placeholder="ysharma/dalle-3-xl-lora-v2"
846
+ )
847
+
848
+ add_server_btn = gr.Button("Add Server", variant="primary")
849
+ add_server_output = gr.Textbox(label="Status", interactive=False)
850
+ add_server_details = gr.HTML(label="Details")
851
+
852
+ status_btn = gr.Button("Refresh Status", variant="secondary")
853
+ status_count = gr.Markdown("**Total MCP Servers**: 0")
854
+ status_output = gr.HTML()
855
+
856
+ # Main chat area - full height
857
+ with gr.Column(elem_classes="main-content"):
858
+ # Chatbot takes most of the space
859
+ chatbot = gr.Chatbot(
860
+ label="Universal MCP-Powered Multimodal Chatbot",
861
+ show_label=False,
862
+ type="messages",
863
+ scale=1, # Expand to fill available space
864
+ show_copy_button=True,
865
+ avatar_images=None
866
+ )
867
+
868
+ # Input area at bottom - fixed size
869
+ with gr.Column(scale=0, elem_classes="input-area"):
870
+ chat_input = gr.MultimodalTextbox(
871
+ interactive=True,
872
+ file_count="multiple",
873
+ placeholder="Enter message or upload files (images, audio, video, documents)...",
874
+ show_label=False,
875
+ sources=["upload", "microphone"],
876
+ file_types=None # Accept all file types
877
+ )
878
+
879
+ # Event handlers for multimodal chat
880
+ def submit_message(message, history):
881
+ if message and (message.get("text", "").strip() or message.get("files", [])):
882
+ new_history, cleared_input = chat_with_mcp(message, history)
883
+ return new_history, cleared_input
884
+ return history, gr.MultimodalTextbox(value=None, interactive=False)
885
+
886
+ def enable_input():
887
+ return gr.MultimodalTextbox(interactive=True)
888
+
889
+ def update_server_display():
890
+ """Update the server status display in sidebar"""
891
+ server_count = len(mcp_client.servers)
892
+ count_text = f"**Connected Servers**: {server_count}"
893
+
894
+ if mcp_client.servers:
895
+ server_list = "\n".join([f"β€’ **{name}**" for name in mcp_client.servers.keys()])
896
+ return count_text, server_list
897
+ else:
898
+ return count_text, "*No servers connected*\n\nAdd servers below."
899
+
900
+ def handle_add_server(name, space_name):
901
+ """Handle adding a server and update displays"""
902
+ status_msg, details_html = add_custom_server(name, space_name)
903
+
904
+ # Update sidebar server display
905
+ count_text, list_text = update_server_display()
906
+
907
+ return status_msg, details_html, count_text, list_text, "", "" # Clear inputs
908
+
909
+ def handle_refresh_status():
910
+ """Handle refresh status button"""
911
+ count_text, accordions_html = get_server_status()
912
+ return count_text, accordions_html
913
+
914
+ # Set up the chat flow - using built-in submit functionality
915
+ chat_msg_enter = chat_input.submit(
916
+ submit_message,
917
+ inputs=[chat_input, chatbot],
918
+ outputs=[chatbot, chat_input]
919
+ )
920
+ chat_msg_enter.then(enable_input, None, [chat_input])
921
+
922
+ # Server management functionality
923
+ add_server_btn.click(
924
+ handle_add_server,
925
+ inputs=[server_name, space_name],
926
+ outputs=[add_server_output, add_server_details, server_count_display, server_list_display, server_name, space_name]
927
+ )
928
+
929
+ status_btn.click(
930
+ handle_refresh_status,
931
+ outputs=[status_count, status_output]
932
+ )
933
+
934
+ return demo
935
+
936
+ if __name__ == "__main__":
937
+ logger.info("πŸš€ Starting Universal Multimodal MCP Chatbot Client...")
938
+
939
+ demo = create_interface()
940
+ demo.launch(debug=True)
941
+
942
+ logger.info("βœ… Universal Multimodal MCP Chatbot Client started successfully!")
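
For anyone trying this commit locally, below is a minimal smoke-test sketch. It is not part of the diff above; it assumes the file is saved as app.py, that the packages imported at the top of app.py (gradio, anthropic, mcp, httpx) are installed, that ANTHROPIC_API_KEY is exported in the shell before running, and that the Space URL shown is a placeholder to be replaced with a real MCP-enabled Gradio Space.

# Minimal local smoke test for this commit -- a sketch under the assumptions above.
import asyncio

# mcp_client is the module-level UniversalMCPClient instance defined in app.py,
# so a server added here is also visible in the launched chat UI.
from app import MCPServerConfig, create_interface, mcp_client

async def smoke_test() -> None:
    config = MCPServerConfig(
        name="demo image server",                   # any human-readable label
        url="https://username-spacename.hf.space",  # placeholder Space URL
        description="example MCP-enabled Gradio Space",
    )
    # add_server_async normalizes the URL to .../gradio_api/mcp/sse and lists
    # the server's tools over SSE before accepting it.
    ok, message = await mcp_client.add_server_async(config)
    print(ok, message)

if __name__ == "__main__":
    asyncio.run(smoke_test())     # verify the MCP handshake first
    create_interface().launch()   # then start the chat UI from this commit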