ysharma HF Staff committed on
Commit 84f8ce7 · verified · 1 Parent(s): 9208064

Update chat_handler.py

Files changed (1)
  1. chat_handler.py +420 -323
chat_handler.py CHANGED
@@ -1,15 +1,17 @@
1
  """
2
- Chat handling logic for Universal MCP Client - Enhanced with Inference Provider Support
3
  """
4
  import re
5
  import logging
6
  import traceback
7
- import asyncio
8
  from datetime import datetime
9
  from typing import Dict, Any, List, Tuple, Optional
10
  import gradio as gr
11
  from gradio import ChatMessage
 
12
  import time
 
 
13
 
14
  from config import AppConfig
15
  from mcp_client import UniversalMCPClient
@@ -17,26 +19,54 @@ from mcp_client import UniversalMCPClient
17
  logger = logging.getLogger(__name__)
18
 
19
  class ChatHandler:
20
- """Handles chat interactions with multiple LLM backends and MCP servers using ChatMessage dataclass"""
21
 
22
  def __init__(self, mcp_client: UniversalMCPClient):
23
  self.mcp_client = mcp_client
 
 
 
 
 
 
 
24
 
25
- def process_multimodal_message(self, message: Dict[str, Any], history: List) -> Tuple[List[ChatMessage], Dict[str, Any]]:
26
- """Enhanced MCP chat function with multimodal input support and multiple LLM backends"""
 
 
 
27
 
28
- # Check if any LLM backend is configured
29
- backend_configured = False
30
 
31
- if self.mcp_client.anthropic_client and AppConfig.ANTHROPIC_API_KEY:
32
- backend_configured = True
33
- backend_type = "anthropic"
34
- elif self.mcp_client.hf_client and self.mcp_client.current_provider:
35
- backend_configured = True
36
- backend_type = "hf_inference"
37
 
38
- if not backend_configured:
39
- error_msg = "❌ No LLM backend configured. Please configure either Anthropic API key or HuggingFace Inference Provider."
40
  history.append(ChatMessage(role="user", content=error_msg))
41
  history.append(ChatMessage(role="assistant", content=error_msg))
42
  return history, gr.MultimodalTextbox(value=None, interactive=False)
@@ -44,7 +74,9 @@ class ChatHandler:
44
  # Initialize variables for error handling
45
  user_text = ""
46
  user_files = []
47
-
 
 
48
  try:
49
  # Handle multimodal input - message is a dict with 'text' and 'files'
50
  user_text = message.get("text", "") if message else ""
@@ -55,7 +87,7 @@ class ChatHandler:
55
  user_text = message
56
  user_files = []
57
 
58
- logger.info(f"💬 Processing multimodal message with {backend_type} backend:")
59
  logger.info(f" 📝 Text: {user_text}")
60
  logger.info(f" 📁 Files: {len(user_files)} files uploaded")
61
  logger.info(f" 📋 History type: {type(history)}, length: {len(history)}")
@@ -84,10 +116,23 @@ class ChatHandler:
84
 
85
  history = converted_history
86
 
87
- # Add uploaded files to chat history first
88
  for file_path in user_files:
89
- logger.info(f" 📄 File: {file_path}")
90
- history.append(ChatMessage(role="user", content={"path": file_path}))
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  # Add text message if provided
93
  if user_text and user_text.strip():
@@ -97,14 +142,11 @@ class ChatHandler:
97
  if not user_text.strip() and not user_files:
98
  return history, gr.MultimodalTextbox(value=None, interactive=False)
99
 
100
- # Create messages for LLM API
101
- messages = self._prepare_llm_messages(history)
102
 
103
- # Process the chat based on backend type
104
- if backend_type == "anthropic":
105
- response_messages = self._call_anthropic_api(messages, user_files)
106
- else: # hf_inference
107
- response_messages = self._call_hf_inference_api(messages, user_files)
108
 
109
  # Add all response messages to history
110
  history.extend(response_messages)
@@ -126,12 +168,24 @@ class ChatHandler:
126
  history.append(ChatMessage(role="assistant", content=error_msg))
127
  return history, gr.MultimodalTextbox(value=None, interactive=False)
128
 
129
- def _prepare_llm_messages(self, history: List) -> List[Dict[str, Any]]:
130
- """Convert history (ChatMessage or dict) to LLM API format"""
131
  messages = []
132
 
133
- # Convert history to LLM API format (text only for context)
134
- recent_history = history[-16:] if len(history) > 16 else history
 
 
 
 
 
 
 
 
 
 
 
 
135
  for msg in recent_history:
136
  # Handle both ChatMessage objects and dictionary format for backward compatibility
137
  if hasattr(msg, 'role'): # ChatMessage object
@@ -149,15 +203,20 @@ class ChatHandler:
149
  if isinstance(content, dict):
150
  if "path" in content:
151
  file_path = content.get('path', 'unknown')
152
- # Determine file type for context
153
- if AppConfig.is_image_file(file_path):
154
- content = f"[User uploaded an image: {file_path}]"
155
- elif AppConfig.is_audio_file(file_path):
156
- content = f"[User uploaded an audio file: {file_path}]"
157
- elif AppConfig.is_video_file(file_path):
158
- content = f"[User uploaded a video file: {file_path}]"
 
 
 
 
159
  else:
160
- content = f"[User uploaded a file: {file_path}]"
 
161
  else:
162
  content = f"[Object: {str(content)[:50]}...]"
163
  elif isinstance(content, (list, tuple)):
@@ -174,238 +233,241 @@ class ChatHandler:
174
 
175
  return messages
176
 
177
- def _call_anthropic_api(self, messages: List[Dict[str, Any]], user_files: List[str]) -> List[ChatMessage]:
178
- """Call Anthropic API (existing implementation)"""
179
 
180
- # Check if we have MCP servers to use
181
- if not self.mcp_client.servers:
182
- return self._call_claude_without_mcp(messages)
 
183
  else:
184
- return self._call_claude_with_mcp(messages, user_files)
185
-
186
- def _call_hf_inference_api(self, messages: List[Dict[str, Any]], user_files: List[str]) -> List[ChatMessage]:
187
- """Call HuggingFace Inference API with custom MCP implementation"""
188
-
189
- # Run async call in sync context
190
- def run_async():
191
- loop = asyncio.new_event_loop()
192
- asyncio.set_event_loop(loop)
193
- try:
194
- return loop.run_until_complete(
195
- self.mcp_client.call_llm_with_mcp(messages, user_files)
196
- )
197
- finally:
198
- loop.close()
199
-
200
- try:
201
- return run_async()
202
- except Exception as e:
203
- logger.error(f"HF Inference API error: {e}")
204
- return [ChatMessage(role="assistant", content=f"❌ Error with HF Inference: {str(e)}")]
205
 
206
- def _call_claude_without_mcp(self, messages: List[Dict[str, Any]]) -> List[ChatMessage]:
207
- """Call Claude API without MCP servers"""
208
- logger.info("💬 No MCP servers available, using regular Claude chat")
209
 
210
  system_prompt = self._get_native_system_prompt()
211
 
212
- # Use regular messages API
213
- response = self.mcp_client.anthropic_client.messages.create(
214
- model=AppConfig.CLAUDE_MODEL,
215
- max_tokens=AppConfig.MAX_TOKENS,
216
- system=system_prompt,
217
- messages=messages
218
- )
219
-
220
- response_text = ""
221
- for content in response.content:
222
- if content.type == "text":
223
- response_text += content.text
224
 
225
- if not response_text:
226
- response_text = "I understand your request and I'm here to help."
 
 
 
 
 
 
 
 
227
 
228
- return [ChatMessage(role="assistant", content=response_text)]
 
 
 
 
 
 
 
 
 
 
 
229
 
230
- def _call_claude_with_mcp(self, messages: List[Dict[str, Any]], user_files: List[str]) -> List[ChatMessage]:
231
- """Call Claude API with MCP servers and return structured responses"""
232
- mcp_servers = []
233
- for server_name, config in self.mcp_client.servers.items():
234
- mcp_servers.append({
235
- "type": "url",
236
- "url": config.url,
237
- "name": server_name.replace(" ", "_").lower()
238
- })
239
 
240
  # Enhanced system prompt with multimodal and MCP instructions
241
- system_prompt = self._get_mcp_system_prompt(user_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  # Debug logging
244
- logger.info(f"📤 Sending {len(messages)} messages to Claude API")
245
- logger.info(f"🔧 Using {len(mcp_servers)} MCP servers")
 
 
246
 
247
  start_time = time.time()
248
 
249
- # Call Claude with MCP connector using the correct beta API
250
- response = self.mcp_client.anthropic_client.beta.messages.create(
251
- model=AppConfig.CLAUDE_MODEL,
252
- max_tokens=AppConfig.MAX_TOKENS,
253
- system=system_prompt,
254
- messages=messages,
255
- mcp_servers=mcp_servers,
256
- betas=[AppConfig.MCP_BETA_VERSION]
257
- )
258
-
259
- return self._process_mcp_response(response, start_time)
 
260
 
261
- def _process_mcp_response(self, response, start_time: float) -> List[ChatMessage]:
262
- """Process Claude's response with MCP tool calls into structured ChatMessage objects"""
263
  chat_messages = []
264
- current_tool_id = None
265
- current_server_name = None
266
- tool_start_time = None
267
- text_segments = [] # Collect text segments separately
268
-
269
- # Process Claude's response
270
- for content in response.content:
271
- if content.type == "text":
272
- # Collect text segments but don't combine them yet
273
- text_content = content.text
274
- # Check if Claude indicated media was generated
275
- if "MEDIA_GENERATED:" in text_content:
276
- media_match = re.search(r"MEDIA_GENERATED:\s*([^\s]+)", text_content)
277
- if media_match:
278
- media_url = media_match.group(1)
279
- # Clean up the response text
280
- text_content = re.sub(r"MEDIA_GENERATED:\s*[^\s]+", "", text_content).strip()
281
- logger.info(f"🎯 Claude indicated media generated: {media_url}")
282
- # Add media as separate message
283
- chat_messages.append(ChatMessage(
284
- role="assistant",
285
- content={"path": media_url}
286
- ))
287
-
288
- if text_content.strip():
289
- text_segments.append(text_content.strip())
290
 
291
- elif hasattr(content, 'type') and content.type == "mcp_tool_use":
292
- # Add any accumulated text before tool use
293
- if text_segments:
294
- combined_text = " ".join(text_segments)
295
- if combined_text.strip():
296
- chat_messages.append(ChatMessage(
297
- role="assistant",
298
- content=combined_text.strip()
299
- ))
300
- text_segments = [] # Reset
301
-
302
- tool_name = content.name
303
- server_name = content.server_name
304
- current_tool_id = getattr(content, 'id', 'unknown')
305
- current_server_name = server_name
306
- tool_start_time = time.time()
307
-
308
- logger.info(f"🔧 Claude used MCP tool: {tool_name} on server: {server_name}")
309
-
310
- # Create a "thinking" message for tool usage
311
- chat_messages.append(ChatMessage(
312
- role="assistant",
313
- content="",
314
- metadata={
315
- "title": f"🔧 Using {tool_name}",
316
- "id": current_tool_id,
317
- "status": "pending",
318
- "log": f"Server: {server_name}"
319
- }
320
- ))
321
-
322
- elif hasattr(content, 'type') and content.type == "mcp_tool_result":
323
- tool_use_id = getattr(content, 'tool_use_id', 'unknown')
324
- duration = time.time() - tool_start_time if tool_start_time else None
325
-
326
- logger.info(f"📝 Processing MCP tool result (tool_use_id: {tool_use_id})")
327
 
328
- # Update the pending tool message to completed
329
- for msg in chat_messages:
330
- if (msg.metadata and
331
- msg.metadata.get("id") == current_tool_id and
332
- msg.metadata.get("status") == "pending"):
333
- msg.metadata["status"] = "done"
334
- if duration:
335
- msg.metadata["duration"] = round(duration, 2)
336
- break
337
 
338
- media_url = None
339
- if content.content:
340
- result_content = content.content[0]
341
- result_text = result_content.text if hasattr(result_content, 'text') else str(result_content)
342
 
343
- logger.info(f"📝 MCP tool result: {result_text[:200]}...")
 
344
 
345
- # Try to extract media URL from the result
346
- if current_server_name and current_server_name in self.mcp_client.servers:
347
- config = self.mcp_client.servers[current_server_name]
348
- extracted_media = self.mcp_client._extract_media_from_mcp_response(result_text, config)
349
- if extracted_media:
350
- media_url = extracted_media
351
- logger.info(f"🎯 Extracted media from MCP result: {media_url}")
 
 
 
 
 
 
 
 
352
  else:
353
- # Fallback: try all servers to find media
354
- for server_name, config in self.mcp_client.servers.items():
355
- extracted_media = self.mcp_client._extract_media_from_mcp_response(result_text, config)
356
- if extracted_media:
357
- media_url = extracted_media
358
- logger.info(f"🎯 Extracted media from MCP result (fallback): {media_url}")
359
- break
360
 
361
- # Always show the full tool result
362
  chat_messages.append(ChatMessage(
363
  role="assistant",
364
- content=result_text,
365
  metadata={
366
- "title": "📋 Tool Result",
367
- "parent_id": current_tool_id,
368
  "status": "done"
369
  }
370
  ))
371
 
372
- # Only add separate media display if the tool result does NOT contain
373
- # any Gradio file data structures that would be auto-rendered
374
- if media_url and not self._contains_gradio_file_structure(result_text):
375
- logger.info(f"🎯 Adding separate media display for: {media_url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  chat_messages.append(ChatMessage(
377
- role="assistant",
378
  content={"path": media_url}
379
  ))
380
  else:
381
- if media_url:
382
- logger.info(f"🚫 Skipping separate media - tool result contains Gradio file structure")
383
- else:
384
- logger.info(f"🚫 No media URL extracted")
385
-
 
 
 
 
 
 
 
 
386
  else:
387
- # Add error message for failed tool call
 
 
 
388
  chat_messages.append(ChatMessage(
389
  role="assistant",
390
- content="Tool call failed: No content returned",
391
  metadata={
392
- "title": "❌ Tool Error",
393
- "parent_id": current_tool_id,
394
- "status": "done"
 
395
  }
396
  ))
397
-
398
- # Add any remaining text segments after all processing
399
- if text_segments:
400
- combined_text = " ".join(text_segments)
401
- if combined_text.strip():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  chat_messages.append(ChatMessage(
403
  role="assistant",
404
- content=combined_text.strip()
405
  ))
406
-
407
- # Fallback if no content was processed
408
- if not chat_messages:
 
409
  chat_messages.append(ChatMessage(
410
  role="assistant",
411
  content="I understand your request and I'm here to help."
@@ -413,130 +475,165 @@ class ChatHandler:
413
 
414
  return chat_messages
415
 
416
- def _contains_gradio_file_structure(self, text: str) -> bool:
417
- """Check if the text contains ANY Gradio file data structures that would be auto-rendered"""
418
-
419
- # Check for key indicators of Gradio file structures
420
- gradio_indicators = [
421
- # Gradio FileData type indicators
422
- "'_type': 'gradio.FileData'",
423
- '"_type": "gradio.FileData"',
424
- 'gradio.FileData',
425
-
426
- # File structure patterns
427
- "'path':",
428
- '"path":',
429
- "'url':",
430
- '"url":',
431
- "'orig_name':",
432
- '"orig_name":',
433
- "'mime_type':",
434
- '"mime_type":',
435
- 'is_stream',
436
- 'meta_type',
437
-
438
- # Common file result patterns
439
- "{'image':",
440
- '{"image":',
441
- "{'audio':",
442
- '{"audio":',
443
- "{'video':",
444
- '{"video":',
445
- "{'file':",
446
- '{"file":',
447
-
448
- # List patterns that typically contain file objects
449
- "[{'image'",
450
- '[{"image"',
451
- "[{'audio'",
452
- '[{"audio"',
453
- "[{'video'",
454
- '[{"video"',
455
- "[{'file'",
456
- '[{"file"'
 
 
 
 
 
 
 
 
 
 
 
457
  ]
458
 
459
- # If we find multiple indicators, it's likely a Gradio file structure
460
- indicator_count = sum(1 for indicator in gradio_indicators if indicator in text)
 
 
 
 
461
 
462
- # Also check for simple URL patterns (for audio case)
463
- is_simple_url = (text.strip().startswith('http') and
464
- len(text.strip().split()) == 1 and
465
- any(ext in text.lower() for ext in ['.wav', '.mp3', '.mp4', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webm', '.ogg']))
 
 
 
466
 
467
- result = indicator_count >= 2 or is_simple_url
468
- logger.debug(f"📋 File structure check: {indicator_count} indicators, simple_url: {is_simple_url}, result: {result}")
 
 
469
 
470
- return result
 
471
 
472
  def _get_native_system_prompt(self) -> str:
473
- """Get system prompt for Claude without MCP servers"""
474
- return f"""You are Claude Sonnet 4, a helpful AI assistant with native multimodal capabilities. You can have conversations, answer questions, help with various tasks, and provide information on a wide range of topics.
475
-
476
- YOUR NATIVE CAPABILITIES (Available right now):
477
- - **Image Understanding**: You can directly see and describe images, analyze their content, read text in images, identify objects, people, scenes, etc.
478
  - **Text Processing**: You can analyze, summarize, translate, and process text directly
479
  - **General Knowledge**: You can answer questions, explain concepts, and have conversations
480
  - **Code Analysis**: You can read, analyze, and explain code
481
-
 
482
  Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
483
-
484
- IMPORTANT: You DO NOT need MCP servers for:
485
- - Describing or analyzing uploaded images
486
- - Reading text in images
487
- - Identifying objects, people, or scenes in images
488
- - General conversation and knowledge questions
489
-
490
- You DO need MCP servers for:
491
- - Creating new images, audio, or video
492
- - Editing or transforming existing media files
493
- - Transcribing audio files
494
- - Processing non-image files (audio, video, documents)
495
-
496
- If users upload images and ask you to describe or analyze them, use your native vision capabilities immediately. Only mention MCP servers if they ask for creation or editing tasks."""
497
 
498
- def _get_mcp_system_prompt(self, user_files: List[str]) -> str:
499
- """Get system prompt for Claude with MCP servers"""
 
 
 
500
  uploaded_files_context = ""
501
- if user_files:
502
- uploaded_files_context = f"\n\nFILES UPLOADED BY USER:\n"
503
- for i, file_path in enumerate(user_files, 1):
504
- file_name = file_path.split('/')[-1] if '/' in file_path else file_path
505
- if AppConfig.is_image_file(file_path):
506
  file_type = "Image"
507
- elif AppConfig.is_audio_file(file_path):
508
  file_type = "Audio"
509
- elif AppConfig.is_video_file(file_path):
510
  file_type = "Video"
511
  else:
512
  file_type = "File"
513
- uploaded_files_context += f"{i}. {file_type}: {file_name} (path: {file_path})\n"
514
 
515
- return f"""You are Claude Sonnet 4, a helpful AI assistant with both native multimodal capabilities and access to various MCP tools.
516
-
517
- YOUR NATIVE CAPABILITIES (No MCP tools needed):
518
- - **Image Understanding**: You can directly see and describe images, analyze their content, read text in images, etc.
 
 
 
 
519
  - **Text Processing**: You can analyze, summarize, translate, and process text directly
520
  - **General Knowledge**: You can answer questions, explain concepts, and have conversations
521
  - **Code Analysis**: You can read, analyze, and explain code
522
-
 
 
 
 
523
  WHEN TO USE MCP TOOLS:
524
  - **Image Generation**: Creating new images from text prompts
525
  - **Image Editing**: Modifying, enhancing, or transforming existing images
526
  - **Audio Processing**: Transcribing audio, generating speech, audio enhancement
527
  - **Video Processing**: Creating or editing videos
 
528
  - **Specialized Analysis**: Tasks requiring specific models or APIs
529
-
530
- UPLOADED FILES HANDLING:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
  {uploaded_files_context}
532
-
533
- IMPORTANT - For uploaded images:
534
- - **Image Description/Analysis**: Use your NATIVE vision capabilities - you can see and describe images directly
535
- - **Image Editing/Enhancement**: Use MCP image processing tools
536
- - **Image Generation**: Use MCP image generation tools
537
-
538
- IMPORTANT - GRADIO MEDIA DISPLAY:
539
- When MCP tools return media, end your response with "MEDIA_GENERATED: [URL]" where [URL] is the actual media URL.
540
-
541
  Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
542
- Available MCP servers: {list(self.mcp_client.servers.keys())}"""
 
1
  """
2
+ Chat handling logic for Universal MCP Client - Fixed Version with File Upload Support
3
  """
4
  import re
5
  import logging
6
  import traceback
 
7
  from datetime import datetime
8
  from typing import Dict, Any, List, Tuple, Optional
9
  import gradio as gr
10
  from gradio import ChatMessage
11
+ from gradio_client import Client
12
  import time
13
+ import json
14
+ import httpx
15
 
16
  from config import AppConfig
17
  from mcp_client import UniversalMCPClient
 
19
  logger = logging.getLogger(__name__)
20
 
21
  class ChatHandler:
22
+ """Handles chat interactions with HF Inference Providers and MCP servers using ChatMessage dataclass"""
23
 
24
  def __init__(self, mcp_client: UniversalMCPClient):
25
  self.mcp_client = mcp_client
26
+ # Initialize the file uploader client for converting local files to public URLs
27
+ try:
28
+ self.uploader_client = Client("abidlabs/file-uploader")
29
+ logger.info("✅ File uploader client initialized")
30
+ except Exception as e:
31
+ logger.error(f"Failed to initialize file uploader: {e}")
32
+ self.uploader_client = None
33
 
34
+ def _upload_file_to_gradio_server(self, file_path: str) -> str:
35
+ """Upload a file to the Gradio server and get a public URL"""
36
+ if not self.uploader_client:
37
+ logger.error("File uploader client not initialized")
38
+ return file_path
39
 
40
+ try:
41
+ # Open the file in binary mode for the multipart upload request
42
+ with open(file_path, "rb") as f_:
43
+ files = [("files", (file_path.split("/")[-1], f_))]
44
+ r = httpx.post(
45
+ self.uploader_client.upload_url,
46
+ files=files,
47
+ )
48
+ r.raise_for_status()
49
+ result = r.json()
50
+ uploaded_path = result[0]
51
+ # Construct the full public URL
52
+ public_url = f"{self.uploader_client.src}/gradio_api/file={uploaded_path}"
53
+ logger.info(f"✅ Uploaded {file_path} -> {public_url}")
54
+ return public_url
55
+ except Exception as e:
56
+ logger.error(f"Failed to upload file {file_path}: {e}")
57
+ return file_path # Return original path as fallback
58
+
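A minimal usage sketch for the upload helper above (the handler instance, file path, and returned URL are hypothetical; assumes the uploader Space is reachable):

    handler = ChatHandler(mcp_client)
    public_url = handler._upload_file_to_gradio_server("/tmp/example.png")
    # e.g. "https://abidlabs-file-uploader.hf.space/gradio_api/file=/tmp/gradio/abc123/example.png"
    # On any failure the helper logs the error and falls back to returning the local path.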
59
+ def process_multimodal_message(self, message: Dict[str, Any], history: List) -> Tuple[List[ChatMessage], Dict[str, Any]]:
60
+ """Enhanced MCP chat function with multimodal input support and ChatMessage formatting"""
61
 
62
+ if not self.mcp_client.hf_client:
63
+ error_msg = "❌ HuggingFace token not configured. Please set HF_TOKEN environment variable or login."
64
+ history.append(ChatMessage(role="user", content=error_msg))
65
+ history.append(ChatMessage(role="assistant", content=error_msg))
66
+ return history, gr.MultimodalTextbox(value=None, interactive=False)
 
67
 
68
+ if not self.mcp_client.current_provider or not self.mcp_client.current_model:
69
+ error_msg = "❌ Please select an inference provider and model first."
70
  history.append(ChatMessage(role="user", content=error_msg))
71
  history.append(ChatMessage(role="assistant", content=error_msg))
72
  return history, gr.MultimodalTextbox(value=None, interactive=False)
 
74
  # Initialize variables for error handling
75
  user_text = ""
76
  user_files = []
77
+ uploaded_file_urls = [] # Store uploaded file URLs
78
+ self.file_url_mapping = {} # Map local file paths to their uploaded public URLs
79
+
80
  try:
81
  # Handle multimodal input - message is a dict with 'text' and 'files'
82
  user_text = message.get("text", "") if message else ""
 
87
  user_text = message
88
  user_files = []
89
 
90
+ logger.info(f"💬 Processing multimodal message:")
91
  logger.info(f" 📝 Text: {user_text}")
92
  logger.info(f" 📁 Files: {len(user_files)} files uploaded")
93
  logger.info(f" 📋 History type: {type(history)}, length: {len(history)}")
 
116
 
117
  history = converted_history
118
 
119
+ # Upload files and get public URLs
120
  for file_path in user_files:
121
+ logger.info(f" 📄 Local File: {file_path}")
122
+ try:
123
+ # Upload file to get public URL
124
+ uploaded_url = self._upload_file_to_gradio_server(file_path)
125
+ # Store the mapping
126
+ self.file_url_mapping[file_path] = uploaded_url
+ uploaded_file_urls.append(uploaded_url) # keep the URL list in sync for the system prompt
127
+ logger.info(f" ✅ Uploaded File URL: {uploaded_url}")
128
+
129
+ # Add to history with public URL
130
+ history.append(ChatMessage(role="user", content={"path": uploaded_url}))
131
+ except Exception as upload_error:
132
+ logger.error(f"Failed to upload file {file_path}: {upload_error}")
133
+ # Fallback to local path with warning
134
+ history.append(ChatMessage(role="user", content={"path": file_path}))
135
+ logger.warning(f"⚠️ Using local path for {file_path} - MCP servers may not be able to access it")
136
 
137
  # Add text message if provided
138
  if user_text and user_text.strip():
 
142
  if not user_text.strip() and not user_files:
143
  return history, gr.MultimodalTextbox(value=None, interactive=False)
144
 
145
+ # Create messages for HF Inference API
146
+ messages = self._prepare_hf_messages(history, uploaded_file_urls)
147
 
148
+ # Process the chat and get structured responses
149
+ response_messages = self._call_hf_api(messages, uploaded_file_urls)
 
 
 
150
 
151
  # Add all response messages to history
152
  history.extend(response_messages)
 
168
  history.append(ChatMessage(role="assistant", content=error_msg))
169
  return history, gr.MultimodalTextbox(value=None, interactive=False)
170
 
171
+ def _prepare_hf_messages(self, history: List, uploaded_file_urls: List[str] = None) -> List[Dict[str, Any]]:
172
+ """Convert history (ChatMessage or dict) to HuggingFace Inference API format"""
173
  messages = []
174
 
175
+ # Get optimal context settings for current model/provider
176
+ if self.mcp_client.current_model and self.mcp_client.current_provider:
177
+ context_settings = AppConfig.get_optimal_context_settings(
178
+ self.mcp_client.current_model,
179
+ self.mcp_client.current_provider,
180
+ len(self.mcp_client.get_enabled_servers())
181
+ )
182
+ max_history = context_settings['recommended_history_limit']
183
+ else:
184
+ max_history = 20 # Fallback
185
+
186
+ # Convert history to HF API format (text only for context)
187
+ recent_history = history[-max_history:] if len(history) > max_history else history
188
+
189
  for msg in recent_history:
190
  # Handle both ChatMessage objects and dictionary format for backward compatibility
191
  if hasattr(msg, 'role'): # ChatMessage object
 
203
  if isinstance(content, dict):
204
  if "path" in content:
205
  file_path = content.get('path', 'unknown')
206
+ # Check if it's a public URL or local path
207
+ if file_path.startswith('http'):
208
+ # It's already a public URL
209
+ if AppConfig.is_image_file(file_path):
210
+ content = f"[User uploaded an image: {file_path}]"
211
+ elif AppConfig.is_audio_file(file_path):
212
+ content = f"[User uploaded an audio file: {file_path}]"
213
+ elif AppConfig.is_video_file(file_path):
214
+ content = f"[User uploaded a video file: {file_path}]"
215
+ else:
216
+ content = f"[User uploaded a file: {file_path}]"
217
  else:
218
+ # Local path - mention it's not accessible to remote servers
219
+ content = f"[User uploaded a file (local path, not accessible to remote servers): {file_path}]"
220
  else:
221
  content = f"[Object: {str(content)[:50]}...]"
222
  elif isinstance(content, (list, tuple)):
 
233
 
234
  return messages
235
 
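For illustration only (values invented): _prepare_hf_messages reduces file entries to plain text placeholders, so the payload sent to the chat-completion endpoint stays text-only:

    messages = [
        {"role": "user", "content": "[User uploaded an image: https://example.hf.space/gradio_api/file=/tmp/cat.png]"},
        {"role": "user", "content": "What is in this picture?"},
        {"role": "assistant", "content": "It looks like a cat sitting on a sofa."},
    ]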
236
+ def _call_hf_api(self, messages: List[Dict[str, Any]], uploaded_file_urls: List[str] = None) -> List[ChatMessage]:
237
+ """Call HuggingFace Inference API and return structured ChatMessage responses"""
238
 
239
+ # Check if we have enabled MCP servers to use
240
+ enabled_servers = self.mcp_client.get_enabled_servers()
241
+ if not enabled_servers:
242
+ return self._call_hf_without_mcp(messages)
243
  else:
244
+ return self._call_hf_with_mcp(messages, uploaded_file_urls)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
+ def _call_hf_without_mcp(self, messages: List[Dict[str, Any]]) -> List[ChatMessage]:
247
+ """Call HF Inference API without MCP servers"""
248
+ logger.info("💬 No MCP servers available, using regular HF Inference chat")
249
 
250
  system_prompt = self._get_native_system_prompt()
251
 
252
+ # Add system prompt to messages
253
+ if messages and messages[0].get("role") == "system":
254
+ messages[0]["content"] = system_prompt + "\n\n" + messages[0]["content"]
255
+ else:
256
+ messages.insert(0, {"role": "system", "content": system_prompt})
 
 
 
 
 
 
 
257
 
258
+ # Get optimal token settings
259
+ if self.mcp_client.current_model and self.mcp_client.current_provider:
260
+ context_settings = AppConfig.get_optimal_context_settings(
261
+ self.mcp_client.current_model,
262
+ self.mcp_client.current_provider,
263
+ 0 # No MCP servers
264
+ )
265
+ max_tokens = context_settings['max_response_tokens']
266
+ else:
267
+ max_tokens = 8192
268
 
269
+ # Use HF Inference API
270
+ try:
271
+ response = self.mcp_client.generate_chat_completion(messages, max_tokens=max_tokens)
272
+ response_text = response.choices[0].message.content
273
+
274
+ if not response_text:
275
+ response_text = "I understand your request and I'm here to help."
276
+
277
+ return [ChatMessage(role="assistant", content=response_text)]
278
+ except Exception as e:
279
+ logger.error(f"HF Inference API call failed: {e}")
280
+ return [ChatMessage(role="assistant", content=f"❌ API call failed: {str(e)}")]
281
 
282
+ def _call_hf_with_mcp(self, messages: List[Dict[str, Any]], uploaded_file_urls: List[str] = None) -> List[ChatMessage]:
283
+ """Call HF Inference API with MCP servers and return structured responses"""
 
 
 
 
 
 
 
284
 
285
  # Enhanced system prompt with multimodal and MCP instructions
286
+ system_prompt = self._get_mcp_system_prompt(uploaded_file_urls)
287
+
288
+ # Add system prompt to messages
289
+ if messages and messages[0].get("role") == "system":
290
+ messages[0]["content"] = system_prompt + "\n\n" + messages[0]["content"]
291
+ else:
292
+ messages.insert(0, {"role": "system", "content": system_prompt})
293
+
294
+ # Get optimal token settings
295
+ enabled_servers = self.mcp_client.get_enabled_servers()
296
+ if self.mcp_client.current_model and self.mcp_client.current_provider:
297
+ context_settings = AppConfig.get_optimal_context_settings(
298
+ self.mcp_client.current_model,
299
+ self.mcp_client.current_provider,
300
+ len(enabled_servers)
301
+ )
302
+ max_tokens = context_settings['max_response_tokens']
303
+ else:
304
+ max_tokens = 8192
305
 
306
  # Debug logging
307
+ logger.info(f"📤 Sending {len(messages)} messages to HF Inference API")
308
+ logger.info(f"🔧 Using {len(self.mcp_client.servers)} MCP servers")
309
+ logger.info(f"🤖 Model: {self.mcp_client.current_model} via {self.mcp_client.current_provider}")
310
+ logger.info(f"📏 Max tokens: {max_tokens}")
311
 
312
  start_time = time.time()
313
 
314
+ try:
315
+ # Pass file mapping to MCP client
316
+ if hasattr(self, 'file_url_mapping'):
317
+ self.mcp_client.chat_handler_file_mapping = self.file_url_mapping
318
+
319
+ # Call HF Inference with MCP tool support - using optimal max_tokens
320
+ response = self.mcp_client.generate_chat_completion_with_mcp_tools(messages, max_tokens=max_tokens)
321
+
322
+ return self._process_hf_response(response, start_time)
323
+ except Exception as e:
324
+ logger.error(f"HF Inference API call with MCP failed: {e}")
325
+ return [ChatMessage(role="assistant", content=f"❌ API call failed: {str(e)}")]
326
 
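The response processing below reads a private _tool_execution attribute that generate_chat_completion_with_mcp_tools is expected to attach. Its exact shape is not shown in this diff; the following is an assumption inferred from how the fields are accessed:

    # Assumed (unconfirmed) structure attached by the MCP client:
    # response._tool_execution = {
    #     "tool": "Kokoro_TTS_mcp_test_generate_first",            # tool that was invoked
    #     "server": "text to speech",                                # MCP server name
    #     "success": True,                                           # False when the call failed
    #     "result": '[{"audio": {"url": "https://.../out.wav"}}]',   # raw tool output
    # }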
327
+ def _process_hf_response(self, response, start_time: float) -> List[ChatMessage]:
328
+ """Process HF Inference response with simplified media handling and nested errors"""
329
  chat_messages = []
330
+
331
+ try:
332
+ response_text = response.choices[0].message.content
333
 
334
+ if not response_text:
335
+ response_text = "I understand your request and I'm here to help."
336
+
337
+ # Check if this response includes tool execution info
338
+ if hasattr(response, '_tool_execution'):
339
+ tool_info = response._tool_execution
340
+ logger.info(f"🔧 Processing response with tool execution: {tool_info}")
341
 
342
+ duration = round(time.time() - start_time, 2)
343
+ tool_id = f"tool_{tool_info['tool']}_{int(time.time())}"
 
 
 
 
 
 
 
344
 
345
+ if tool_info['success']:
346
+ tool_result = str(tool_info['result'])
 
 
347
 
348
+ # Extract media URL if present
349
+ media_url = self._extract_media_url(tool_result, tool_info.get('server', ''))
350
 
351
+ # Create tool usage metadata message
352
+ chat_messages.append(ChatMessage(
353
+ role="assistant",
354
+ content="",
355
+ metadata={
356
+ "title": f"🔧 Used {tool_info['tool']}",
357
+ "status": "done",
358
+ "duration": duration,
359
+ "id": tool_id
360
+ }
361
+ ))
362
+
363
+ # Add nested success message with the raw result
364
+ if media_url:
365
+ result_preview = f"✅ Successfully generated media\nURL: {media_url[:100]}..."
366
  else:
367
+ result_preview = f"✅ Tool executed successfully\nResult: {tool_result[:200]}..."
 
 
 
 
 
 
368
 
 
369
  chat_messages.append(ChatMessage(
370
  role="assistant",
371
+ content=result_preview,
372
  metadata={
373
+ "title": "📊 Server Response",
374
+ "parent_id": tool_id,
375
  "status": "done"
376
  }
377
  ))
378
 
379
+ # Add LLM's descriptive text if present (before media)
380
+ if response_text and not response_text.startswith('{"use_tool"'):
381
+ # Clean the response text by removing URLs and tool JSON
382
+ clean_response = response_text
383
+ if media_url and media_url in clean_response:
384
+ clean_response = clean_response.replace(media_url, "").strip()
385
+
386
+ # Remove any remaining JSON tool call patterns
387
+ clean_response = re.sub(r'\{"use_tool"[^}]+\}', '', clean_response).strip()
388
+
389
+ # Remove all markdown link/image syntax completely
390
+ clean_response = re.sub(r'!\[([^\]]*)\]\([^)]*\)', '', clean_response) # Remove image markdown
391
+ clean_response = re.sub(r'\[([^\]]*)\]\([^)]*\)', '', clean_response) # Remove link markdown
392
+ clean_response = re.sub(r'!\[([^\]]*)\]', '', clean_response) # Remove broken image refs
393
+ clean_response = re.sub(r'\[([^\]]*)\]', '', clean_response) # Remove broken link refs
394
+ clean_response = re.sub(r'\(\s*\)', '', clean_response) # Remove empty parentheses
395
+ clean_response = clean_response.strip() # Final strip
396
+
397
+ # Only add if there's meaningful text left after cleaning
398
+ if clean_response and len(clean_response) > 10:
399
+ chat_messages.append(ChatMessage(
400
+ role="assistant",
401
+ content=clean_response
402
+ ))
403
+ # Handle media content if present
404
+ if media_url:
405
+ # Add media as a separate message - Gradio will auto-detect type
406
  chat_messages.append(ChatMessage(
407
+ role="assistant",
408
  content={"path": media_url}
409
  ))
410
  else:
411
+ # No media URL found, check if we need to show non-media result
412
+ if not response_text or response_text.startswith('{"use_tool"'):
413
+ # Only show result if there wasn't descriptive text from LLM
414
+ if len(tool_result) > 500:
415
+ result_preview = f"Operation completed successfully. Result preview: {tool_result[:500]}..."
416
+ else:
417
+ result_preview = f"Operation completed successfully. Result: {tool_result}"
418
+
419
+ chat_messages.append(ChatMessage(
420
+ role="assistant",
421
+ content=result_preview
422
+ ))
423
+
424
  else:
425
+ # Tool execution failed
426
+ error_details = tool_info['result']
427
+
428
+ # Create main tool message with error status
429
  chat_messages.append(ChatMessage(
430
  role="assistant",
431
+ content="",
432
  metadata={
433
+ "title": f"❌ Used {tool_info['tool']}",
434
+ "status": "error",
435
+ "duration": duration,
436
+ "id": tool_id
437
  }
438
  ))
439
+
440
+ # Add nested error response from server
441
+ chat_messages.append(ChatMessage(
442
+ role="assistant",
443
+ content=f"❌ Tool execution failed\n```\n{error_details}\n```",
444
+ metadata={
445
+ "title": "📊 Server Response",
446
+ "parent_id": tool_id,
447
+ "status": "error"
448
+ }
449
+ ))
450
+
451
+ # Add suggestions as another nested message
452
+ chat_messages.append(ChatMessage(
453
+ role="assistant",
454
+ content="**Suggestions:**\n• Try modifying your request slightly\n• Wait a moment and try again\n• Use a different MCP server if available",
455
+ metadata={
456
+ "title": "💡 Possible Solutions",
457
+ "parent_id": tool_id,
458
+ "status": "info"
459
+ }
460
+ ))
461
+ else:
462
+ # No tool usage, just return the response
463
  chat_messages.append(ChatMessage(
464
  role="assistant",
465
+ content=response_text
466
  ))
467
+
468
+ except Exception as e:
469
+ logger.error(f"Error processing HF response: {e}")
470
+ logger.error(traceback.format_exc())
471
  chat_messages.append(ChatMessage(
472
  role="assistant",
473
  content="I understand your request and I'm here to help."
 
475
 
476
  return chat_messages
477
 
478
+ def _extract_media_url(self, result_text: str, server_name: str) -> Optional[str]:
479
+ """Extract media URL from MCP response with improved pattern matching"""
480
+ if not isinstance(result_text, str):
481
+ return None
482
+
483
+ logger.info(f"🔍 Extracting media from result: {result_text[:500]}...")
484
+
485
+ # Try JSON parsing first
486
+ try:
487
+ if result_text.strip().startswith('[') or result_text.strip().startswith('{'):
488
+ data = json.loads(result_text.strip())
489
+
490
+ # Handle array format
491
+ if isinstance(data, list) and len(data) > 0:
492
+ item = data[0]
493
+ if isinstance(item, dict):
494
+ # Check for nested media structure
495
+ for media_type in ['audio', 'video', 'image']:
496
+ if media_type in item and isinstance(item[media_type], dict):
497
+ if 'url' in item[media_type]:
498
+ url = item[media_type]['url'].strip('\'"')
499
+ logger.info(f"🎯 Found {media_type} URL in JSON: {url}")
500
+ return url
501
+ # Check for direct URL
502
+ if 'url' in item:
503
+ url = item['url'].strip('\'"')
504
+ logger.info(f"🎯 Found direct URL in JSON: {url}")
505
+ return url
506
+
507
+ # Handle object format
508
+ elif isinstance(data, dict):
509
+ # Check for nested media structure
510
+ for media_type in ['audio', 'video', 'image']:
511
+ if media_type in data and isinstance(data[media_type], dict):
512
+ if 'url' in data[media_type]:
513
+ url = data[media_type]['url'].strip('\'"')
514
+ logger.info(f"🎯 Found {media_type} URL in JSON: {url}")
515
+ return url
516
+ # Check for direct URL
517
+ if 'url' in data:
518
+ url = data['url'].strip('\'"')
519
+ logger.info(f"🎯 Found direct URL in JSON: {url}")
520
+ return url
521
+
522
+ except json.JSONDecodeError:
523
+ pass
524
+
525
+ # Check for Gradio file URLs (common pattern)
526
+ gradio_patterns = [
527
+ r'https://[^/]+\.hf\.space/gradio_api/file=/[^/]+/[^/]+/[^\s"\'<>,]+',
528
+ r'https://[^/]+\.hf\.space/file=[^\s"\'<>,]+',
529
+ r'/gradio_api/file=/[^\s"\'<>,]+'
530
  ]
531
 
532
+ for pattern in gradio_patterns:
533
+ match = re.search(pattern, result_text)
534
+ if match:
535
+ url = match.group(0).rstrip('\'",:;')
536
+ logger.info(f"🎯 Found Gradio file URL: {url}")
537
+ return url
538
 
539
+ # Check for any HTTP URLs with media extensions
540
+ url_pattern = r'https?://[^\s"\'<>]+\.(?:mp3|wav|ogg|m4a|flac|aac|opus|wma|mp4|webm|avi|mov|mkv|m4v|wmv|png|jpg|jpeg|gif|webp|bmp|svg)'
541
+ match = re.search(url_pattern, result_text, re.IGNORECASE)
542
+ if match:
543
+ url = match.group(0)
544
+ logger.info(f"🎯 Found media URL by extension: {url}")
545
+ return url
546
 
547
+ # Check for data URLs
548
+ if result_text.startswith('data:'):
549
+ logger.info("🎯 Found data URL")
550
+ return result_text
551
 
552
+ logger.info("❌ No media URL found in result")
553
+ return None
554
 
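Two illustrative calls for the extractor above (URLs are invented; note that the server_name argument is accepted but not used by this implementation):

    handler._extract_media_url('[{"audio": {"url": "https://demo.hf.space/gradio_api/file=/tmp/a/b/out.wav"}}]', "text to speech")
    # -> "https://demo.hf.space/gradio_api/file=/tmp/a/b/out.wav"  (nested JSON media object)

    handler._extract_media_url("Here you go: https://cdn.example.com/clip.mp4", "video gen")
    # -> "https://cdn.example.com/clip.mp4"  (matched by the media-extension URL pattern)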
555
  def _get_native_system_prompt(self) -> str:
556
+ """Get system prompt for HF Inference without MCP servers"""
557
+ model_info = AppConfig.AVAILABLE_MODELS.get(self.mcp_client.current_model, {})
558
+ context_length = model_info.get("context_length", 128000)
559
+
560
+ return f"""You are an AI assistant powered by {self.mcp_client.current_model} via {self.mcp_client.current_provider}. You have native capabilities for:
561
  - **Text Processing**: You can analyze, summarize, translate, and process text directly
562
  - **General Knowledge**: You can answer questions, explain concepts, and have conversations
563
  - **Code Analysis**: You can read, analyze, and explain code
564
+ - **Reasoning**: You can perform step-by-step reasoning and problem-solving
565
+ - **Context Window**: You have access to {context_length:,} tokens of context
566
  Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
567
+ Please provide helpful, accurate, and engaging responses to user queries."""
 
 
 
 
 
 
 
 
 
 
 
 
 
568
 
569
+ def _get_mcp_system_prompt(self, uploaded_file_urls: List[str] = None) -> str:
570
+ """Get enhanced system prompt for HF Inference with MCP servers"""
571
+ model_info = AppConfig.AVAILABLE_MODELS.get(self.mcp_client.current_model, {})
572
+ context_length = model_info.get("context_length", 128000)
573
+
574
  uploaded_files_context = ""
575
+ if uploaded_file_urls:
576
+ uploaded_files_context = f"\n\nFILES UPLOADED BY USER (Public URLs accessible to MCP servers):\n"
577
+ for i, file_url in enumerate(uploaded_file_urls, 1):
578
+ file_name = file_url.split('/')[-1] if '/' in file_url else file_url
579
+ if AppConfig.is_image_file(file_url):
580
  file_type = "Image"
581
+ elif AppConfig.is_audio_file(file_url):
582
  file_type = "Audio"
583
+ elif AppConfig.is_video_file(file_url):
584
  file_type = "Video"
585
  else:
586
  file_type = "File"
587
+ uploaded_files_context += f"{i}. {file_type}: {file_name}\n URL: {file_url}\n"
588
 
589
+ # Get available tools with correct names from enabled servers only
590
+ enabled_servers = self.mcp_client.get_enabled_servers()
591
+ tools_info = []
592
+ for server_name, config in enabled_servers.items():
593
+ tools_info.append(f"- **{server_name}**: {config.description}")
594
+
595
+ return f"""You are an AI assistant powered by {self.mcp_client.current_model} via {self.mcp_client.current_provider}, with access to various MCP tools.
596
+ YOUR NATIVE CAPABILITIES:
597
  - **Text Processing**: You can analyze, summarize, translate, and process text directly
598
  - **General Knowledge**: You can answer questions, explain concepts, and have conversations
599
  - **Code Analysis**: You can read, analyze, and explain code
600
+ - **Reasoning**: You can perform step-by-step reasoning and problem-solving
601
+ - **Context Window**: You have access to {context_length:,} tokens of context
602
+ AVAILABLE MCP TOOLS:
603
+ You have access to the following MCP servers:
604
+ {chr(10).join(tools_info)}
605
  WHEN TO USE MCP TOOLS:
606
  - **Image Generation**: Creating new images from text prompts
607
  - **Image Editing**: Modifying, enhancing, or transforming existing images
608
  - **Audio Processing**: Transcribing audio, generating speech, audio enhancement
609
  - **Video Processing**: Creating or editing videos
610
+ - **Text to Speech**: Converting text to audio
611
  - **Specialized Analysis**: Tasks requiring specific models or APIs
612
+ TOOL USAGE FORMAT:
613
+ When you need to use an MCP tool, respond with JSON in this exact format:
614
+ {{"use_tool": true, "server": "exact_server_name", "tool": "exact_tool_name", "arguments": {{"param": "value"}}}}
615
+ IMPORTANT: Always describe what you're going to do BEFORE the JSON tool call. For example:
616
+ "I'll generate speech for your text using the TTS tool."
617
+ {{"use_tool": true, "server": "text to speech", "tool": "Kokoro_TTS_mcp_test_generate_first", "arguments": {{"text": "hello"}}}}
618
+ IMPORTANT TOOL NAME MAPPING:
619
+ - For TTS server: use tool name "Kokoro_TTS_mcp_test_generate_first"
620
+ - For image generation: use tool name "dalle_3_xl_lora_v2_generate"
621
+ - For video generation: use tool name "ysharma_ltx_video_distilledtext_to_video"
622
+ - For letter counting: use tool name "gradio_app_dummy1_letter_counter"
623
+ EXACT SERVER NAMES TO USE:
624
+ {', '.join([f'"{name}"' for name in enabled_servers.keys()])}
625
+ FILE HANDLING FOR MCP TOOLS:
626
+ When using MCP tools with uploaded files, always use the public URLs provided above.
627
+ These URLs are accessible to remote MCP servers.
628
  {uploaded_files_context}
629
+ MEDIA HANDLING:
630
+ When tool results contain media URLs (images, audio, videos), the system will automatically embed them as playable media.
631
+ IMPORTANT NOTES:
632
+ - Always use the EXACT server names and tool names as specified above
633
+ - Use proper JSON format for tool calls
634
+ - Include all required parameters in arguments
635
+ - For file inputs to MCP tools, use the public URLs provided, not local paths
636
+ - ALWAYS provide a descriptive message before the JSON tool call
637
+ - After tool execution, you can provide additional context or ask if the user needs anything else
638
  Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
639
+ Current model: {self.mcp_client.current_model} via {self.mcp_client.current_provider}"""
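As a rough sketch of the contract this prompt establishes: a compliant model reply carries a single JSON directive after its descriptive text, which the MCP client (not part of this diff) is expected to detect and execute. A minimal, assumption-laden way to pick it out:

    import json
    import re

    # Hypothetical model reply that follows the TOOL USAGE FORMAT above
    reply = ('I\'ll generate speech for your text using the TTS tool. '
             '{"use_tool": true, "server": "text to speech", '
             '"tool": "Kokoro_TTS_mcp_test_generate_first", "arguments": {"text": "hello"}}')

    match = re.search(r'\{"use_tool".*\}', reply, re.DOTALL)  # grab the JSON directive
    tool_call = json.loads(match.group(0)) if match else None
    # tool_call["server"] -> "text to speech"; tool_call["arguments"] -> {"text": "hello"}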