prithivMLmods committed
Commit 7c9f5e8 · verified · 1 Parent(s): 40825af

Update app.py

Files changed (1)
  1. app.py +287 -154
app.py CHANGED
@@ -17,7 +17,7 @@ import numpy as np
17
  from PIL import Image
18
  import edge_tts
19
  import trimesh
20
- import soundfile as sf # Added for audio processing with Phi-4
21
 
22
  import supervision as sv
23
  from ultralytics import YOLO as YOLODetector
@@ -46,6 +46,10 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
46
  return seed
47
 
48
  def glb_to_data_url(glb_path: str) -> str:
49
  with open(glb_path, "rb") as f:
50
  data = f.read()
51
  b64_data = base64.b64encode(data).decode("utf-8")
@@ -58,6 +62,7 @@ class Model:
58
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
59
  self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
60
  self.pipe.to(self.device)
 
61
  if torch.cuda.is_available():
62
  try:
63
  self.pipe.text_encoder = self.pipe.text_encoder.half()
@@ -66,6 +71,7 @@ class Model:
66
 
67
  self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
68
  self.pipe_img.to(self.device)
 
69
  if torch.cuda.is_available():
70
  text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
71
  if text_encoder_img is not None:
@@ -73,6 +79,7 @@ class Model:
73
 
74
  def to_glb(self, ply_path: str) -> str:
75
  mesh = trimesh.load(ply_path)
 
76
  rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
77
  mesh.apply_transform(rot)
78
  rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
@@ -107,7 +114,7 @@ class Model:
107
  export_to_ply(images[0], ply_path.name)
108
  return self.to_glb(ply_path.name)
109
 
110
- # Web Tools using DuckDuckGo and smolagents
111
 
112
  from typing import Any, Optional
113
  from smolagents.tools import Tool
@@ -115,20 +122,25 @@ import duckduckgo_search
115
 
116
  class DuckDuckGoSearchTool(Tool):
117
  name = "web_search"
118
- description = "Performs a duckduckgo web search and returns the top results."
119
- inputs = {'query': {'type': 'string', 'description': 'The search query.'}}
120
  output_type = "string"
121
 
122
  def __init__(self, max_results=10, **kwargs):
123
  super().__init__()
124
  self.max_results = max_results
125
- from duckduckgo_search import DDGS
126
  self.ddgs = DDGS(**kwargs)
127
 
128
  def forward(self, query: str) -> str:
129
  results = self.ddgs.text(query, max_results=self.max_results)
130
  if len(results) == 0:
131
- raise Exception("No results found! Try a less restrictive query.")
132
  postprocessed_results = [
133
  f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
134
  ]
@@ -136,28 +148,44 @@ class DuckDuckGoSearchTool(Tool):
136
 
137
  class VisitWebpageTool(Tool):
138
  name = "visit_webpage"
139
- description = "Visits a webpage and returns its content as markdown."
140
- inputs = {'url': {'type': 'string', 'description': 'The URL to visit.'}}
141
  output_type = "string"
142
 
143
  def __init__(self, *args, **kwargs):
144
  self.is_initialized = False
145
 
146
  def forward(self, url: str) -> str:
147
- import requests
148
- from markdownify import markdownify
149
- from smolagents.utils import truncate_content
150
  try:
151
  response = requests.get(url, timeout=20)
152
- response.raise_for_status()
153
  markdown_content = markdownify(response.text).strip()
154
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
 
155
  return truncate_content(markdown_content, 10000)
156
- except requests.exceptions.Timeout:
157
- return "The request timed out."
158
- except requests.exceptions.RequestException as e:
159
- return f"Error fetching webpage: {str(e)}"
160
 
161
  # rAgent Reasoning using Llama mode OpenAI
162
 
163
  from openai import OpenAI
@@ -169,15 +197,22 @@ ragent_client = OpenAI(
169
  )
170
 
171
  SYSTEM_PROMPT = """
172
- "You are an expert assistant who solves tasks using Python code. Follow these steps:
173
- 1. **Thought**: Explain your reasoning and plan.
174
- 2. **Code**: Write Python code to implement your solution.
175
- 3. **Observation**: Analyze the output and summarize results.
176
- 4. **Final Answer**: Provide a concise conclusion."
177
  """
178
 
179
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
180
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
 
181
  for msg in history:
182
  if msg.get("role") == "user":
183
  messages.append({"role": "user", "content": msg["content"]})
@@ -186,23 +221,76 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
186
  messages.append({"role": "user", "content": prompt})
187
  response = ""
188
  stream = ragent_client.chat.completions.create(
189
- model="meta-llama/Meta-Llama-3.1-8B-Instruct",
190
- max_tokens=max_tokens,
191
- stream=True,
192
- temperature=temperature,
193
- top_p=top_p,
194
- messages=messages,
195
  )
196
  for message in stream:
197
- token = message.choices[0].delta.content
198
- response += token
199
- yield response
200
 
201
- # Load Models
202
 
203
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
204
 
205
- # Text-only model
206
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
207
  tokenizer = AutoTokenizer.from_pretrained(model_id)
208
  model = AutoModelForCausalLM.from_pretrained(
@@ -212,8 +300,14 @@ model = AutoModelForCausalLM.from_pretrained(
212
  )
213
  model.eval()
214
 
215
- # Multimodal model (Qwen2-VL)
216
- MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
217
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
218
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
219
  MODEL_ID,
@@ -221,58 +315,55 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
221
  torch_dtype=torch.float16
222
  ).to("cuda").eval()
223
 
224
- # Phi-4 Multimodal Model
225
- phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
226
- phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
227
- phi4_model = AutoModelForCausalLM.from_pretrained(
228
- phi4_model_path,
229
- device_map="auto",
230
- torch_dtype="auto",
231
- trust_remote_code=True,
232
- _attn_implementation="eager",
233
- )
234
- phi4_model.eval()
235
-
236
- # Stable Diffusion XL Pipeline
237
- MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
238
- sd_pipe = StableDiffusionXLPipeline.from_pretrained(
239
- MODEL_ID_SD,
240
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
241
- use_safetensors=True,
242
- add_watermarker=False,
243
- ).to(device)
244
- sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
245
- if torch.cuda.is_available():
246
- sd_pipe.text_encoder = sd_pipe.text_encoder.half()
247
-
248
- # YOLO Object Detection
249
- YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
250
- YOLO_CHECKPOINT_NAME = "images/demo.pt"
251
- yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
252
- yolo_detector = YOLODetector(yolo_model_path)
253
-
254
- # TTS Voices
255
- TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]
256
-
257
- MAX_MAX_NEW_TOKENS = 2048
258
- DEFAULT_MAX_NEW_TOKENS = 1024
259
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
260
-
261
- # Utility Functions
262
 
263
  async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
 
264
  communicate = edge_tts.Communicate(text, voice)
265
  await communicate.save(output_file)
266
  return output_file
267
 
268
  def clean_chat_history(chat_history):
269
  cleaned = []
270
  for msg in chat_history:
271
  if isinstance(msg, dict) and isinstance(msg.get("content"), str):
272
  cleaned.append(msg)
273
  return cleaned
274
 
275
  def save_image(img: Image.Image) -> str:
 
276
  unique_name = str(uuid.uuid4()) + ".png"
277
  img.save(unique_name)
278
  return unique_name
@@ -292,8 +383,10 @@ def generate_image_fn(
292
  num_images: int = 1,
293
  progress=gr.Progress(track_tqdm=True),
294
  ):
 
295
  seed = int(randomize_seed_fn(seed, randomize_seed))
296
  generator = torch.Generator(device=device).manual_seed(seed)
 
297
  options = {
298
  "prompt": [prompt] * num_images,
299
  "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
@@ -306,12 +399,14 @@ def generate_image_fn(
306
  }
307
  if use_resolution_binning:
308
  options["use_resolution_binning"] = True
 
309
  images = []
310
- for i in range(0, num_images, 1): # Simplified batching
 
311
  batch_options = options.copy()
312
- batch_options["prompt"] = options["prompt"][i:i+1]
313
- if "negative_prompt" in batch_options and batch_options["negative_prompt"]:
314
- batch_options["negative_prompt"] = options["negative_prompt"][i:i+1]
315
  if device.type == "cuda":
316
  with torch.autocast("cuda", dtype=torch.float16):
317
  outputs = sd_pipe(**batch_options)
@@ -321,6 +416,8 @@ def generate_image_fn(
321
  image_paths = [save_image(img) for img in images]
322
  return image_paths, seed
323
 
324
  @spaces.GPU(duration=120, enable_queue=True)
325
  def generate_3d_fn(
326
  prompt: str,
@@ -329,22 +426,36 @@ def generate_3d_fn(
329
  num_steps: int = 64,
330
  randomize_seed: bool = False,
331
  ):
  seed = int(randomize_seed_fn(seed, randomize_seed))
333
  model3d = Model()
334
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
335
  return glb_path, seed
336
 
337
  def detect_objects(image: np.ndarray):
 
338
  results = yolo_detector(image, verbose=False)[0]
339
  detections = sv.Detections.from_ultralytics(results).with_nms()
 
340
  box_annotator = sv.BoxAnnotator()
341
  label_annotator = sv.LabelAnnotator()
 
342
  annotated_image = image.copy()
343
  annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
344
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
 
345
  return Image.fromarray(annotated_image)
346
 
347
- # Chat Generation Function with @phi4 Added
348
 
349
  @spaces.GPU
350
  def generate(
@@ -356,13 +467,23 @@ def generate(
356
  top_k: int = 50,
357
  repetition_penalty: float = 1.2,
358
  ):
359
  text = input_dict["text"]
360
  files = input_dict.get("files", [])
361
 
362
- # --- 3D Generation ---
363
  if text.strip().lower().startswith("@3d"):
364
  prompt = text[len("@3d"):].strip()
365
- yield "🌀 Generating 3D mesh GLB file..."
366
  glb_path, used_seed = generate_3d_fn(
367
  prompt=prompt,
368
  seed=1,
@@ -370,31 +491,41 @@ def generate(
370
  num_steps=64,
371
  randomize_seed=True,
372
  )
 
373
  static_folder = os.path.join(os.getcwd(), "static")
374
  if not os.path.exists(static_folder):
375
  os.makedirs(static_folder)
376
  new_filename = f"mesh_{uuid.uuid4()}.glb"
377
  new_filepath = os.path.join(static_folder, new_filename)
378
  shutil.copy(glb_path, new_filepath)
 
379
  yield gr.File(new_filepath)
380
  return
381
 
382
- # --- Image Generation ---
383
  if text.strip().lower().startswith("@image"):
384
  prompt = text[len("@image"):].strip()
385
  yield "🪧 Generating image..."
386
  image_paths, used_seed = generate_image_fn(
387
  prompt=prompt,
388
  seed=1,
389
  randomize_seed=True,
 
390
  num_images=1,
391
  )
392
  yield gr.Image(image_paths[0])
393
  return
394
 
395
- # --- Web Search/Visit ---
396
  if text.strip().lower().startswith("@web"):
397
  web_command = text[len("@web"):].strip()
 
398
  if web_command.lower().startswith("visit"):
399
  url = web_command[len("visit"):].strip()
400
  yield "🌍 Visiting webpage..."
@@ -402,30 +533,36 @@ def generate(
402
  content = visitor.forward(url)
403
  yield content
404
  else:
 
405
  query = web_command
406
- yield "🧤 Performing web search..."
407
  searcher = DuckDuckGoSearchTool()
408
  results = searcher.forward(query)
409
  yield results
410
  return
411
 
412
- # --- rAgent Reasoning ---
413
  if text.strip().lower().startswith("@ragent"):
414
  prompt = text[len("@ragent"):].strip()
415
- yield "📝 Initiating reasoning chain..."
 
416
  for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
417
  yield partial
418
  return
419
 
420
- # --- YOLO Object Detection ---
421
  if text.strip().lower().startswith("@yolo"):
422
- yield "🔍 Running object detection..."
423
  if not files or len(files) == 0:
424
- yield "Error: Please attach an image for YOLO."
425
  return
 
426
  input_file = files[0]
427
  try:
428
- pil_image = Image.open(input_file)
429
  except Exception as e:
430
  yield f"Error loading image: {str(e)}"
431
  return
@@ -434,63 +571,64 @@ def generate(
434
  yield gr.Image(result_img)
435
  return
436
 
437
- # --- Phi-4 Multimodal Branch ---
438
  if text.strip().lower().startswith("@phi4"):
439
- parts = text[len("@phi4"):].strip().split(maxsplit=1)
440
- if len(parts) < 2:
441
- yield "Error: Specify input type and question, e.g., '@phi4 image What is this?'"
442
  return
443
- input_type = parts[0].lower()
444
- question = parts[1]
445
-
446
- if input_type not in ["image", "audio"]:
447
- yield "Error: Input type must be 'image' or 'audio'."
448
  return
449
-
450
- if not files or len(files) == 0:
451
- yield "Error: Please attach a file for Phi-4 processing."
452
  return
453
 
454
- if len(files) > 1:
455
- yield "Warning: Multiple files attached. Using the first one."
456
-
457
- file_input = files[0]
458
-
459
- try:
460
- if input_type == "image":
461
- prompt = f'<|user|><|image_1|>{question}<|end|><|assistant|>'
462
- image = Image.open(file_input)
463
- inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
464
- elif input_type == "audio":
465
- prompt = f'<|user|><|audio_1|>{question}<|end|><|assistant|>'
466
- audio, samplerate = sf.read(file_input)
467
- inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
468
-
469
- streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
470
- generation_kwargs = {
471
  **inputs,
472
- "streamer": streamer,
473
- "max_new_tokens": max_new_tokens,
474
- }
475
- thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
476
- thread.start()
477
-
478
- buffer = ""
479
- yield "🤔 Thinking..."
480
- for new_text in streamer:
481
- buffer += new_text
482
- buffer = buffer.replace("<|im_end|>", "")
483
- time.sleep(0.01)
484
- yield buffer
485
- except Exception as e:
486
- yield f"Error processing file: {str(e)}"
487
  return
488
 
489
- # --- Text and TTS Branch ---
490
  tts_prefix = "@tts"
491
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
492
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
493
-
494
  if is_tts and voice_index:
495
  voice = TTS_VOICES[voice_index - 1]
496
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -502,7 +640,12 @@ def generate(
502
  conversation.append({"role": "user", "content": text})
503
 
504
  if files:
505
- images = [load_image(image) for image in files]
506
  messages = [{
507
  "role": "user",
508
  "content": [
@@ -528,7 +671,7 @@ def generate(
528
  input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
529
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
530
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
531
- gr.Warning(f"Trimmed input to {MAX_INPUT_TOKEN_LENGTH} tokens.")
532
  input_ids = input_ids.to(model.device)
533
  streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
534
  generation_kwargs = {
@@ -557,24 +700,14 @@ def generate(
557
  output_file = asyncio.run(text_to_speech(final_response, voice))
558
  yield gr.Audio(output_file, autoplay=True)
559
 
560
- # Gradio Interface
561
-
562
- DESCRIPTION = """
563
- # Agent Dino 🌠
564
- Multimodal chatbot with text, image, audio, 3D generation, web search, reasoning, and object detection.
565
- """
566
-
567
- css = '''
568
- h1 { text-align: center; }
569
- #duplicate-button { margin: auto; color: #fff; background: #1565c0; border-radius: 100vh; }
570
- '''
571
 
572
  demo = gr.ChatInterface(
573
  fn=generate,
574
  additional_inputs=[
575
  gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
576
  gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
577
- gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
578
  gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
579
  gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
580
  ],
@@ -585,10 +718,9 @@ demo = gr.ChatInterface(
585
  [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
586
  [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
587
  ["@rAgent Explain how a binary search algorithm works."],
588
- ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning?"],
589
  ["@tts1 Explain Tower of Hanoi"],
590
- [{"text": "@phi4 image What is shown in this image?", "files": ["examples/image.jpg"]}],
591
- [{"text": "@phi4 audio Transcribe this audio.", "files": ["examples/audio.wav"]}],
592
  ],
593
  cache_examples=False,
594
  type="messages",
@@ -596,15 +728,16 @@ demo = gr.ChatInterface(
596
  css=css,
597
  fill_height=True,
598
  textbox=gr.MultimodalTextbox(
599
- label="Query Input",
600
  file_types=["image", "audio"],
601
- file_count="multiple",
602
- placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @phi4-multimodal, default-{text gen}{image-text-text}",
603
  ),
604
  stop_btn="Stop Generation",
605
  multimodal=True,
606
  )
607
608
  if not os.path.exists("static"):
609
  os.makedirs("static")
610
 
 
17
  from PIL import Image
18
  import edge_tts
19
  import trimesh
20
+ import soundfile as sf # New import for audio file reading
21
 
22
  import supervision as sv
23
  from ultralytics import YOLO as YOLODetector
 
46
  return seed
47
 
48
  def glb_to_data_url(glb_path: str) -> str:
49
+ """
50
+ Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
51
+ (Not used in this method.)
52
+ """
53
  with open(glb_path, "rb") as f:
54
  data = f.read()
55
  b64_data = base64.b64encode(data).decode("utf-8")
 
62
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
63
  self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
64
  self.pipe.to(self.device)
65
+ # Ensure the text encoder is in half precision to avoid dtype mismatches.
66
  if torch.cuda.is_available():
67
  try:
68
  self.pipe.text_encoder = self.pipe.text_encoder.half()
 
71
 
72
  self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
73
  self.pipe_img.to(self.device)
74
+ # Use getattr with a default value to avoid AttributeError if text_encoder is missing.
75
  if torch.cuda.is_available():
76
  text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
77
  if text_encoder_img is not None:
 
79
 
80
  def to_glb(self, ply_path: str) -> str:
81
  mesh = trimesh.load(ply_path)
82
+ # Rotate the mesh for proper orientation
83
  rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
84
  mesh.apply_transform(rot)
85
  rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
 
114
  export_to_ply(images[0], ply_path.name)
115
  return self.to_glb(ply_path.name)
116
 
117
+ # New Tools for Web Functionality using DuckDuckGo and smolagents
118
 
119
  from typing import Any, Optional
120
  from smolagents.tools import Tool
 
122
 
123
  class DuckDuckGoSearchTool(Tool):
124
  name = "web_search"
125
+ description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
126
+ inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
127
  output_type = "string"
128
 
129
  def __init__(self, max_results=10, **kwargs):
130
  super().__init__()
131
  self.max_results = max_results
132
+ try:
133
+ from duckduckgo_search import DDGS
134
+ except ImportError as e:
135
+ raise ImportError(
136
+ "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
137
+ ) from e
138
  self.ddgs = DDGS(**kwargs)
139
 
140
  def forward(self, query: str) -> str:
141
  results = self.ddgs.text(query, max_results=self.max_results)
142
  if len(results) == 0:
143
+ raise Exception("No results found! Try a less restrictive/shorter query.")
144
  postprocessed_results = [
145
  f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
146
  ]
 
148
 
149
  class VisitWebpageTool(Tool):
150
  name = "visit_webpage"
151
+ description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
152
+ inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
153
  output_type = "string"
154
 
155
  def __init__(self, *args, **kwargs):
156
  self.is_initialized = False
157
 
158
  def forward(self, url: str) -> str:
159
  try:
160
+ import requests
161
+ from markdownify import markdownify
162
+ from requests.exceptions import RequestException
163
+
164
+ from smolagents.utils import truncate_content
165
+ except ImportError as e:
166
+ raise ImportError(
167
+ "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
168
+ ) from e
169
+ try:
170
+ # Send a GET request to the URL with a 20-second timeout
171
  response = requests.get(url, timeout=20)
172
+ response.raise_for_status() # Raise an exception for bad status codes
173
+
174
+ # Convert the HTML content to Markdown
175
  markdown_content = markdownify(response.text).strip()
176
+
177
+ # Remove multiple line breaks
178
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
179
+
180
  return truncate_content(markdown_content, 10000)
181
 
182
+ except requests.exceptions.Timeout:
183
+ return "The request timed out. Please try again later or check the URL."
184
+ except RequestException as e:
185
+ return f"Error fetching the webpage: {str(e)}"
186
+ except Exception as e:
187
+ return f"An unexpected error occurred: {str(e)}"
188
+
189
  # rAgent Reasoning using Llama mode OpenAI
190
 
191
  from openai import OpenAI
 
197
  )
198
 
199
  SYSTEM_PROMPT = """
200
+
201
+ "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
202
+ "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
203
+ "2. **Code**: Write Python code to implement your solution.\n"
204
+ "3. **Observation**: Analyze the output of the code and summarize the results.\n"
205
+ "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
206
+ f"Task: {task}"
207
+
208
  """
209
 
210
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
211
+ """
212
+ Uses the Llama mode OpenAI model to perform a structured reasoning chain.
213
+ """
214
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
215
+ # Incorporate conversation history (if any)
216
  for msg in history:
217
  if msg.get("role") == "user":
218
  messages.append({"role": "user", "content": msg["content"]})
 
221
  messages.append({"role": "user", "content": prompt})
222
  response = ""
223
  stream = ragent_client.chat.completions.create(
224
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct",
225
+ max_tokens=max_tokens,
226
+ stream=True,
227
+ temperature=temperature,
228
+ top_p=top_p,
229
+ messages=messages,
230
  )
231
  for message in stream:
232
+ token = message.choices[0].delta.content
233
+ response += token
234
+ yield response
235
+
236
+ # ------------------------------------------------------------------------------
237
+ # New Phi-4 Multimodal Feature (Image & Audio)
238
+ # ------------------------------------------------------------------------------
239
+ # Define prompt structure for Phi-4
240
+ phi4_user_prompt = '<|user|>'
241
+ phi4_assistant_prompt = '<|assistant|>'
242
+ phi4_prompt_suffix = '<|end|>'
243
+
244
+ # Load Phi-4 multimodal model and processor using unique variable names
245
+ phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
246
+ phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
247
+ phi4_model = AutoModelForCausalLM.from_pretrained(
248
+ phi4_model_path,
249
+ device_map="auto",
250
+ torch_dtype="auto",
251
+ trust_remote_code=True,
252
+ _attn_implementation="eager",
253
+ )
254
+
255
+ # ------------------------------------------------------------------------------
256
+ # Gradio UI configuration
257
+ # ------------------------------------------------------------------------------
258
+
259
+ DESCRIPTION = """
260
+ # Agent Dino 🌠
261
+ This chatbot supports various commands:
262
+ - **@tts1 / @tts2:** text-to-speech
263
+ - **@image:** image generation
264
+ - **@3d:** 3D mesh generation
265
+ - **@web:** web search/visit
266
+ - **@rAgent:** reasoning chain
267
+ - **@yolo:** object detection
268
+ - **@phi4:** multimodal (image/audio) question answering
269
+ """
270
+
271
+ css = '''
272
+ h1 {
273
+ text-align: center;
274
+ display: block;
275
+ }
276
+
277
+ #duplicate-button {
278
+ margin: auto;
279
+ color: #fff;
280
+ background: #1565c0;
281
+ border-radius: 100vh;
282
+ }
283
+ '''
284
 
285
+ MAX_MAX_NEW_TOKENS = 2048
286
+ DEFAULT_MAX_NEW_TOKENS = 1024
287
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
288
 
289
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
290
 
291
+ # Load Models and Pipelines for Chat, Image, and Multimodal Processing
292
+ # Load the text-only model and tokenizer (for pure text chat)
293
+
294
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
295
  tokenizer = AutoTokenizer.from_pretrained(model_id)
296
  model = AutoModelForCausalLM.from_pretrained(
 
300
  )
301
  model.eval()
302
 
303
+ # Voices for text-to-speech
304
+ TTS_VOICES = [
305
+ "en-US-JennyNeural", # @tts1
306
+ "en-US-GuyNeural", # @tts2
307
+ ]
308
+
309
+ # Load multimodal processor and model (e.g. for OCR and image processing)
310
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
311
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
312
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
313
  MODEL_ID,
 
315
  torch_dtype=torch.float16
316
  ).to("cuda").eval()
317
 
318
+ # Asynchronous text-to-speech
319
 
320
  async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
321
+ """Convert text to speech using Edge TTS and save as MP3"""
322
  communicate = edge_tts.Communicate(text, voice)
323
  await communicate.save(output_file)
324
  return output_file
325
 
326
+ # Utility function to clean conversation history
327
+
328
  def clean_chat_history(chat_history):
329
+ """
330
+ Filter out any chat entries whose "content" is not a string.
331
+ This helps prevent errors when concatenating previous messages.
332
+ """
333
  cleaned = []
334
  for msg in chat_history:
335
  if isinstance(msg, dict) and isinstance(msg.get("content"), str):
336
  cleaned.append(msg)
337
  return cleaned
338
 
339
+ # Stable Diffusion XL Pipeline for Image Generation
340
+ # Model In Use : SG161222/RealVisXL_V5.0_Lightning
341
+
342
+ MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # SDXL Model repository path via env variable
343
+ MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
344
+ USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
345
+ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
346
+ BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1")) # For batched image generation
347
+
348
+ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
349
+ MODEL_ID_SD,
350
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
351
+ use_safetensors=True,
352
+ add_watermarker=False,
353
+ ).to(device)
354
+ sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
355
+
356
+ if torch.cuda.is_available():
357
+ sd_pipe.text_encoder = sd_pipe.text_encoder.half()
358
+
359
+ if USE_TORCH_COMPILE:
360
+ sd_pipe.compile()
361
+
362
+ if ENABLE_CPU_OFFLOAD:
363
+ sd_pipe.enable_model_cpu_offload()
364
+
365
  def save_image(img: Image.Image) -> str:
366
+ """Save a PIL image with a unique filename and return the path."""
367
  unique_name = str(uuid.uuid4()) + ".png"
368
  img.save(unique_name)
369
  return unique_name
 
383
  num_images: int = 1,
384
  progress=gr.Progress(track_tqdm=True),
385
  ):
386
+ """Generate images using the SDXL pipeline."""
387
  seed = int(randomize_seed_fn(seed, randomize_seed))
388
  generator = torch.Generator(device=device).manual_seed(seed)
389
+
390
  options = {
391
  "prompt": [prompt] * num_images,
392
  "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
 
399
  }
400
  if use_resolution_binning:
401
  options["use_resolution_binning"] = True
402
+
403
  images = []
404
+ # Process in batches
405
+ for i in range(0, num_images, BATCH_SIZE):
406
  batch_options = options.copy()
407
+ batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
408
+ if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
409
+ batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
410
  if device.type == "cuda":
411
  with torch.autocast("cuda", dtype=torch.float16):
412
  outputs = sd_pipe(**batch_options)
 
416
  image_paths = [save_image(img) for img in images]
417
  return image_paths, seed
418
 
419
+ # Text-to-3D Generation using the ShapE Pipeline
420
+
421
  @spaces.GPU(duration=120, enable_queue=True)
422
  def generate_3d_fn(
423
  prompt: str,
 
426
  num_steps: int = 64,
427
  randomize_seed: bool = False,
428
  ):
429
+ """
430
+ Generate a 3D model from text using the ShapE pipeline.
431
+ Returns a tuple of (glb_file_path, used_seed).
432
+ """
433
  seed = int(randomize_seed_fn(seed, randomize_seed))
434
  model3d = Model()
435
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
436
  return glb_path, seed
437
 
438
+ # YOLO Object Detection Setup
439
+ YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
440
+ YOLO_CHECKPOINT_NAME = "images/demo.pt"
441
+ yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
442
+ yolo_detector = YOLODetector(yolo_model_path)
443
+
444
  def detect_objects(image: np.ndarray):
445
+ """Runs object detection on the input image."""
446
  results = yolo_detector(image, verbose=False)[0]
447
  detections = sv.Detections.from_ultralytics(results).with_nms()
448
+
449
  box_annotator = sv.BoxAnnotator()
450
  label_annotator = sv.LabelAnnotator()
451
+
452
  annotated_image = image.copy()
453
  annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
454
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
455
+
456
  return Image.fromarray(annotated_image)
457
 
458
+ # Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
459
 
460
  @spaces.GPU
461
  def generate(
 
467
  top_k: int = 50,
468
  repetition_penalty: float = 1.2,
469
  ):
470
+ """
471
+ Generates chatbot responses with support for multimodal input and special commands:
472
+ - "@tts1" or "@tts2": triggers text-to-speech.
473
+ - "@image": triggers image generation using the SDXL pipeline.
474
+ - "@3d": triggers 3D model generation using the ShapE pipeline.
475
+ - "@web": triggers a web search or webpage visit.
476
+ - "@rAgent": initiates a reasoning chain using Llama mode.
477
+ - "@yolo": triggers object detection using YOLO.
478
+ - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
479
+ """
480
  text = input_dict["text"]
481
  files = input_dict.get("files", [])
482
 
483
+ # --- 3D Generation branch ---
484
  if text.strip().lower().startswith("@3d"):
485
  prompt = text[len("@3d"):].strip()
486
+ yield "🌀 Hold tight, generating a 3D mesh GLB file....."
487
  glb_path, used_seed = generate_3d_fn(
488
  prompt=prompt,
489
  seed=1,
 
491
  num_steps=64,
492
  randomize_seed=True,
493
  )
494
+ # Copy the GLB file to a static folder.
495
  static_folder = os.path.join(os.getcwd(), "static")
496
  if not os.path.exists(static_folder):
497
  os.makedirs(static_folder)
498
  new_filename = f"mesh_{uuid.uuid4()}.glb"
499
  new_filepath = os.path.join(static_folder, new_filename)
500
  shutil.copy(glb_path, new_filepath)
501
+
502
  yield gr.File(new_filepath)
503
  return
504
 
505
+ # --- Image Generation branch ---
506
  if text.strip().lower().startswith("@image"):
507
  prompt = text[len("@image"):].strip()
508
  yield "🪧 Generating image..."
509
  image_paths, used_seed = generate_image_fn(
510
  prompt=prompt,
511
+ negative_prompt="",
512
+ use_negative_prompt=False,
513
  seed=1,
514
+ width=1024,
515
+ height=1024,
516
+ guidance_scale=3,
517
+ num_inference_steps=25,
518
  randomize_seed=True,
519
+ use_resolution_binning=True,
520
  num_images=1,
521
  )
522
  yield gr.Image(image_paths[0])
523
  return
524
 
525
+ # --- Web Search/Visit branch ---
526
  if text.strip().lower().startswith("@web"):
527
  web_command = text[len("@web"):].strip()
528
+ # If the command starts with "visit", then treat the rest as a URL
529
  if web_command.lower().startswith("visit"):
530
  url = web_command[len("visit"):].strip()
531
  yield "🌍 Visiting webpage..."
 
533
  content = visitor.forward(url)
534
  yield content
535
  else:
536
+ # Otherwise, treat the rest as a search query.
537
  query = web_command
538
+ yield "🧤 Performing a web search ..."
539
  searcher = DuckDuckGoSearchTool()
540
  results = searcher.forward(query)
541
  yield results
542
  return
543
 
544
+ # --- rAgent Reasoning branch ---
545
  if text.strip().lower().startswith("@ragent"):
546
  prompt = text[len("@ragent"):].strip()
547
+ yield "📝 Initiating reasoning chain using Llama mode..."
548
+ # Pass the current chat history (cleaned) to help inform the chain.
549
  for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
550
  yield partial
551
  return
552
 
553
+ # --- YOLO Object Detection branch ---
554
  if text.strip().lower().startswith("@yolo"):
555
+ yield "🔍 Running object detection with YOLO..."
556
  if not files or len(files) == 0:
557
+ yield "Error: Please attach an image for YOLO object detection."
558
  return
559
+ # Use the first attached image
560
  input_file = files[0]
561
  try:
562
+ if isinstance(input_file, str):
563
+ pil_image = Image.open(input_file)
564
+ else:
565
+ pil_image = input_file
566
  except Exception as e:
567
  yield f"Error loading image: {str(e)}"
568
  return
 
571
  yield gr.Image(result_img)
572
  return
573
 
574
+ # --- Phi-4 Multimodal branch (Image/Audio) ---
575
  if text.strip().lower().startswith("@phi4"):
576
+ question = text[len("@phi4"):].strip()
577
+ if not files:
578
+ yield "Error: Please attach an image or audio file for @phi4 multimodal processing."
579
  return
580
+ if not question:
581
+ yield "Error: Please provide a question after @phi4."
582
  return
583
+ # Determine input type (Image or Audio) from the first file
584
+ input_file = files[0]
585
+ try:
586
+ # If file is already a PIL Image, treat as image
587
+ if isinstance(input_file, Image.Image):
588
+ input_type = "Image"
589
+ file_for_phi4 = input_file
590
+ else:
591
+ # Try opening as image; if it fails, assume audio
592
+ try:
593
+ file_for_phi4 = Image.open(input_file)
594
+ input_type = "Image"
595
+ except Exception:
596
+ input_type = "Audio"
597
+ file_for_phi4 = input_file
598
+ except Exception:
599
+ input_type = "Audio"
600
+ file_for_phi4 = input_file
601
+
602
+ if input_type == "Image":
603
+ phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
604
+ inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
605
+ elif input_type == "Audio":
606
+ phi4_prompt = f'{phi4_user_prompt}<|audio_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
607
+ audio, samplerate = sf.read(file_for_phi4)
608
+ inputs = phi4_processor(text=phi4_prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
609
+ else:
610
+ yield "Invalid file type for @phi4 multimodal processing."
611
  return
612
 
613
+ with torch.no_grad():
614
+ generate_ids = phi4_model.generate(
615
  **inputs,
616
+ max_new_tokens=200,
617
+ num_logits_to_keep=0,
618
+ )
619
+ input_length = inputs['input_ids'].shape[1]
620
+ generate_ids = generate_ids[:, input_length:]
621
+ response = phi4_processor.batch_decode(
622
+ generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
623
+ )[0]
624
+ yield response
625
  return
626
 
627
+ # --- Text and TTS branch ---
628
  tts_prefix = "@tts"
629
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
630
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
631
+
632
  if is_tts and voice_index:
633
  voice = TTS_VOICES[voice_index - 1]
634
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
 
640
  conversation.append({"role": "user", "content": text})
641
 
642
  if files:
643
+ if len(files) > 1:
644
+ images = [load_image(image) for image in files]
645
+ elif len(files) == 1:
646
+ images = [load_image(files[0])]
647
+ else:
648
+ images = []
649
  messages = [{
650
  "role": "user",
651
  "content": [
 
671
  input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
672
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
673
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
674
+ gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
675
  input_ids = input_ids.to(model.device)
676
  streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
677
  generation_kwargs = {
 
700
  output_file = asyncio.run(text_to_speech(final_response, voice))
701
  yield gr.Audio(output_file, autoplay=True)
702
 
703
+ # Gradio Chat Interface Setup and Launch
704
 
705
  demo = gr.ChatInterface(
706
  fn=generate,
707
  additional_inputs=[
708
  gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
709
  gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
710
+ gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
711
  gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
712
  gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
713
  ],
 
718
  [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
719
  [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
720
  ["@rAgent Explain how a binary search algorithm works."],
721
+ ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
722
  ["@tts1 Explain Tower of Hanoi"],
723
+ ["@phi4 What is depicted in this image?"], # Example for @phi4
 
724
  ],
725
  cache_examples=False,
726
  type="messages",
 
728
  css=css,
729
  fill_height=True,
730
  textbox=gr.MultimodalTextbox(
731
+ label="Query Input",
732
  file_types=["image", "audio"],
733
+ file_count="multiple",
734
+ placeholder="@tts1, @tts2, @image, @3d, @phi4, @rAgent, @web, @yolo, or plain text"
735
  ),
736
  stop_btn="Stop Generation",
737
  multimodal=True,
738
  )
739
 
740
+ # Ensure the static folder exists
741
  if not os.path.exists("static"):
742
  os.makedirs("static")
743