prithivMLmods committed on
Commit f60d610 · verified · 1 Parent(s): c8e4529

Update app.py

Files changed (1):
  1. app.py +15 -15
app.py CHANGED
@@ -259,7 +259,8 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
 # ------------------------------------------------------------------------------
 
 DESCRIPTION = """
-# Agent Dino 🌠 """
+# Agent Dino 🌠
+"""
 
 css = '''
 h1 {
@@ -468,7 +469,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
+    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model with streaming output.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -564,7 +565,7 @@ def generate(
         yield gr.Image(result_img)
         return
 
-    # --- Phi-4 Multimodal branch (Image/Audio) ---
+    # --- Phi-4 Multimodal branch (Image/Audio) with streaming ---
     if text.strip().lower().startswith("@phi4"):
         question = text[len("@phi4"):].strip()
         if not files:
@@ -603,18 +604,17 @@ def generate(
             yield "Invalid file type for @phi4 multimodal processing."
             return
 
-        with torch.no_grad():
-            generate_ids = phi4_model.generate(
-                **inputs,
-                max_new_tokens=200,
-                num_logits_to_keep=0,
-            )
-        input_length = inputs['input_ids'].shape[1]
-        generate_ids = generate_ids[:, input_length:]
-        response = phi4_processor.batch_decode(
-            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-        yield response
+        # Set up a streamer for the phi4 model
+        streamer_phi4 = TextIteratorStreamer(phi4_processor, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs_phi4 = {**inputs, "streamer": streamer_phi4, "max_new_tokens": 200}
+        thread_phi4 = Thread(target=phi4_model.generate, kwargs=generation_kwargs_phi4)
+        thread_phi4.start()
+
+        outputs_phi4 = []
+        yield "🤔 Thinking..."
+        for new_text in streamer_phi4:
+            outputs_phi4.append(new_text)
+            yield "".join(outputs_phi4)
 
         return
 
     # --- Text and TTS branch ---
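The net effect of this diff is to swap the blocking `generate()` + `batch_decode()` call for incremental streaming: `generate()` runs on a background `Thread` while a `TextIteratorStreamer` yields decoded text chunks that the Gradio generator forwards with `yield`. Below is a minimal, self-contained sketch of that pattern; the "gpt2" checkpoint and tokenizer are illustrative assumptions, not the app's Phi-4 model or processor.

```python
# Minimal sketch of the streaming pattern introduced in this commit,
# shown with a small text-only model so the mechanics are easy to see.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # assumption: any causal LM works
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming generation works by", return_tensors="pt")

# The streamer decodes tokens as generate() produces them; skip_prompt drops
# the echoed input, mirroring the skip_prompt=True used in the commit.
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs on a worker thread while the
# main thread consumes partial text from the streamer.
generation_kwargs = {
    **inputs,
    "streamer": streamer,
    "max_new_tokens": 50,
    "pad_token_id": tokenizer.eos_token_id,
}
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

pieces = []
for new_text in streamer:              # yields decoded chunks as they arrive
    pieces.append(new_text)
    print("".join(pieces), end="\r")   # incremental output, like the Gradio yield
thread.join()
print()
```

In the app itself, each `yield "".join(outputs_phi4)` replaces the single `yield response` of the old code, so the chat UI updates progressively instead of waiting for the full Phi-4 answer.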