prithivMLmods committed on
Commit 2bd3ee0 · verified
1 Parent(s): 721c813

Update app.py

Files changed (1)
  1. app.py +81 -4
app.py CHANGED
@@ -35,6 +35,9 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
+# Additional import for Phi-4 multimodality (audio support)
+import soundfile as sf
+
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -424,7 +427,60 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
+# ---------------------------
+# Phi-4 Multimodal Model Setup with Text Streaming
+# ---------------------------
+phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
+
+phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    device_map="auto",
+    torch_dtype="auto",
+    trust_remote_code=True,
+    _attn_implementation="eager",
+)
+
+def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200):
+    """
+    Process an image or audio input with the Phi-4 multimodal model.
+    Uses a text streamer to yield incremental outputs.
+    Expects input_type to be either 'image' or 'audio'.
+    """
+    user_prompt = '<|user|>'
+    assistant_prompt = '<|assistant|>'
+    prompt_suffix = '<|end|>'
+
+    if not file or not question:
+        yield "Please upload a file and provide a question."
+        return
+
+    if input_type.lower() == "image":
+        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+        image = Image.open(file)
+        inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
+    elif input_type.lower() == "audio":
+        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+        audio, samplerate = sf.read(file)
+        inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
+    else:
+        yield "Invalid input type selected."
+        return
+
+    # Setup text streamer using TextIteratorStreamer for incremental generation
+    streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    yield "🤔 Thinking..."
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
+
+# Chat Generation Function with support for @tts, @image, @3d, @web, @ragent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
@@ -442,8 +498,9 @@ def generate(
     - "@image": triggers image generation using the SDXL pipeline.
     - "@3d": triggers 3D model generation using the ShapE pipeline.
     - "@web": triggers a web search or webpage visit.
-    - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
+    - "@ragent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
+    - **New:** "@phi4": processes image or audio inputs with the Phi-4 multimodal model and streams text output.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -539,6 +596,24 @@ def generate(
         yield gr.Image(result_img)
         return
 
+    # --- Phi-4 Multimodal branch with text streaming ---
+    if text.strip().lower().startswith("@phi4"):
+        # Expected format: "@phi4 [image|audio] <your question>"
+        parts = text.strip().split(maxsplit=2)
+        if len(parts) < 3:
+            yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
+            return
+        input_type = parts[1]
+        question = parts[2]
+        if not files or len(files) == 0:
+            yield "Error: Please attach an image or audio file for Phi-4 processing."
+            return
+        file_input = files[0]
+        yield "🔄 Processing multimodal input with Phi-4..."
+        for partial in process_phi4(input_type, file_input, question):
+            yield partial
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -627,12 +702,14 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
+        [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
+        [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@rAgent Explain how a binary search algorithm works."],
+        ["@ragent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
@@ -641,7 +718,7 @@ demo = gr.ChatInterface(
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4, or plain text"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
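
For reference, a minimal sketch of how the new @phi4 command is expected to flow outside the Gradio UI. It assumes app.py can be imported without launching the interface and that an examples/harvard.wav file exists locally; both the import path and the file path are illustrative assumptions, not part of the commit.

# Usage sketch (assumptions: app.py is importable, examples/harvard.wav exists).
from app import process_phi4

def run_phi4_command(text: str, files: list[str]) -> str:
    # Mirror the "@phi4 [image|audio] <question>" parsing done in generate()
    parts = text.strip().split(maxsplit=2)
    if len(parts) < 3 or not files:
        raise ValueError("Expected '@phi4 [image|audio] <question>' plus an attached file")
    input_type, question = parts[1], parts[2]
    last = ""
    # process_phi4 is a generator; each yield is the accumulated output so far
    for partial in process_phi4(input_type, files[0], question):
        last = partial
    return last

if __name__ == "__main__":
    print(run_phi4_command("@phi4 audio Transcribe the audio to text.",
                           ["examples/harvard.wav"]))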