Update onnx/builder.py
onnx/builder.py  +2 -2
@@ -118,8 +118,8 @@ def build_vision(args):
 def build_speech(args):
     # Speech file:
     prompt = f"{user_prompt}<|audio_1|>\n<|audio_2|>\nWhat are the stories that these audios come from?{prompt_suffix}{assistant_prompt}"
-    audio1 = soundfile.read(os.path.join(args.input, "examples", "
-    audio2 = soundfile.read(os.path.join(args.input, "examples", "
+    audio1 = soundfile.read(os.path.join(args.input, "examples", "what_is_the_traffic_sign_in_the_image.wav"))
+    audio2 = soundfile.read(os.path.join(args.input, "examples", "what_is_shown_in_this_image.wav"))
     inputs = processor(prompt, audios=[audio1, audio2], return_tensors="pt").to(args.execution_provider.replace("dml", "cuda"))
     inputs["input_audio_embeds"] = inputs["input_audio_embeds"].to(args.precision)
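For context on what the updated lines feed into `processor`: `soundfile.read` returns a `(data, sample_rate)` tuple, and `build_speech` passes those tuples through unchanged in the `audios` list. The `.replace("dml", "cuda")` on the next line is presumably there because PyTorch has no "dml" device string, so inputs are staged on CUDA even when the target execution provider is DirectML. Below is a minimal sketch of just the loading step, assuming the two `.wav` files named in the diff exist locally; `examples_dir` is a placeholder for the builder's `os.path.join(args.input, "examples")`:

```python
import os

import soundfile

examples_dir = "examples"  # placeholder for os.path.join(args.input, "examples")

# soundfile.read returns a (numpy_array, sample_rate) tuple; this is the
# exact shape that build_speech places into audios=[audio1, audio2].
audio1 = soundfile.read(os.path.join(examples_dir, "what_is_the_traffic_sign_in_the_image.wav"))
audio2 = soundfile.read(os.path.join(examples_dir, "what_is_shown_in_this_image.wav"))

for name, (data, sample_rate) in (("audio1", audio1), ("audio2", audio2)):
    print(f"{name}: {data.shape[0] / sample_rate:.2f}s at {sample_rate} Hz")
```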