Audio-Text-to-Text
Transformers
Safetensors
qwen2_audio
text2text-generation
Inference Endpoints
jimbozhang committed on
Commit
2d1002f
·
verified ·
1 Parent(s): cefa606

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +3 -2
README.md CHANGED
@@ -27,8 +27,9 @@ model = Qwen2AudioForConditionalGeneration.from_pretrained(model_name, torch_dty
27
 
28
  # Load example audio
29
  wav_path = "test-mini-audios/3fe64f3d-282c-4bc8-a753-68f8f6c35652.wav" # from MMAU dataset
30
- waveform, _ = torchaudio.load(wav_path) # 16KHz
31
- audios = [waveform[0].numpy()]
 
32
 
33
  # Make prompt text
34
  question = "Based on the given audio, identify the source of the speaking voice."
 
27
 
28
  # Load example audio
29
  wav_path = "test-mini-audios/3fe64f3d-282c-4bc8-a753-68f8f6c35652.wav" # from MMAU dataset
30
+ waveform, sampling_rate = torchaudio.load(wav_path)
31
+ assert sampling_rate == 16000
32
+ audios = [waveform.numpy()]
33
 
34
  # Make prompt text
35
  question = "Based on the given audio, identify the source of the speaking voice."