mispeech
/

r1-aqa

@@ -30,7 +30,7 @@ wav_path = "test-mini-audios/3fe64f3d-282c-4bc8-a753-68f8f6c35652.wav"  # from M
 waveform, _ = torchaudio.load(wav_path)  # 16KHz
 audios = [waveform[0].numpy()]
-# Make prompt
 question = "Based on the given audio, identify the source of the speaking voice."
 options = ["Man", "Woman", "Child", "Robot"]
 prompt = f"{question} Please choose the answer from the following options: {str(options)}. Output the final answer in <answer> </answer>."
@@ -40,6 +40,7 @@ message = [
         {"type": "text", "text": prompt}
     ]}
 ]
 # Process
 inputs = processor(text=texts, audios=audios, sampling_rate=16000, return_tensors="pt", padding=True).to(model.device)

 waveform, _ = torchaudio.load(wav_path)  # 16KHz
 audios = [waveform[0].numpy()]
+# Make prompt text
 question = "Based on the given audio, identify the source of the speaking voice."
 options = ["Man", "Woman", "Child", "Robot"]
 prompt = f"{question} Please choose the answer from the following options: {str(options)}. Output the final answer in <answer> </answer>."
         {"type": "text", "text": prompt}
     ]}
 ]
+texts = processor.apply_chat_template(message, add_generation_prompt=True, tokenize=False)
 # Process
 inputs = processor(text=texts, audios=audios, sampling_rate=16000, return_tensors="pt", padding=True).to(model.device)