Spaces:

jiuuee
/

my-alexa

Runtime error

App Files Community

jiuuee committed on May 3, 2024

Commit

c5a564e

verified ·

1 Parent(s): 2482c73

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -18

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import os
 import soundfile as sf
 import tempfile
 import uuid
 import torch
 from nemo.collections.asr.models import ASRModel
@@ -12,7 +13,7 @@ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTask
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 SAMPLE_RATE = 16000 # Hz
-MAX_AUDIO_MINUTES = 1 # wont try to transcribe if longer than this
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
@@ -40,11 +41,6 @@ frame_asr = FrameBatchMultiTaskAED(
 amp_dtype = torch.float16
 def convert_audio(audio_filepath, tmpdir, utt_id):
-	"""
-	Convert all files to monochannel 16 kHz wav files.
-	Do not convert and raise error if audio too long.
-	Returns output filename and duration.
-	"""
 	data, sr = librosa.load(audio_filepath, sr=None, mono=True)
@@ -68,7 +64,7 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 	return out_filename, duration
-def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
 	if audio_filepath is None:
 		raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
@@ -76,8 +72,8 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
 	utt_id = uuid.uuid4()
 	with tempfile.TemporaryDirectory() as tmpdir:
 		converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
-        # make manifest file and save
 		manifest_data = {
 			"audio_filepath": converted_audio_filepath,
 			"source_lang": "en",
@@ -112,9 +108,9 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
 					output_text = hyps[0].text
 	return output_text
 with gr.Blocks(
-	title="myAlexa",
 	css="""
 		textarea { font-size: 18px;}
 		#model_output_text_box span {
@@ -125,20 +121,28 @@ with gr.Blocks(
 	theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
-	gr.HTML("<h1 style='text-align: center'>Your amazing AI assistant</h1>")
 	with gr.Row():
 		with gr.Column():
 			gr.HTML(
-				"<p><b>Step 1:</b> Record with your microphone.</p>"
-            )
-			audio_file = gr.Audio(sources=["microphone"], type="filepath")
 		with gr.Column():
 			go_button = gr.Button(
-				value="Transcribe",
 				variant="primary", # make "primary" so it stands out (default is "secondary")
 			)
@@ -146,12 +150,16 @@ with gr.Blocks(
 				label="Model Output",
 				elem_id="model_output_text_box",
 			)
 	go_button.click(
 		fn=transcribe,
 		inputs = [audio_file],
 		outputs = [model_output_text_box]
 	)
 demo.queue()
-demo.launch()

 import soundfile as sf
 import tempfile
 import uuid
 import torch
 from nemo.collections.asr.models import ASRModel
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 SAMPLE_RATE = 16000 # Hz
+MAX_AUDIO_MINUTES = 10 # wont try to transcribe if longer than this
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
 amp_dtype = torch.float16
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	data, sr = librosa.load(audio_filepath, sr=None, mono=True)
 	return out_filename, duration
+def transcribe(audio_filepath):
 	if audio_filepath is None:
 		raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
 	utt_id = uuid.uuid4()
 	with tempfile.TemporaryDirectory() as tmpdir:
 		converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+		# make manifest file and save
 		manifest_data = {
 			"audio_filepath": converted_audio_filepath,
 			"source_lang": "en",
 					output_text = hyps[0].text
 	return output_text
 with gr.Blocks(
+	title="NeMo Canary Model",
 	css="""
 		textarea { font-size: 18px;}
 		#model_output_text_box span {
 	theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
+	gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
 	with gr.Row():
 		with gr.Column():
 			gr.HTML(
+				"<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+				"<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
+				"You can transcribe longer files locally with this NeMo "
+				"<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
+			)
+			audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
 		with gr.Column():
+			gr.HTML("<p><b>Step 2:</b> Run the model.</p>")
 			go_button = gr.Button(
+				value="Run model",
 				variant="primary", # make "primary" so it stands out (default is "secondary")
 			)
 				label="Model Output",
 				elem_id="model_output_text_box",
 			)
 	go_button.click(
 		fn=transcribe,
 		inputs = [audio_file],
 		outputs = [model_output_text_box]
 	)
 demo.queue()
+demo.launch()