jiuuee committed
Commit 6160888 · verified · 1 Parent(s): c5a564e

Update app.py

Files changed (1)
  1. app.py +115 -145
app.py CHANGED
@@ -5,161 +5,131 @@ import os
  import soundfile as sf
  import tempfile
  import uuid
-
  import torch
-
  from nemo.collections.asr.models import ASRModel
  from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
  from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED

- SAMPLE_RATE = 16000  # Hz
- MAX_AUDIO_MINUTES = 10  # wont try to transcribe if longer than this
-
- model = ASRModel.from_pretrained("nvidia/canary-1b")
- model.eval()

- # make sure beam size always 1 for consistency
- model.change_decoding_strategy(None)
- decoding_cfg = model.cfg.decoding
  decoding_cfg.beam.beam_size = 1
- model.change_decoding_strategy(decoding_cfg)
-
- # setup for buffered inference
- model.cfg.preprocessor.dither = 0.0
- model.cfg.preprocessor.pad_to = 0
-
- feature_stride = model.cfg.preprocessor['window_stride']
- model_stride_in_secs = feature_stride * 8  # 8 = model stride, which is 8 for FastConformer
-
  frame_asr = FrameBatchMultiTaskAED(
-     asr_model=model,
-     frame_len=40.0,
-     total_buffer=40.0,
-     batch_size=16,
  )

- amp_dtype = torch.float16
-
- def convert_audio(audio_filepath, tmpdir, utt_id):
-
-     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
-
-     duration = librosa.get_duration(y=data, sr=sr)
-
-     if duration / 60.0 > MAX_AUDIO_MINUTES:
-         raise gr.Error(
-             f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
-             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
-             "(click on the scissors icon to start trimming audio)."
-         )
-
-     if sr != SAMPLE_RATE:
-         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
-
-     out_filename = os.path.join(tmpdir, utt_id + '.wav')
-
-     # save output audio
-     sf.write(out_filename, data, SAMPLE_RATE)
-
-     return out_filename, duration


  def transcribe(audio_filepath):
-
-     if audio_filepath is None:
-         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
-
-     utt_id = uuid.uuid4()
-     with tempfile.TemporaryDirectory() as tmpdir:
-         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
-
-         # make manifest file and save
-         manifest_data = {
-             "audio_filepath": converted_audio_filepath,
-             "source_lang": "en",
-             "target_lang": "en",
-             "taskname": "asr",
-             "pnc": "no",
-             "answer": "predict",
-             "duration": str(duration),
-         }
-
-         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
-
-         with open(manifest_filepath, 'w') as fout:
-             line = json.dumps(manifest_data)
-             fout.write(line + '\n')
-
-         # call transcribe, passing in manifest filepath
-         if duration < 40:
-             output_text = model.transcribe(manifest_filepath)[0]
-         else:  # do buffered inference
-             with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
-                 with torch.no_grad():
-                     hyps = get_buffered_pred_feat_multitaskAED(
-                         frame_asr,
-                         model.cfg.preprocessor,
-                         model_stride_in_secs,
-                         model.device,
-                         manifest=manifest_filepath,
-                         filepaths=None,
-                     )
-
-             output_text = hyps[0].text
-
-     return output_text
-
- with gr.Blocks(
-     title="NeMo Canary Model",
-     css="""
-     textarea { font-size: 18px;}
-     #model_output_text_box span {
-         font-size: 18px;
-         font-weight: bold;
-     }
-     """,
-     theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg)  # make text slightly bigger (default is text_md)
- ) as demo:
-
-     gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
-
-     with gr.Row():
-         with gr.Column():
-             gr.HTML(
-                 "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
-
-                 "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
-                 "You can transcribe longer files locally with this NeMo "
-                 "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
-             )
-
-             audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
-
-         with gr.Column():
-
-             gr.HTML("<p><b>Step 2:</b> Run the model.</p>")
-
-             go_button = gr.Button(
-                 value="Run model",
-                 variant="primary",  # make "primary" so it stands out (default is "secondary")
-             )
-
-             model_output_text_box = gr.Textbox(
-                 label="Model Output",
-                 elem_id="model_output_text_box",
-             )
-
-     go_button.click(
-         fn=transcribe,
-         inputs=[audio_file],
-         outputs=[model_output_text_box],
-     )
-
- demo.queue()
- demo.launch()

  import soundfile as sf
  import tempfile
  import uuid
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  from nemo.collections.asr.models import ASRModel
  from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
  from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
+ from transformers import VitsTokenizer, VitsModel, set_seed
+ import scipy.io.wavfile as wav

+ # Constants
+ SAMPLE_RATE = 16000  # Hz

+ # Load ASR model
+ asr_model = ASRModel.from_pretrained("nvidia/canary-1b")
+ asr_model.eval()
+ asr_model.change_decoding_strategy(None)
+ decoding_cfg = asr_model.cfg.decoding
  decoding_cfg.beam.beam_size = 1
+ asr_model.change_decoding_strategy(decoding_cfg)
+ feature_stride = asr_model.cfg.preprocessor['window_stride']
+ model_stride_in_secs = feature_stride * 8
  frame_asr = FrameBatchMultiTaskAED(
+     asr_model=asr_model,
+     frame_len=40.0,
+     total_buffer=40.0,
+     batch_size=16,
  )

+ # Load LLM model
+ torch.random.manual_seed(0)
+ llm_model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/Phi-3-mini-128k-instruct",
+     device_map="auto",
+     torch_dtype="auto",
+     trust_remote_code=True,
+ )
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
+ pipe = pipeline("text-generation", model=llm_model, tokenizer=tokenizer)

+ # Load TTS model
+ tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")

+ # Function to convert audio to text using ASR
  def transcribe(audio_filepath):
+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio.")
+
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # Convert to 16 kHz
+         data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+         if sr != SAMPLE_RATE:
+             data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
+         converted_audio_filepath = os.path.join(tmpdir, f"{utt_id}.wav")
+         sf.write(converted_audio_filepath, data, SAMPLE_RATE)
+
+         # Transcribe audio
+         duration = len(data) / SAMPLE_RATE
+         manifest_data = {
+             "audio_filepath": converted_audio_filepath,
+             "source_lang": "en",
+             "target_lang": "en",
+             "taskname": "asr",
+             "pnc": "no",
+             "answer": "predict",
+             "duration": str(duration),
+         }
+         manifest_filepath = os.path.join(tmpdir, f"{utt_id}.json")
+         with open(manifest_filepath, 'w') as fout:
+             fout.write(json.dumps(manifest_data))
+
+         if duration < 40:
+             transcription = asr_model.transcribe(manifest_filepath)[0]
+         else:
+             transcription = get_buffered_pred_feat_multitaskAED(
+                 frame_asr,
+                 asr_model.cfg.preprocessor,
+                 model_stride_in_secs,
+                 asr_model.device,
+                 manifest=manifest_filepath,
+             )[0].text
+
+     return transcription
+
+ # Function to generate text using LLM
+ def generate_text(input_text):
+     generation_args = {
+         "max_new_tokens": 500,
+         "return_full_text": False,  # return only the newly generated reply string, not the full chat
+         "temperature": 0.0,
+         "do_sample": False,
+     }
+     generated_text = pipe(
+         [{"role": "user", "content": input_text}],
+         **generation_args
+     )[0]["generated_text"]
+     return generated_text
+
+ # Function to convert text to speech using TTS
+ def gen_speech(text):
+     set_seed(555)  # Make it deterministic
+     input_text = tts_tokenizer(text, return_tensors="pt")
+     with torch.no_grad():
+         outputs = tts_model(**input_text)
+     waveform_np = outputs.waveform[0].cpu().numpy()
+     output_file = f"{str(uuid.uuid4())}.wav"
+     wav.write(output_file, rate=tts_model.config.sampling_rate, data=waveform_np)
+     return output_file
+
+ # Combined function for Gradio interface
+ def process_audio(audio_filepath):
+     transcription = transcribe(audio_filepath)
+     generated_text = generate_text(transcription)
+     audio_output_filepath = gen_speech(generated_text)
+     return transcription, generated_text, audio_output_filepath
+
+ # Create Gradio interface
+ gr.Interface(
+     fn=process_audio,
+     inputs=[gr.Audio(sources=["microphone"], type="filepath", label="Input Audio")],
+     outputs=[
+         gr.Textbox(label="Transcription"),
+         gr.Textbox(label="Generated Text"),
+         gr.Audio(type="filepath", label="Generated Speech")
+     ],
+     title="ASR to LLM to TTS",
+     description="Transcribe audio with ASR, generate text with LLM, and convert it back to speech with TTS."
+ ).launch(inbrowser=True)
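
For a quick local sanity check of the new ASR → LLM → TTS chain without the Gradio UI, the three stages added in this commit can be called directly once they are defined in the current session (for example, by running app.py with the final `.launch()` line commented out). This is a minimal sketch, not part of the commit; "sample.wav" is a hypothetical short speech recording, not a file in this repo:

# Minimal smoke test for the chain defined above, assuming transcribe(),
# generate_text() and gen_speech() from app.py are already defined in this
# session and "sample.wav" (hypothetical) is a short mono speech recording.
transcription = transcribe("sample.wav")   # Canary ASR: audio -> text
reply = generate_text(transcription)       # Phi-3: transcription -> generated reply
speech_path = gen_speech(reply)            # MMS-TTS: reply -> WAV file on disk

print("Transcription:", transcription)
print("LLM reply:", reply)
print("Synthesized speech written to:", speech_path)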