rishabhsabnavis committed on
Commit
dbafb09
·
verified ·
1 Parent(s): 1ae8b7f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ from deep_translator import GoogleTranslator
5
+ from transformers import (
6
+ AutoTokenizer,
7
+ VitsModel,
8
+ pipeline
9
+ )
10
+
11
# Device string handed to the speech-recognition pipeline below.
device = "cpu"

# Speech front end: Whisper Base wrapped in an automatic-speech-recognition
# pipeline.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)

# Speech back end: tokenizer and VITS model from the Telugu MMS text-to-speech
# checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-tel")
model = VitsModel.from_pretrained("facebook/mms-tts-tel")
19
+
20
def translate(audio):
    """Return the text Whisper produces for ``audio`` using its "translate" task.

    ``audio`` is forwarded unchanged to the module-level ``asr_pipe``; the
    ``"text"`` entry of the pipeline result is returned.
    """
    whisper_options = {"task": "translate"}
    return asr_pipe(audio, generate_kwargs=whisper_options)["text"]
23
+
24
def synthesise(text):
    """Turn ``text`` into a waveform with the module-level TTS model.

    The text is tokenised with ``tokenizer``, passed through ``model`` and the
    resulting ``waveform`` is returned as a CPU tensor reshaped into a single
    column, i.e. shape ``(num_samples, 1)``.
    """
    encoded = tokenizer(text=text, return_tensors="pt")

    # Pure inference: gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        waveform = model(**encoded).waveform

    column = waveform.reshape(-1, 1)
    return column.cpu()
29
+
30
def speech_to_speech_translation(audio):
    """Translate spoken input into synthesised Telugu speech.

    The cascade has three stages: ``translate`` turns the speech into text
    (treated as English by the next stage), Google Translate via
    deep-translator converts that text to Telugu, and ``synthesise`` voices
    the Telugu text.

    Args:
        audio: Input forwarded as-is to ``translate``. The Gradio inputs in
            this file are configured with ``type="filepath"``.

    Returns:
        A ``(sampling_rate, samples)`` tuple suitable for a
        ``gr.Audio(type="numpy")`` output. ``samples`` is an ``int16`` NumPy
        array with one column (the shape produced by ``synthesise``).
    """
    english_text = translate(audio)

    # deep-translator identifies Telugu by the code "te"; "tel" is not a
    # supported language code and makes GoogleTranslator raise.
    telugu_text = GoogleTranslator(source="en", target="te").translate(english_text)

    waveform = synthesise(telugu_text).numpy()

    # Clip to [-1, 1] before scaling so out-of-range samples saturate instead
    # of wrapping around when cast to int16.
    pcm_samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # Report the rate the TTS model actually generates at rather than a
    # hard-coded constant.
    return model.config.sampling_rate, pcm_samples
36
+
37
# Heading and Markdown blurb shared by both tabs of the demo.
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Telugu. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech transcription, [Deep Translator](https://github.com/nidhaloff/deep-translator) for translation, and Meta's [MMS TTS TEL](https://huggingface.co/facebook/mms-tts-tel) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

# Tab 1: record from the microphone, play back the generated speech.
mic_translate = gr.Interface(
    title=title,
    description=description,
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

# Tab 2: upload an audio file instead; ships with one bundled example clip.
file_translate = gr.Interface(
    title=title,
    description=description,
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
)

# Combine both interfaces into a single tabbed app.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

demo.launch()