rishabhsabnavis committed on
Commit f9bbf89 · verified · 1 Parent(s): e6c1904

Upload 2 files

Files changed (2)
  1. app.py +75 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,75 @@
+ import gradio as gr
+ import numpy as np
+ import torch
+ from deep_translator import GoogleTranslator
+ from transformers import AutoTokenizer, VitsModel, pipeline
+
+ # device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ device = "cpu"
+ # load speech translation checkpoint
+ asr_pipe = pipeline("automatic-speech-recognition",
+                     model="openai/whisper-base", device=device)
+
+ # load the text-to-speech mms-tts-tel (Telugu) VITS model and its tokenizer
+ model = VitsModel.from_pretrained("facebook/mms-tts-tel")
+ tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-tel")
+
+
+ def translate(audio):
+     # transcribe the source speech and translate it to English text
+     outputs = asr_pipe(audio, max_new_tokens=256,
+                        generate_kwargs={"task": "translate"})
+     return outputs["text"]
+
+
+ def synthesise(text):
+     # generate a waveform from Telugu text with the VITS model
+     inputs = tokenizer(text=text, return_tensors="pt")
+     with torch.no_grad():
+         speech = model(**inputs).waveform
+     return speech.reshape(-1, 1).cpu()
+
+
+ def speech_to_speech_translation(audio):
+     # source speech -> English text -> Telugu text -> Telugu speech
+     translated_text = translate(audio)
+     google_translated = GoogleTranslator(
+         source="en", target="te").translate(translated_text)
+     synthesised_speech = synthesise(google_translated)
+     # scale the float waveform to 16-bit PCM for Gradio's numpy audio output
+     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+     return 16000, synthesised_speech
+
+
+ title = "Cascaded STST"
+ description = """
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Telugu. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation into English text, [Deep Translator](https://github.com/nidhaloff/deep-translator) for translation from English to Telugu, and Meta's
+ [MMS TTS TEL](https://huggingface.co/facebook/mms-tts-tel) model for text-to-speech:
+ ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+ """
+
+ demo = gr.Blocks()
+
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
+ )
+
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="upload", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     examples=[["./example.wav"]],
+     title=title,
+     description=description,
+ )
+
+ with demo:
+     gr.TabbedInterface([mic_translate, file_translate],
+                        ["Microphone", "Audio File"])
+
+ demo.launch()
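
A quick way to exercise the cascaded pipeline outside the Gradio UI is to call `speech_to_speech_translation` directly and write the returned 16-bit PCM tuple to disk. The sketch below is illustrative and not part of the commit: it assumes app.py is importable from the working directory, that the module-level `demo.launch()` call is first commented out or guarded behind `if __name__ == "__main__":` (otherwise importing `app` starts the server), and that a hypothetical `sample.wav` input file exists.

```python
# Minimal local smoke test (illustrative; sample.wav is a placeholder name).
# Assumes demo.launch() in app.py is guarded or commented out before importing.
import wave

from app import speech_to_speech_translation

sample_rate, waveform = speech_to_speech_translation("sample.wav")  # int16, shape (T, 1)

with wave.open("translated.wav", "wb") as f:
    f.setnchannels(1)            # MMS TTS produces mono audio
    f.setsampwidth(2)            # 16-bit PCM samples
    f.setframerate(sample_rate)  # 16 kHz per the MMS TTS model config
    f.writeframes(waveform.tobytes())
```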
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ git+https://github.com/huggingface/transformers
+ datasets
+ sentencepiece
+ deep-translator
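
Note that transformers is installed from GitHub rather than PyPI, presumably because MMS-TTS (`VitsModel`) support had not yet shipped in a stable release when this was written; it landed in transformers v4.33.0. A short illustrative sanity check that the installed build is recent enough:

```python
# Illustrative check (not part of the commit) that the installed transformers
# build ships VitsModel, which app.py requires; MMS-TTS landed in v4.33.0.
import transformers

print("transformers version:", transformers.__version__)

try:
    from transformers import VitsModel  # noqa: F401
    print("VitsModel available: OK")
except ImportError:
    print("VitsModel missing: install transformers >= 4.33.0 or from git")
```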