import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, SeamlessM4TModel

# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
model.to(device)
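
# Human-readable language names mapped to the three-letter language codes used by SeamlessM4T.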
language_dict = {
    "Modern Standard Arabic": "arb",
    "Bengali": "ben",
    "Catalan": "cat",
    "Czech": "ces",
    "Mandarin Chinese": "cmn",
    "Welsh": "cym",
    "Danish": "dan",
    "German": "deu",
    "English": "eng",
    "Estonian": "est",
    "Finnish": "fin",
    "French": "fra",
    "Hindi": "hin",
    "Indonesian": "ind",
    "Italian": "ita",
    "Japanese": "jpn",
    "Korean": "kor",
    "Maltese": "mlt",
    "Dutch": "nld",
    "Western Persian": "pes",
    "Polish": "pol",
    "Portuguese": "por",
    "Romanian": "ron",
    "Russian": "rus",
    "Slovak": "slk",
    "Spanish": "spa",
    "Swedish": "swe",
    "Swahili": "swh",
    "Telugu": "tel",
    "Tagalog": "tgl",
    "Thai": "tha",
    "Turkish": "tur",
    "Ukrainian": "ukr",
    "Urdu": "urd",
    "Northern Uzbek": "uzn",
    "Vietnamese": "vie",
}
languages = list(language_dict.keys())
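
# Gradio callback: translates the recorded/uploaded audio if present, otherwise the entered
# text, and returns both translated speech (16 kHz waveform) and translated text.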
def png(source_lang, target_lang, audio, text):
    source_lang_code = language_dict[source_lang]
    target_lang_code = language_dict[target_lang]
    if audio is None:
        # Text input: the processor only needs the source language for tokenization.
        processed_inputs = processor(text=text, src_lang=source_lang_code, return_tensors="pt")
    else:
        # gr.Audio returns (sample_rate, numpy array); samples are typically int16 and may be stereo.
        sample_rate, audio_data = audio
        waveform = torch.from_numpy(audio_data).to(torch.float32)
        if audio_data.dtype.kind in ("i", "u"):
            waveform = waveform / 32768.0  # scale integer PCM into [-1, 1]
        if waveform.ndim > 1:
            waveform = waveform.mean(dim=1)  # downmix stereo to mono
        # SeamlessM4T expects 16 kHz audio.
        waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=16_000)
        processed_inputs = processor(audios=waveform, sampling_rate=16_000, return_tensors="pt")
    processed_inputs = processed_inputs.to(device)
    # First pass generates the translated waveform, second pass the translated text.
    generated_audio = model.generate(**processed_inputs, tgt_lang=target_lang_code)[0].cpu().numpy().squeeze()
    output_tokens = model.generate(**processed_inputs, tgt_lang=target_lang_code, generate_speech=False)
    generated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
    return (16000, generated_audio), generated_text
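
# User-facing copy shown at the top of the Gradio page.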
title = "36 Language Translator"
description = """
This demo translates either speech or text from a selected SOURCE language (among 36 languages)
into both speech and text in the selected TARGET language.
It is powered by the "facebook/hf-seamless-m4t-medium" model. Thanks for checking it out.
"""
iface = gr.Interface(
    png,
    inputs=[
        gr.Dropdown(languages, label="Source Language"),
        gr.Dropdown(languages, label="Target Language"),
        gr.Audio(label="Source Audio (optional)"),
        gr.Textbox(label="Enter Text in Source Language"),
    ],
    outputs=[
        gr.Audio(label="Translated Audio"),
        gr.Textbox(label="Translated Text"),
    ],
    title=title,
    description=description,
)
iface.launch(debug=True)