rbcurzon committed on
Commit
739d409
·
verified ·
1 Parent(s): c6e5581

refactor: Allow synthesize() to use GPU if available

Browse files
Files changed (1) hide show
  1. app.py +12 -14
app.py CHANGED
@@ -165,27 +165,25 @@ async def translate_text(text: str,
165
  "tgtLang": tgtLang
166
  }
167
  return result_dict
168
-
169
  @app.post("/synthesize/")
170
  async def synthesize(text: str):
171
  model = VitsModel.from_pretrained("facebook/mms-tts-tgl")
172
- tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-tgl")
173
-
 
 
 
174
  inputs = tokenizer(text, return_tensors="pt")
175
-
 
 
176
  with torch.no_grad():
177
- output = model(**inputs).waveform
178
-
179
- data_np = output.numpy()
180
- data_np_squeezed = np.squeeze(data_np)
181
-
182
  temp_file = create_temp_filename()
183
 
184
- scipy.io.wavfile.write(
185
- temp_file,
186
- rate=model.config.sampling_rate,
187
- data=data_np_squeezed
188
- )
189
  logging.info(f"Synthesizing completed for text: {text}")
190
 
191
  return FileResponse(
 
165
  "tgtLang": tgtLang
166
  }
167
  return result_dict
 
168
  @app.post("/synthesize/")
169
  async def synthesize(text: str):
170
  model = VitsModel.from_pretrained("facebook/mms-tts-tgl")
171
+ tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-tgl")
172
+
173
+ device = "cuda" if torch.cuda.is_available() else "cpu"
174
+ model.to(device)
175
+
176
  inputs = tokenizer(text, return_tensors="pt")
177
+ input_ids = inputs["input_ids"].to(device)
178
+
179
+
180
  with torch.no_grad():
181
+ outputs = model(input_ids)
182
+
183
+ speech = outputs["waveform"]
 
 
184
  temp_file = create_temp_filename()
185
 
186
+ torchaudio.save(temp_file, speech.cpu(), 16000)
 
 
 
 
187
  logging.info(f"Synthesizing completed for text: {text}")
188
 
189
  return FileResponse(