import torch
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from typing import Any, Dict
class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the SpeechT5 text-to-speech pipeline locally; `path` is the model
        # directory supplied by the endpoint runtime and is unused here.
        self.pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts")
        # Speaker x-vectors used to condition the voice of the generated speech.
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        text = data.get("inputs", "")
        # Extract a speaker embedding by dataset index, or replace with your own logic.
        speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # Run the model, conditioning on the speaker embedding via forward_params.
        response = self.pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
        # Write the response audio to a file.
        # Note: this might not be possible in all environments; ensure disk writes
        # are suitable for your deployment.
        sf.write("speech.wav", response["audio"], samplerate=response["sampling_rate"])
        # Return the expected response format. The audio is a NumPy array, so it
        # is converted to a list to keep the body JSON-serializable.
        return {
            "statusCode": 200,
            "body": {
                "audio": response["audio"].tolist(),
                "sampling_rate": response["sampling_rate"],
            },
        }
handler = EndpointHandler()
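
# Example local invocation (a minimal sketch; in a deployed endpoint the runtime
# builds the request payload and calls the handler for you). The "inputs" key
# matches what __call__ reads above.
if __name__ == "__main__":
    result = handler({"inputs": "Hello, my dog is cooler than you!"})
    # SpeechT5 generates 16 kHz audio, so sampling_rate should be 16000.
    print(result["statusCode"], result["body"]["sampling_rate"])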