File size: 1,607 Bytes
aaf0168
d65f95a
 
aaf0168
d65f95a
7b5f670
 
aaf0168
 
7b5f670
d65f95a
aaf0168
 
9f3bcf5
402e7d8
aaf0168
d65f95a
aaf0168
d65f95a
 
aaf0168
a4c5e15
d65f95a
a4c5e15
 
aaf0168
d65f95a
 
 
74cf751
d65f95a
 
aaf0168
d65f95a
 
 
 
aaf0168
 
d65f95a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import torch
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from huggingface_hub.inference_api import InferenceApi
from typing import Dict, List, Any


class EndpointHandler:
    """Inference endpoint handler that forwards text-to-speech requests
    to the hosted ``microsoft/speecht5_tts`` model via the HF Inference API.
    """

    def __init__(self, path: str = ""):
        """Create the API client and pre-compute the speaker embedding.

        Args:
            path: Unused; kept for compatibility with the standard
                EndpointHandler constructor signature.
        """
        self.api = InferenceApi(repo_id="microsoft/speecht5_tts", task="text-to-speech")
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Pre-compute the speaker embedding once here instead of rebuilding
        # the tensor on every request. Index 7306 selects a fixed voice;
        # replace with your own selection logic if needed.
        # NOTE(review): currently unused by the API call below (the
        # forward_params hookup is commented out) — kept for when it is wired in.
        self.speaker_embedding = torch.tensor(
            self.embeddings_dataset[7306]["xvector"]
        ).unsqueeze(0)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Synthesize speech for ``data["inputs"]``.

        Args:
            data: Request payload; the text to synthesize is read from
                the ``"inputs"`` key.

        Returns:
            A dict with ``statusCode`` and a ``body`` holding the audio
            samples and sampling rate, or an error message on bad input.
        """
        text = data.get("inputs", "")
        if not text:
            # Guard: don't send an empty request to the remote model.
            return {
                "statusCode": 400,
                "body": {"error": "No 'inputs' text provided."},
            }

        # Run the model remotely; wait_for_model blocks until it is loaded.
        # To pass the pre-computed speaker embedding once supported, use:
        # parameters={"forward_params": {"speaker_embeddings": self.speaker_embedding.tolist()}}
        response = self.api(inputs=text, options={"wait_for_model": True})

        # Persist the audio locally for inspection.
        # NOTE(review): assumes `response` is a dict with "audio" and
        # "sampling_rate" keys — the raw Inference API may return bytes
        # for audio tasks; verify against the deployed client version.
        # Writing to the CWD may fail in read-only deployment environments.
        sf.write("speech.wav", response["audio"], samplerate=response["sampling_rate"])

        # Return the expected response format.
        return {
            "statusCode": 200,
            "body": {
                "audio": response["audio"],  # Consider base64-encoding for JSON transport
                "sampling_rate": response["sampling_rate"],
            },
        }

handler = EndpointHandler()