import torch
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from huggingface_hub.inference_api import InferenceApi
from typing import Dict, List, Any


class EndpointHandler:
    """Inference-endpoint handler that synthesizes speech from text.

    Delegates text-to-speech to the hosted ``microsoft/speecht5_tts`` model
    through the Hugging Face Inference API rather than running it locally.
    """

    def __init__(self, path=""):
        # `path` is part of the HF endpoint-handler contract; it is unused here
        # because inference is delegated to the remote API, not a local model.
        self.api = InferenceApi(repo_id="microsoft/speecht5_tts", task="text-to-speech")
        # Speaker x-vector embeddings; index 7306 (used below) picks one fixed voice.
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

    def __call__(self, data):
        """Handle a single inference request.

        Parameters:
            data: request payload; the text to synthesize is read from the
                "inputs" key (defaults to "" when absent).

        Returns:
            dict with "statusCode" 200 and a "body" holding the raw audio
            samples and sampling rate exactly as returned by the remote model.
        """
        text = data.get("inputs", "")

        # Extract speaker_embedding using the index from your dataset, or
        # replace with your own logic.
        speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # Convert embedding to list to avoid serialization issues
        speaker_embedding_list = speaker_embedding.tolist()

        # Use the API to run the model.
        # NOTE(review): the speaker embedding above is computed but never sent —
        # the `parameters=` line below was deliberately disabled, so the remote
        # model's default voice is used. Confirm whether this is intended.
        # parameters={"forward_params": {"speaker_embeddings": speaker_embedding_list}}
        response = self.api(inputs=text, options={"wait_for_model": True})

        # Write the response audio to a file.
        # Note: this might not be possible in all environments; ensure this is
        # suitable for your deployment.
        sf.write("speech.wav", response["audio"], samplerate=response["sampling_rate"])

        # Return the expected response format.
        return {
            "statusCode": 200,
            "body": {
                "audio": response["audio"],  # Consider encoding this to a suitable format
                "sampling_rate": response["sampling_rate"],
            },
        }


handler = EndpointHandler()