File size: 2,934 Bytes
cbab4ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56cf024
 
 
 
 
 
 
 
 
 
cbab4ab
 
56cf024
cbab4ab
 
 
56cf024
 
cbab4ab
 
 
 
03fc7c5
 
 
 
 
 
 
cbab4ab
 
 
 
 
 
 
 
 
 
03fc7c5
 
 
 
cbab4ab
56cf024
cbab4ab
56cf024
 
cbab4ab
 
 
 
 
 
56cf024
 
 
 
cbab4ab
 
 
56cf024
cbab4ab
 
56cf024
cbab4ab
56cf024
 
 
cbab4ab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from transformers import pipeline
import torch

import os
import openai
from dotenv import load_dotenv
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

# Whisper-base ASR model, served through the Hugging Face pipeline API.
# Loaded once at import time so every transcription call reuses it.
model_id = "openai/whisper-base" 
pipe = pipeline("automatic-speech-recognition", model=model_id)

def transcribe_speech(filepath):
    """Transcribe the audio file at *filepath* to text with the Whisper pipeline.

    Long recordings are split into 30-second windows and decoded in
    batches of 8; decoding is forced to Spanish transcription.
    """
    # Decoding options; update with the language you've fine-tuned on.
    decode_opts = {
        "task": "transcribe",
        "language": "spanish",
    }
    result = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs=decode_opts,
        chunk_length_s=30,
        batch_size=8,
    )
    return result["text"]


# Load environment variables from the .env file (for running locally).
load_dotenv()
# Fail fast with a KeyError if OPENAI_API_KEY is not configured.
openai.api_key = os.environ['OPENAI_API_KEY']


def clear_chat():
    """Reset the module-level chat history to an empty list.

    Rebinds the global ``chat_history`` so subsequent queries start a
    fresh conversation.
    """
    # Fixed: body used a non-standard 5-space indent, inconsistent with
    # the 4-space convention used everywhere else in this file.
    global chat_history
    chat_history = []


def query_chatgpt(message, chat_history):
    """Send *message* to gpt-3.5-turbo and record both turns in *chat_history*.

    Mutates *chat_history* in place (user turn appended before the call,
    assistant turn after) and returns ``(response_text, chat_history)``.
    """
    chat_history.append({'role': 'user', 'content': str(message)})
    print("Preguntando " + message)
    print("historial", chat_history)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_history,
        temperature=0.5,
        max_tokens=256,
    )
    response = completion.choices[0].message.content
    chat_history.append({'role': 'assistant', 'content': str(response)})
    return response, chat_history




# models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
#     "facebook/tts_transformer-es-css10",
#     arg_overrides={"vocoder": "hifigan", "fp16": False}
# )
# model = models[0]
# TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
# generator = task.build_generator([model], cfg)

# text = "Había una vez."

# sample = TTSHubInterface.get_model_input(task, text)
# wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)

# ipd.Audio(wav, rate=rate)
from tts import synthesize


# def syn_facebookmms(text):
#     sample = TTSHubInterface.get_model_input(task, text)
#     wav,rate = TTSHubInterface.get_prediction(task, model, generator, sample)
#     return wav,rate

def answer_question(filepath, chat_history):
    """One full voice-assistant turn: audio in -> transcript -> LLM -> audio out.

    Returns ``(rate, audio)`` suitable for a Gradio Audio output.
    """
    question = transcribe_speech(filepath)
    response, chat_history = query_chatgpt(question, chat_history)
    print("historial", chat_history)
    # Spanish ("spa") TTS via the local tts module; second argument kept at 1
    # as in the original call — its meaning is defined by tts.synthesize.
    rate, audio = synthesize(response, 1, "spa")
    print(audio)
    return rate, audio

def reset_state(chat_history):
    """Gradio reset callback: discard *chat_history* and return a fresh, empty one."""
    return []


import gradio as gr
# Minimal voice-chat UI: record a question with the microphone, hear the answer.
with gr.Blocks() as demo:
    # Per-session conversation history, threaded through both callbacks.
    chat_history = gr.State([])
    # NOTE(review): `source=` is the pre-4.0 Gradio Audio API (4.x uses
    # `sources=[...]`) — confirm against the pinned gradio version.
    entrada = gr.Audio(source="microphone",type="filepath")
    boton = gr.Button("Responder")
    button = gr.Button("Reset State")
    salida = gr.Audio()
    # Respond: (audio file, history) -> spoken answer.
    boton.click(answer_question,[entrada,chat_history],salida)
    # Reset: replace the stored history with an empty list.
    button.click(reset_state,chat_history,chat_history)
  
demo.launch(debug=True)