import gradio as gr
import tensorflow as tf
from transformers import pipeline

# Image classifier used by `classify_imagen`, constructed once at import time
# with the default arguments of `tf.keras.applications.MobileNetV2`.
# NOTE(review): despite the variable name, this is a MobileNetV2 model, not an
# Inception network.
inception_net = tf.keras.applications.MobileNetV2()
def classify_imagen(inp):
    """Classify an image with the module-level MobileNetV2 model.

    Args:
        inp: Array-like object exposing ``reshape``; it is reshaped to
            ``(-1, 224, 224, 3)`` before preprocessing.
            NOTE(review): presumably a single 224x224 RGB image as delivered
            by the Gradio image widget -- confirm against the caller.

    Returns:
        A dict built from the top 100 decoded predictions, mapping the second
        element of each decoded tuple (as ``str``) to the third element
        (as ``float``).
    """
    top_k = 100
    mobilenet = tf.keras.applications.mobilenet_v2

    batch = mobilenet.preprocess_input(inp.reshape((-1, 224, 224, 3)))
    scores = inception_net.predict(batch).reshape(1, 1000)
    # decode_predictions returns one list per batch row; only the first row
    # is used.
    decoded = mobilenet.decode_predictions(scores, top=top_k)[0]
    return {str(name): float(score) for _, name, score in decoded}


def audio2text(audio):
    """Transcribe *audio* using the module-level ``trans`` pipeline.

    Args:
        audio: Input forwarded unchanged to ``trans``.

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.
    """
    result = trans(audio)
    return result["text"]
    

def text2sentiment(text):
    """Return the ``'label'`` of the first ``classificator`` result for *text*.

    Args:
        text: Input forwarded unchanged to the ``classificator`` pipeline.

    Returns:
        The ``'label'`` entry of the first element of the pipeline output.
    """
    first_result = classificator(text)[0]
    return first_result['label']


# Speech-recognition pipeline called by `audio2text`.
trans = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-large-xlsr-53-spanish",
)

# Text-classification pipeline called by `text2sentiment`.
classificator = pipeline(
    "text-classification",
    model="pysentimiento/robertuito-sentiment-analysis",
)

# Build the three-tab UI: speech transcription, sentiment analysis and image
# classification. The sentiment output widget has its own name so that it is
# no longer shadowed by the image tab's `label`, which previously made the
# wiring correct only by virtue of statement order.
with gr.Blocks() as demo:
    gr.Markdown("# Multimodal Demo")
    with gr.Tabs():

        # Tab 1: record from the microphone and show the transcript.
        with gr.TabItem("Transcribe Audio en español"):
            with gr.Row():
                audio = gr.Audio(source='microphone', type='filepath')
                transcript = gr.Textbox()
            b1 = gr.Button("Transcribe")

        # Tab 2: free text in, sentiment label out.
        with gr.TabItem("Analisis de sentimiento"):
            with gr.Row():
                texto = gr.Textbox()
                sentiment_label = gr.Label()
            b2 = gr.Button("Sentimiento")

        # Tab 3: image in, top-3 predicted classes out.
        with gr.TabItem("Clasificador de imagenes"):
            with gr.Row():
                image = gr.Image(shape=(224, 224))
                label = gr.Label(num_top_classes=3)
            bimage = gr.Button("Clasifica")

    # Event wiring, registered in the same order as before (b1, b2, bimage)
    # and still inside the Blocks context.
    b1.click(audio2text, inputs=audio, outputs=transcript)
    b2.click(text2sentiment, inputs=texto, outputs=sentiment_label)
    bimage.click(classify_imagen, inputs=image, outputs=label)


if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    demo.launch()