import html
import pickle

from gensim.models.keyedvectors import KeyedVectors
import gradio as gr
from nltk.tokenize import word_tokenize


# Load the word embeddings with gensim's KeyedVectors. The file is read with
# load_word2vec_format's defaults, i.e. as a word2vec *text* file.
wordvectors_file_vec = 'smaller_model_spa.txt'
smaller_model = KeyedVectors.load_word2vec_format(wordvectors_file_vec)
# Load the pickled stop-word container; filter_words uses it for membership
# tests on lowercased tokens.
# NOTE(review): pickle.load can execute arbitrary code from the file, so this
# is only safe if stop_words.pkl is a trusted file shipped with the app —
# confirm it is never user-supplied.
with open('stop_words.pkl', 'rb') as f:
    stop_words = pickle.load(f)


def filter_words(x):
    """Split a definition into words and drop the stop words.

    Args:
        x: Free-text definition to tokenize.

    Returns:
        list: The whitespace-separated tokens of ``x``, in their original
        order and casing, excluding every token whose lowercase form is in
        the module-level ``stop_words`` container.
    """
    # Plain whitespace tokenization is used on purpose: per the original
    # author's note, nltk's tokenizer misbehaved on Hugging Face Spaces.
    # split() with no argument treats any run of whitespace (repeated spaces,
    # tabs, newlines) as one separator and never yields empty tokens, whereas
    # split(' ') produced '' tokens and left newline-joined words glued.
    return [w for w in x.split() if w.lower() not in stop_words]

def reverse_dictionary(definicion):
    """Suggest words that match a definition and render them as HTML.

    Args:
        definicion: Free-text definition to look up.

    Returns:
        str: The HTML produced by ``list_to_html`` for the (word, score)
        pairs returned by ``smaller_model.most_similar_cosmul``. When no
        usable query word remains, the HTML for an empty result list.
    """
    # Keep only words present in the embedding vocabulary:
    # most_similar_cosmul raises KeyError on any unknown word, which would
    # fail the whole request because of a single typo or rare term.
    words = [w for w in filter_words(definicion) if w in smaller_model]
    if not words:
        # An empty query makes gensim raise ValueError; show an empty result
        # list instead of an error.
        return list_to_html([])
    list_similar = smaller_model.most_similar_cosmul(positive=words)
    return list_to_html(list_similar)

def list_to_html(lst, title="Results"):
    """Render (word, score) pairs as an HTML heading followed by a list.

    Args:
        lst: Iterable of ``(word, score)`` pairs; ``score`` must support the
            ``.2f`` format specifier.
        title: Heading text placed inside the ``<h3>`` element.

    Returns:
        str: ``<h3>title</h3><ul>...</ul>`` with one ``<li>`` per pair, the
        word in bold and the score formatted to two decimals.
    """
    # Escape the text parts so characters such as "<" or "&" in a word or in
    # the title are displayed literally instead of being parsed as markup.
    items = "".join(
        f"<li><b>{html.escape(str(word))}</b>: {score:.2f}</li>"
        for word, score in lst
    )
    return f"<h3>{html.escape(str(title))}</h3><ul>{items}</ul>"


# User-facing texts for the Gradio page (intentionally in Spanish).
title = "Diccionario inverso en español"
description = (
    "Un diccionario inverso utilizando embeddings Word2Vec de SBWCA y "
    "filtrando palabras de Wikcionario. Creado como demo para Gradio y "
    "HuggingFace Spaces."
)

# Sample definitions passed to the interface as ready-made example inputs.
examples = [
    "angustia porque se te olvido algo",
    "actor mujer",
    "tardanza o lentitud con que se hace algo",
    "miedo a las alturas",
    "vehículo que anda bajo el agua",
    "grupo de lobos que andan juntos",
]


# Build the web UI around reverse_dictionary (multi-line text in, HTML out)
# and start serving it as soon as the module is executed.
# NOTE(review): gr.inputs / gr.outputs are the legacy component namespaces;
# newer Gradio releases expose gr.Textbox / gr.HTML instead — confirm the
# pinned Gradio version before upgrading.
gr.Interface(
    fn=reverse_dictionary,
    inputs=gr.inputs.Textbox(lines=5, placeholder="Enter your text here..."),
    outputs=gr.outputs.HTML(),
    title=title,
    description=description,
    examples=examples,
).launch()