"""Gradio demo: a Spanish reverse dictionary backed by Word2Vec embeddings.

The user types a definition; stop words are removed and the remaining words
are sent to the embedding model, whose most similar entries are shown as HTML.
"""
import html
import pickle

import gradio as gr
from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize import word_tokenize

# Use gensim KeyedVectors to read the embeddings (word2vec text format).
wordvectors_file_vec = 'smaller_model_spa.txt'
smaller_model = KeyedVectors.load_word2vec_format(wordvectors_file_vec)

# NOTE(review): pickle.load executes arbitrary code from the file; this is
# only acceptable because stop_words.pkl is expected to ship with the app.
with open('stop_words.pkl', 'rb') as f:
    stop_words = pickle.load(f)


def filter_words(x):
    """Split *x* on whitespace and drop stop words.

    Args:
        x: The raw text typed by the user (``None`` is treated as empty).

    Returns:
        The tokens of *x*, in order and with their original casing, whose
        lowercase form is not in ``stop_words``.
    """
    # Plain whitespace tokenization: nltk's word_tokenize misbehaved on
    # Hugging Face Spaces. split() without an argument also discards the
    # empty tokens that split(' ') produced for repeated/leading/trailing
    # spaces.
    word_tokens = (x or "").split()
    return [w for w in word_tokens if w.lower() not in stop_words]


def reverse_dictionary(definicion):
    """Return an HTML list of the words most similar to a definition.

    Args:
        definicion: Free-text definition of the word being looked for.

    Returns:
        HTML produced by ``list_to_html``. The list is empty when no word of
        the definition is both a non-stop-word and known to the model.
    """
    # most_similar_cosmul raises on unknown keys and on an empty query, so
    # keep only in-vocabulary tokens and short-circuit when none remain.
    words = [w for w in filter_words(definicion) if w in smaller_model]
    if not words:
        return list_to_html([])
    list_similar = smaller_model.most_similar_cosmul(positive=words)
    return list_to_html(list_similar)


def list_to_html(lst, title="Results"):
    """Render ``(word, score)`` pairs as an HTML heading plus bullet list.

    Args:
        lst: Iterable of ``(word, score)`` pairs, as returned by
            ``most_similar_cosmul``.
        title: Heading text shown above the list.

    Returns:
        A string of HTML; the title and the words are HTML-escaped.
    """
    # NOTE(review): the exact markup is a reconstruction (heading + <ul>);
    # adjust the tags if a different layout is wanted.
    items = "".join(
        f"<li>{html.escape(str(word))} ({float(score):.3f})</li>"
        for word, score in lst
    )
    html_str = f"<h3>{html.escape(str(title))}</h3><ul>{items}</ul>"
    return html_str


title = "Diccionario inverso en español"
description = "Un diccionario inverso utilizando embeddings Word2Vec de SBWCA y filtrando palabras de Wikcionario. Creado como demo para Gradio y HuggingFace Spaces."
examples = [
    'angustia porque se te olvido algo',
    'actor mujer',
    'tardanza o lentitud con que se hace algo',
    'miedo a las alturas',
    'vehículo que anda bajo el agua',
    'grupo de lobos que andan juntos',
]

# NOTE(review): gr.inputs / gr.outputs are the legacy Gradio namespaces;
# confirm the pinned Gradio version still provides them.
gr.Interface(
    fn=reverse_dictionary,
    inputs=gr.inputs.Textbox(lines=5, placeholder="Enter your text here..."),
    outputs=gr.outputs.HTML(),
    title=title,
    description=description,
    examples=examples,
).launch()