from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from numpy import dot
from numpy.linalg import norm
import json
import ast
import requests
import gradio as gr
from datetime import datetime
import time
import dataframe_image as dfi

# Preload the sentence-embedding model once at import time
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

tqdm.pandas()
nltk.download('punkt')

print("Packages loaded!")

# Helper functions

def load_pickle():
    # Load the pre-embedded, sentence-exploded mentor dataframe from disk
    master_exploded = pickle.load(open("./Data/master_exploded_current.pkl", 'rb'))
    print("Exploded DF Shape:", master_exploded.shape)
    print("Successfully Loaded!")
    return master_exploded
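
# Assumed schema of the pickled dataframe (inferred from the columns referenced
# below, not confirmed by the source): one row per mentor sentence, with at least
#   id                   - mentor identifier
#   name                 - mentor name
#   tokenized_sentences  - the sentence text for that row
#   raw_embedding        - precomputed all-mpnet-base-v2 embedding of the sentence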

def sentence_embedding_generator(query):
    # query = input('What kind of mentor are you looking for?: ')
    print(f'You entered {query}')
    # Reuse the all-mpnet-base-v2 model loaded at import time rather than
    # re-instantiating it on every query
    print("all-mpnet-base-v2 Model loaded!")
    embeddings = model.encode(query)
    return embeddings, query
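
# Note: all-mpnet-base-v2 returns a 768-dimensional sentence embedding, so the
# precomputed raw_embedding vectors are assumed to have the same dimensionality.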

def cosine_similarity_generator(master_exploded, embeddings, query, filename=time.strftime("%Y%m%d-%H%M%S")):
    # current_datetime = datetime.now()
    print("Current datetime: ", time.strftime("%Y%m%d-%H%M%S"))
    master_exploded['query'] = query
    # Cosine similarity between the query embedding and each precomputed sentence embedding
    master_exploded['cos_sim'] = master_exploded['raw_embedding'].progress_apply(
        lambda x: np.dot(embeddings, x) / (norm(embeddings) * norm(x)))
    master_exploded_top = master_exploded[master_exploded['cos_sim'] > 0.6]
    print("The number of results with cosine similarity > 0.6 are: ", len(master_exploded_top))
    top_k = master_exploded.sort_values(by=['cos_sim'], ascending=False).head(10)
    print("The top k=10 results have a min cosine similarity of: ", top_k['cos_sim'].min())
    # print(master_exploded_top_k)
    # Sum the above-threshold similarities per mentor/sentence group
    cosine_sum_by_name = master_exploded_top.groupby(["id", "name", "tokenized_sentences"]).agg({"cos_sim": ["sum"]}).reset_index()
    print("Taking sum of cosine similarities above 0.6 threshold...")
    # Flatten the MultiIndex columns produced by .agg(), e.g. ('cos_sim', 'sum') -> 'cos_sim_sum'
    cosine_sum_by_name.columns = cosine_sum_by_name.columns.map('_'.join)
    ranked_mentors = cosine_sum_by_name.reset_index().sort_values(by="cos_sim_sum", ascending=False)
    cosine_sum_by_name = cosine_sum_by_name.rename(columns={"id_": "MentorID", "name_": "Name", "tokenized_sentences_": "Sentences"}, errors="raise")
    # path = "./Ranked_Results_Gradio/"
    # ranked_mentors_filename = path+'ranked_mentors_'+str(filename)+'.csv'
    # cos_sum_filename = path+'cos_sum_'+str(filename)+'.csv'
    # top_10_filename = path+'top_10_'+str(filename)+'.csv'
    # above_threshold_filename = path+"above_0.6_threshold_"+str(filename)+".csv"
    # Save 3 files: ranked mentors, top 10 matches based on cosine similarity sum, and all of the results per run.
    # ranked_mentors.head(10).to_csv(ranked_mentors_filename)
    # cosine_sum_by_name.sort_values(by="cos_sim_sum", ascending=False).head(10).to_csv(top_10_filename)
    # cosine_sum_by_name.to_csv(cos_sum_filename)
    # master_exploded_top.sort_values(by="cos_sim", ascending=False).to_csv(above_threshold_filename)
    return master_exploded_top, top_k, cosine_sum_by_name
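
# Sketch of what cosine_sum_by_name looks like (assuming the schema noted above):
# flattening the .agg() MultiIndex yields id_, name_, tokenized_sentences_ and
# cos_sim_sum; the rename then exposes MentorID, Name, Sentences and cos_sim_sum,
# where cos_sim_sum is the summed above-threshold similarity for that group.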

def dataframe_output(cosine_sum_by_name):
    # return master_exploded_top_k
    # Serialize the dataframe to a column-oriented JSON string
    json_df = cosine_sum_by_name.to_json(orient="columns")
    return json_df

def generate_results(input):
    master_exploded = load_pickle()
    embeddings, query = sentence_embedding_generator(str(input))
    # The first returned value is the above-threshold sentence frame, the second
    # is the top 10 individual sentences by cosine similarity
    ranked_mentors, top_10, cosine_sum_by_name = cosine_similarity_generator(master_exploded,
                                                                             embeddings,
                                                                             query,
                                                                             time.strftime("%Y%m%d-%H%M%S"))
    print(cosine_sum_by_name.columns)
    df_output = pd.read_json(dataframe_output(cosine_sum_by_name))
    print(df_output)
    # df_output = dataframe_output(cosine_sum_by_name)
    top_10 = top_10[['name', 'id', 'tokenized_sentences', 'cos_sim']]
    sentence_output = pd.read_json(dataframe_output(top_10))
    print("JSON created...")
    # Rank mentors by summed similarity before taking the first 10 rows
    subset = df_output.sort_values(by="cos_sim_sum", ascending=False).head(10)
    return subset, sentence_output
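
# Example (hypothetical query, for illustration only):
#   mentors_df, sentences_df = generate_results("mentor with NLP and model deployment experience")
#   mentors_df   -> top 10 mentor/sentence groups by summed cosine similarity
#   sentences_df -> the 10 individual sentences most similar to the query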

# Gradio UI: one text input, two dataframe outputs (mentor ranking and best-matching sentences)
iface = gr.Interface(
    fn=generate_results,
    inputs=gr.Textbox(label="What kind of mentor are you looking for?"),
    outputs=[gr.Dataframe(type="pandas"), gr.Dataframe(type="pandas")],
    title="SharpestMinds Mentor Recommender Semantic Search App",
    description="Converts a string query into an embedding, and then compares the aggregate cosine similarity by mentor.",
)

iface.launch(auth=("admin", "russell2023"))