Spaces:
Sleeping
Sleeping
File size: 4,860 Bytes
446a690 76dd2ad 446a690 76dd2ad 446a690 76dd2ad 446a690 9987ebf 446a690 9987ebf 76dd2ad 446a690 c070508 76dd2ad ed4c499 76dd2ad ed4c499 76dd2ad ed4c499 76dd2ad 0e01be7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# --- Dependencies and one-time setup ---------------------------------------
# Shared sentence-embedding model, instantiated once at import time.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
# Registers `.progress_apply` on pandas objects (apply with a progress bar).
tqdm.pandas()
# Punkt tokenizer data required by nltk's sent_tokenize.
nltk.download('punkt')
from numpy import dot
from numpy.linalg import norm
import json
import ast
import requests
import gradio as gr
from datetime import datetime
import time
import dataframe_image as dfi
print("Packages loaded!")
# write out functions
def load_pickle():
    """Load the pre-computed exploded mentor DataFrame from disk.

    Returns:
        pandas.DataFrame: one row per tokenized sentence per mentor,
        including the pre-computed 'raw_embedding' column used for
        cosine-similarity scoring.
    """
    # Context manager guarantees the file handle is closed even if
    # unpickling raises (the original left the handle open).
    with open("./Data/master_exploded_current.pkl", "rb") as fh:
        master_exploded = pickle.load(fh)
    print("Exploded DF Shape:", master_exploded.shape)
    print("Successfully Loaded!")
    return master_exploded
def sentence_embedding_generator(query):
    """Encode a free-text query into a sentence embedding.

    Args:
        query: the user's search string.

    Returns:
        tuple: (embeddings, query) — the vector produced by the
        all-mpnet-base-v2 model, and the original query string.
    """
    print(f'You entered {query}')
    # Reuse the module-level model created at import time instead of
    # re-instantiating (and potentially re-downloading) the
    # SentenceTransformer on every request, which the original did and
    # which is slow per query.
    print("all-mpnet-base-v2 Model loaded!")
    embeddings = model.encode(query)
    return embeddings, query
def cosine_similarity_generator(master_exploded, embeddings, query, filename=None):
    """Score every sentence against the query embedding and aggregate per mentor.

    Args:
        master_exploded: DataFrame with an array-valued 'raw_embedding'
            column plus 'id', 'name', and 'tokenized_sentences' columns.
        embeddings: query embedding vector to compare against each row.
        query: the raw query string (stored on the frame for traceability).
        filename: optional timestamp tag for the (currently disabled) CSV
            exports. Defaults to the current time at *call* time — the
            original evaluated ``time.strftime`` in the signature, which
            runs once at import and stamps every call with the same value.

    Returns:
        tuple: (rows with cosine similarity > 0.6,
                top-10 rows by cosine similarity,
                per-mentor summed similarity with renamed columns).
    """
    if filename is None:
        filename = time.strftime("%Y%m%d-%H%M%S")
    print("Current datetime: ", time.strftime("%Y%m%d-%H%M%S"))
    # NOTE(review): mutates the caller's DataFrame in place (adds 'query'
    # and 'cos_sim' columns); callers in this file pass a freshly loaded frame.
    master_exploded['query'] = query
    master_exploded['cos_sim'] = master_exploded['raw_embedding'].progress_apply(
        lambda x: np.dot(embeddings, x) / (norm(embeddings) * norm(x)))
    master_exploded_top = master_exploded[master_exploded['cos_sim'] > 0.6]
    print("The number of results with cosine similarity > 0.6 are: ",
          len(master_exploded_top))
    # Sort once and reuse for both the return value and the min() print
    # (the original re-sorted the whole frame just to print the min).
    top_k = master_exploded.sort_values(by=['cos_sim'], ascending=False).head(10)
    print(" The top k=10 results have a min cosine similarity of: ",
          top_k['cos_sim'].min())
    cosine_sum_by_name = (master_exploded_top
                          .groupby(["id", "name", "tokenized_sentences"])
                          .agg({"cos_sim": ["sum"]})
                          .reset_index())
    print("Taking sum of cosine similarities above 0.6 threshold...")
    # Flatten the MultiIndex columns: ('id','') -> 'id_', ('cos_sim','sum')
    # -> 'cos_sim_sum', then rename to the display names the UI expects.
    cosine_sum_by_name.columns = cosine_sum_by_name.columns.map('_'.join)
    cosine_sum_by_name = cosine_sum_by_name.rename(
        columns={"id_": "MentorID", "name_": "Name",
                 "tokenized_sentences_": "Sentences"},
        errors="raise")
    # CSV export of ranked results is intentionally disabled in this build;
    # `filename` is retained so the export can be re-enabled per run.
    return master_exploded_top, top_k, cosine_sum_by_name
def dataframe_output(cosine_sum_by_name):
    """Serialize a results DataFrame to a column-oriented JSON string."""
    serialized = cosine_sum_by_name.to_json(orient="columns")
    return serialized
def generate_results(input):
    """Gradio entry point: embed the query, rank mentors, return two frames.

    Args:
        input: raw query text from the Gradio textbox (name kept for
            interface compatibility even though it shadows the builtin).

    Returns:
        tuple: (first 10 rows of the aggregated per-mentor scores,
                top-10 matching sentences with their cosine similarities),
        both as DataFrames round-tripped through JSON.
    """
    master_exploded = load_pickle()
    embeddings, query = sentence_embedding_generator(str(input))
    run_stamp = time.strftime("%Y%m%d-%H%M%S")
    ranked_mentors, top_10, cosine_sum_by_name = cosine_similarity_generator(
        master_exploded, embeddings, query, run_stamp)
    print(cosine_sum_by_name.columns)
    df_output = pd.read_json(dataframe_output(cosine_sum_by_name))
    print(df_output)
    top_10 = top_10[['name', 'id', 'tokenized_sentences', 'cos_sim']]
    sentence_output = pd.read_json(dataframe_output(top_10))
    print("JSON created...")
    subset = df_output.head(10)
    return subset, sentence_output
# --- Gradio UI and app entry point -----------------------------------------
import os

iface = gr.Interface(
    fn=generate_results,
    inputs=gr.inputs.Textbox(label="What kind of mentor are you looking for?"),
    outputs=[gr.outputs.Dataframe(type="pandas"), gr.outputs.Dataframe(type="pandas")],
    title="SharpestMinds Mentor Recommender Semantic Search App",
    description="Converts a string query into an embedding, and then compares the aggregate cosine similarity by mentor.",
)
# SECURITY: credentials were hard-coded in source (and thus in version
# control).  Allow them to be overridden via environment variables; the old
# values remain the defaults so existing deployments keep working, but the
# password should be rotated and supplied via the environment.
iface.launch(auth=(os.environ.get("GRADIO_AUTH_USER", "admin"),
                   os.environ.get("GRADIO_AUTH_PASSWORD", "russell2023")))
|