from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from numpy import dot
from numpy.linalg import norm
import json
import ast
import requests
import gradio as gr
from datetime import datetime
import time
import dataframe_image as dfi

# Preload the sentence-embedding model once at import time
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

tqdm.pandas()
nltk.download('punkt')

print("Packages loaded!")

# Helper functions

def load_pickle():
    # Load the pre-embedded, sentence-exploded mentor dataframe from disk
    master_exploded = pickle.load(open("./Data/master_exploded_current.pkl", 'rb'))
    print("Exploded DF Shape:", master_exploded.shape)
    print("Successfully Loaded!")
    return master_exploded
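
# Assumed schema of the pickled dataframe (inferred from the columns referenced
# below, not confirmed by the source): one row per mentor sentence, with at least
#   id                   - mentor identifier
#   name                 - mentor name
#   tokenized_sentences  - the sentence text for that row
#   raw_embedding        - precomputed all-mpnet-base-v2 embedding of the sentence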

def sentence_embedding_generator(query):
    # query = input('What kind of mentor are you looking for?: ')
    print(f'You entered {query}')
    # Reuse the all-mpnet-base-v2 model loaded at import time rather than
    # re-instantiating it on every query
    print("all-mpnet-base-v2 Model loaded!")
    embeddings = model.encode(query)
    return embeddings, query
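
# Note: all-mpnet-base-v2 returns a 768-dimensional sentence embedding, so the
# precomputed raw_embedding vectors are assumed to have the same dimensionality.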

def cosine_similarity_generator(master_exploded, embeddings, query, filename=time.strftime("%Y%m%d-%H%M%S")):
    # current_datetime = datetime.now()
    print("Current datetime: ", time.strftime("%Y%m%d-%H%M%S"))
    master_exploded['query'] = query
    # Cosine similarity between the query embedding and each precomputed sentence embedding
    master_exploded['cos_sim'] = master_exploded['raw_embedding'].progress_apply(
        lambda x: np.dot(embeddings, x) / (norm(embeddings) * norm(x)))
    master_exploded_top = master_exploded[master_exploded['cos_sim'] > 0.6]
    print("The number of results with cosine similarity > 0.6 are: ", len(master_exploded_top))
    top_k = master_exploded.sort_values(by=['cos_sim'], ascending=False).head(10)
    print("The top k=10 results have a min cosine similarity of: ", top_k['cos_sim'].min())
    # print(master_exploded_top_k)
    # Sum the above-threshold similarities per mentor/sentence group
    cosine_sum_by_name = master_exploded_top.groupby(["id", "name", "tokenized_sentences"]).agg({"cos_sim": ["sum"]}).reset_index()
    print("Taking sum of cosine similarities above 0.6 threshold...")
    # Flatten the MultiIndex columns produced by .agg(), e.g. ('cos_sim', 'sum') -> 'cos_sim_sum'
    cosine_sum_by_name.columns = cosine_sum_by_name.columns.map('_'.join)
    ranked_mentors = cosine_sum_by_name.reset_index().sort_values(by="cos_sim_sum", ascending=False)
    cosine_sum_by_name = cosine_sum_by_name.rename(columns={"id_": "MentorID", "name_": "Name", "tokenized_sentences_": "Sentences"}, errors="raise")
    # path = "./Ranked_Results_Gradio/"
    # ranked_mentors_filename = path+'ranked_mentors_'+str(filename)+'.csv'
    # cos_sum_filename = path+'cos_sum_'+str(filename)+'.csv'
    # top_10_filename = path+'top_10_'+str(filename)+'.csv'
    # above_threshold_filename = path+"above_0.6_threshold_"+str(filename)+".csv"
    # Save 3 files: ranked mentors, top 10 matches based on cosine similarity sum, and all of the results per run.
    # ranked_mentors.head(10).to_csv(ranked_mentors_filename)
    # cosine_sum_by_name.sort_values(by="cos_sim_sum", ascending=False).head(10).to_csv(top_10_filename)
    # cosine_sum_by_name.to_csv(cos_sum_filename)
    # master_exploded_top.sort_values(by="cos_sim", ascending=False).to_csv(above_threshold_filename)
    return master_exploded_top, top_k, cosine_sum_by_name
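
# Sketch of what cosine_sum_by_name looks like (assuming the schema noted above):
# flattening the .agg() MultiIndex yields id_, name_, tokenized_sentences_ and
# cos_sim_sum; the rename then exposes MentorID, Name, Sentences and cos_sim_sum,
# where cos_sim_sum is the summed above-threshold similarity for that group.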

def dataframe_output(cosine_sum_by_name):
    # return master_exploded_top_k
    # Serialize the dataframe to a column-oriented JSON string
    json_df = cosine_sum_by_name.to_json(orient="columns")
    return json_df

def generate_results(input):
    master_exploded = load_pickle()
    embeddings, query = sentence_embedding_generator(str(input))
    # The first returned value is the above-threshold sentence frame, the second
    # is the top 10 individual sentences by cosine similarity
    ranked_mentors, top_10, cosine_sum_by_name = cosine_similarity_generator(master_exploded,
                                                                             embeddings,
                                                                             query,
                                                                             time.strftime("%Y%m%d-%H%M%S"))
    print(cosine_sum_by_name.columns)
    df_output = pd.read_json(dataframe_output(cosine_sum_by_name))
    print(df_output)
    # df_output = dataframe_output(cosine_sum_by_name)
    top_10 = top_10[['name', 'id', 'tokenized_sentences', 'cos_sim']]
    sentence_output = pd.read_json(dataframe_output(top_10))
    print("JSON created...")
    # Rank mentors by summed similarity before taking the first 10 rows
    subset = df_output.sort_values(by="cos_sim_sum", ascending=False).head(10)
    return subset, sentence_output
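
# Example (hypothetical query, for illustration only):
#   mentors_df, sentences_df = generate_results("mentor with NLP and model deployment experience")
#   mentors_df   -> top 10 mentor/sentence groups by summed cosine similarity
#   sentences_df -> the 10 individual sentences most similar to the query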

# Gradio UI: one text input, two dataframe outputs (mentor ranking and best-matching sentences)
iface = gr.Interface(
    fn=generate_results,
    inputs=gr.Textbox(label="What kind of mentor are you looking for?"),
    outputs=[gr.Dataframe(type="pandas"), gr.Dataframe(type="pandas")],
    title="SharpestMinds Mentor Recommender Semantic Search App",
    description="Converts a string query into an embedding, and then compares the aggregate cosine similarity by mentor.",
)

iface.launch(auth=("admin", "russell2023"))