# --- Imports ------------------------------------------------------------------
import pickle
import json
import ast
import time
import requests
from io import StringIO
from datetime import datetime

import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import gradio as gr
import dataframe_image as dfi

from sentence_transformers import SentenceTransformer

# Register tqdm's pandas integration so .progress_apply() is available below.
tqdm.pandas()

# Sentence tokenizer data used by NLTK.
nltk.download('punkt')

# Load the sentence-embedding model once at startup and reuse it for every query.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

print("Packages loaded!")

# --- Functions ----------------------------------------------------------------

def load_pickle():
  """Load the pre-embedded, sentence-exploded mentor DataFrame from disk."""
  with open("./Data/master_exploded_current.pkl", 'rb') as f:
    master_exploded = pickle.load(f)
  print("Exploded DF shape:", master_exploded.shape)
  print("Successfully loaded!")
  return master_exploded
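
# Note (assumption, inferred from how the DataFrame is used below): the pickle is
# expected to hold one row per profile sentence, with at least the columns
# 'id', 'name', 'tokenized_sentences', and 'raw_embedding' (a 768-dimensional
# vector produced by the same all-mpnet-base-v2 model used to embed queries).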




def sentence_embedding_generator(query):
  """Encode the free-text query into a sentence embedding."""
  print(f'You entered: {query}')
  # Reuse the model loaded at startup instead of reloading it on every query.
  embeddings = model.encode(query)
  return embeddings, query
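
# Illustrative sanity check (commented out so it does not run on import; the
# query string is just an example): all-mpnet-base-v2 returns a 768-dimensional
# vector for a single input string.
#   emb, q = sentence_embedding_generator("mentor with NLP and MLOps experience")
#   assert emb.shape == (768,)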


def cosine_similarity_generator(master_exploded, embeddings, query, filename=None):
  """Score every profile sentence against the query embedding and aggregate by mentor."""
  # The previous default (filename=time.strftime(...)) was evaluated once at import
  # time; compute it at call time instead when no filename is supplied.
  if filename is None:
    filename = time.strftime("%Y%m%d-%H%M%S")
  print("Current datetime: ", time.strftime("%Y%m%d-%H%M%S"))
  master_exploded['query'] = query
  # Cosine similarity between the query embedding and each sentence embedding.
  master_exploded['cos_sim'] = master_exploded['raw_embedding'].progress_apply(
      lambda x: np.dot(embeddings, x) / (norm(embeddings) * norm(x)))

  # Keep only sentences that clear the 0.6 similarity threshold.
  master_exploded_top = master_exploded[master_exploded['cos_sim'] > 0.6]
  print("The number of results with cosine similarity > 0.6 is:", len(master_exploded_top))

  # Top k=10 individual sentences by cosine similarity, regardless of threshold.
  top_k = master_exploded.sort_values(by=['cos_sim'], ascending=False).head(10)
  print("The top k=10 results have a min cosine similarity of:", top_k['cos_sim'].min())

  # Aggregate: sum the above-threshold similarities per mentor.
  cosine_sum_by_name = master_exploded_top.groupby(["id", "name", "tokenized_sentences"]).agg({"cos_sim": ["sum"]}).reset_index()
  print("Taking sum of cosine similarities above 0.6 threshold...")
  cosine_sum_by_name.columns = cosine_sum_by_name.columns.map('_'.join)

  ranked_mentors = cosine_sum_by_name.reset_index().sort_values(by="cos_sim_sum", ascending=False)
  cosine_sum_by_name = cosine_sum_by_name.rename(
      columns={"id_": "MentorID", "name_": "Name", "tokenized_sentences_": "Sentences"}, errors="raise")
  # path = "./Ranked_Results_Gradio/"
  # ranked_mentors_filename = path+'ranked_mentors_'+str(filename)+'.csv'
  # cos_sum_filename = path+'cos_sum_'+str(filename)+'.csv'
  # top_10_filename = path+'top_10_'+str(filename)+'.csv'
  # above_threshold_filename = path+"above_0.6_threshold_"+str(filename)+".csv"

  # Save 3 files: ranked mentors, top 10 matches based on cosine similarity sum,
  # and all of the results per run.
  # ranked_mentors.head(10).to_csv(ranked_mentors_filename)
  # cosine_sum_by_name.sort_values(by="cos_sim_sum", ascending=False).head(10).to_csv(top_10_filename)
  # cosine_sum_by_name.to_csv(cos_sum_filename)
  # master_exploded_top.sort_values(by="cos_sim", ascending=False).to_csv(above_threshold_filename)
  return master_exploded_top, top_k, cosine_sum_by_name
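
# Illustrative note (toy numbers, not real data): the ranking above reduces to
#   score(mentor) = sum of that mentor's per-sentence cosine similarities > 0.6.
# For example, a mentor whose sentences score [0.72, 0.65, 0.41] against the
# query contributes 0.72 + 0.65 = 1.37, while a single 0.95 match contributes 0.95.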

def dataframe_output(df):
  """Serialize a DataFrame to a column-oriented JSON string."""
  json_df = df.to_json(orient="columns")
  return json_df


def generate_results(user_query):
  """End-to-end pipeline for the Gradio UI: load data, embed the query, rank mentors."""
  master_exploded = load_pickle()
  embeddings, query = sentence_embedding_generator(str(user_query))
  above_threshold, top_10, cosine_sum_by_name = cosine_similarity_generator(master_exploded,
                                                                            embeddings,
                                                                            query,
                                                                            time.strftime("%Y%m%d-%H%M%S"))
  print(cosine_sum_by_name.columns)
  # Round-trip through JSON; StringIO avoids pandas' deprecation of reading raw JSON strings.
  df_output = pd.read_json(StringIO(dataframe_output(cosine_sum_by_name)))
  print(df_output)
  top_10 = top_10[['name', 'id', 'tokenized_sentences', 'cos_sim']]
  sentence_output = pd.read_json(StringIO(dataframe_output(top_10)))
  print("JSON created...")
  # Show the 10 mentors with the highest aggregate (summed) cosine similarity.
  subset = df_output.sort_values(by="cos_sim_sum", ascending=False).head(10)
  return subset, sentence_output
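
# Illustrative local smoke test (commented out; launch the Gradio app below instead).
# The query string is just an example, not real data:
#   top_mentors, matching_sentences = generate_results("data engineering mentor with cloud experience")
#   print(top_mentors[["Name", "cos_sim_sum"]])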

# Gradio UI: the gr.inputs/gr.outputs namespaces are deprecated, so the
# components are used directly (Gradio 3.x+ component API).
iface = gr.Interface(
    fn=generate_results,
    inputs=gr.Textbox(label="What kind of mentor are you looking for?"),
    outputs=[gr.Dataframe(type="pandas", label="Top mentors by summed cosine similarity"),
             gr.Dataframe(type="pandas", label="Top 10 matching sentences")],
    title="SharpestMinds Mentor Recommender Semantic Search App",
    description="Converts a text query into an embedding, then ranks mentors by their aggregate cosine similarity to the query.",
)

iface.launch(auth=("admin", "russell2023"))