# NOTE(review): removed non-Python extraction artifacts ("Spaces:" / "Configuration error")
# that preceded the shebang and made the file unparseable.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Hamza Farooq
"""
# Standard library
import os
import time
import pickle as pkl
from collections import Counter
from heapq import nlargest
from string import punctuation

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy.spatial
import spacy
import streamlit as st
import torch
import transformers
from sentence_transformers import SentenceTransformer, util
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from transformers import BartTokenizer, BartForConditionalGeneration
from wordcloud import WordCloud

# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Shared spaCy pipeline, loaded once at import time and reused by main().
nlp = spacy.load("en_core_web_sm")
def main():
    """Streamlit entry point: semantic search over Paris hotel reviews.

    Loads pre-computed sentence embeddings (one vector per hotel built from
    all of its reviews joined together, plus one vector per individual
    review), takes a free-text query from the user, and displays the five
    most similar hotels with a summary and their most relevant reviews.

    Side effects only (reads CSV/NPY files from the working directory and
    renders Streamlit widgets); returns None.
    """
    import re
    from tqdm import tqdm

    # Page chrome must be configured before any other Streamlit call.
    st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈")

    def load_model():
        # Small, fast general-purpose sentence encoder used for both the
        # query and (offline) the corpus embeddings.
        return SentenceTransformer('all-MiniLM-L6-v2')

    embedder = load_model()

    st.title("Parisian Hotel Finder")
    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            """
-   This app allows you to search for hotels based on what you're looking for, rather than just cities - it helps with reducing time to go through exhaustive reviews for each hotel!
-   It uses an innovative semantic search approach that leverages multiple NLP embeddings and relies on [Transformers] (https://huggingface.co/transformers/) 🤗.
            """
        )

    # ---- Load data ------------------------------------------------------
    df_all = pd.read_csv('combined_paris.csv')
    df_all = df_all[['Hotel', 'review']].drop_duplicates().reset_index(drop=True)
    summary_hotel = pd.read_csv('df_combined_paris.csv')

    # One row per hotel with all of its reviews concatenated into one text.
    df_combined = (
        df_all.sort_values(['Hotel'])
        .groupby('Hotel', sort=False).review
        .apply(''.join)
        .reset_index(name='all_review')
    )

    # Keep only alphanumerics and whitespace, then lowercase.
    # BUG FIX: the original class was [^a-zA-z0-9\s]; the A-z range also
    # matched [ \ ] ^ _ and backtick, letting those characters through.
    df_combined['all_review'] = df_combined['all_review'].apply(
        lambda text: re.sub(r'[^a-zA-Z0-9\s]', '', text)
    )
    df_combined['all_review'] = df_combined['all_review'].str.lower()

    df = df_combined
    # Map cleaned review-blob -> hotel name; the blobs (dict keys) are the
    # search corpus, row-aligned with the pre-computed embeddings below.
    df_sentences = df_combined.set_index("all_review")["Hotel"].to_dict()
    corpus = [str(d) for d in tqdm(df_sentences.keys())]

    # Pre-computed offline with the same encoder:
    #   embeddings_review.npy - one vector per hotel (joined reviews)
    #   embeddings_h_r.npy    - one vector per individual review (df_all order)
    corpus_embeddings = np.load('embeddings_review.npy')
    corpus_embeddings_h = np.load('embeddings_h_r.npy')

    def plot_cloud(wordcloud):
        """Render a WordCloud image in the app."""
        # BUG FIX: the original called st.pyplot.figure(...), which does not
        # exist; build a matplotlib figure and hand it to st.pyplot instead.
        fig = plt.figure(figsize=(20, 10))
        plt.imshow(wordcloud)
        plt.axis("off")
        st.pyplot(fig)

    sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
    userinput = st.text_input(
        'Tell us what are you looking in your hotel?',
        sampletext,
        autocomplete="on",
    )

    # Nothing typed yet (or the placeholder left untouched): prompt and stop.
    if not userinput or userinput == sampletext:
        st.write("Please enter a query to get results")
        return

    query = [str(userinput)]
    doc = nlp(str(userinput))

    top_k = min(5, len(corpus))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every hotel's joined-review
    # embedding; keep the top_k best-matching hotels.
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    # Show the named entities spaCy recognised in the query.
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    st.markdown(ent_html, unsafe_allow_html=True)

    st.write("##")
    st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
    st.write("\n\n======================\n\n")

    for score, idx in zip(top_results[0], top_results[1]):
        row_dict = df.loc[df['all_review'] == corpus[idx]]
        hotel_name = row_dict['Hotel'].values[0]
        st.subheader(hotel_name)

        hotel_subset = df_all.loc[df_all['Hotel'] == hotel_name]
        hotel_sub = summary_hotel.loc[summary_hotel['Hotel'] == hotel_name]

        st.caption("Review Summary:")
        st.write(hotel_sub['summary'].values[0])
        st.caption("Relevancy: {:.4f}".format(score))
        st.caption("Relevant reviews:")

        # Individual reviews for this hotel, in df_all row order — the same
        # order as the rows of corpus_embeddings_h, so direct ndarray
        # indexing replaces the original O(n*m) enumerate/membership scan.
        corpus_h = [str(r) for r in hotel_subset['review']]
        sub_embeddings = corpus_embeddings_h[hotel_subset.index.to_numpy()]

        cos_scores_h = util.pytorch_cos_sim(query_embedding, sub_embeddings)[0]
        # BUG FIX: a hotel can have fewer than top_k reviews; clamp k so
        # torch.topk cannot raise.
        top_results_h = torch.topk(cos_scores_h, k=min(top_k, len(corpus_h)))
        # Renamed loop variables so they no longer shadow the outer loop's.
        for score_h, idx_h in zip(top_results_h[0], top_results_h[1]):
            st.write(corpus_h[idx_h])
# Script entry point: launch the Streamlit app.
if __name__ == "__main__":
    main()