Spaces:

jchen8000
/

Recommendation_Demo

Sleeping

App Files Files Community

Recommendation_Demo / app.py

jchen8000

Update app.py

274400d verified 6 months ago

raw

history blame contribute delete

11.8 kB

	import pandas as pd
	import numpy as np
	from scipy.sparse import csr_matrix
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.metrics.pairwise import linear_kernel
	from sklearn.preprocessing import MinMaxScaler
	import tensorflow as tf
	from tensorflow.keras.models import Model
	from tensorflow.keras.layers import Input, Dense
	import gradio as gr
	import zipfile
	import random

	# Size of the dropdown sample and the shared result-size setting used by the
	# recommenders below.
	input_count = 300
	result_count = 21

	# Read both MovieLens tables straight from the bundled archive; nothing is
	# extracted to disk.
	with zipfile.ZipFile('ml-latest-small.zip') as archive:
	    with archive.open('ml-latest-small/movies.csv') as movies_csv:
	        movies = pd.read_csv(movies_csv)
	    with archive.open('ml-latest-small/ratings.csv') as ratings_csv:
	        ratings = pd.read_csv(ratings_csv)

	######################################
	#
	# Content-based Filtering
	#
	######################################

	# Vectorise the genre strings with TF-IDF, ignoring English stop words.
	tfidf = TfidfVectorizer(stop_words='english')

	# Missing genres become empty strings so every row can be vectorised.
	movies['genres'] = movies['genres'].fillna('')

	# One TF-IDF row per movie.
	tfidf_matrix = tfidf.fit_transform(movies['genres'])

	# TfidfVectorizer L2-normalises each row by default, so the plain dot product
	# computed by linear_kernel is the cosine similarity.
	cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

	# Reverse map: movie title -> row label in `movies`.
	# Series.drop_duplicates() compares the *values* (the row labels, which are
	# already unique), so it never removed a repeated title; looking such a title
	# up then returned a Series instead of a single label. Deduplicate on the
	# index instead, keeping the first occurrence of each title.
	indices = pd.Series(movies.index, index=movies['title'])
	indices = indices[~indices.index.duplicated(keep='first')]

	def get_cb_recommendations(title, cosine_sim=cosine_sim):
	    """Return the movies whose genre vectors are most similar to ``title``.

	    Args:
	        title: Exact movie title; must be a key of the module-level ``indices``.
	        cosine_sim: Square movie-by-movie similarity matrix whose rows and
	            columns follow the row order of ``movies``.

	    Returns:
	        A list of at most ``result_count - 1`` ``(title, score)`` tuples,
	        highest score first. The queried movie itself is never included.

	    Raises:
	        KeyError: If ``title`` is not present in ``indices``.
	    """
	    idx = indices[title]
	    # A title that occurs more than once yields a Series; use its first row.
	    if isinstance(idx, pd.Series):
	        idx = idx.iloc[0]

	    scores = np.asarray(cosine_sim[idx])

	    # Stable descending order: equal scores keep their original row order.
	    ranked = np.argsort(-scores, kind='stable')

	    # Remove the queried movie by position instead of assuming it is ranked
	    # first: any movie with an identical genre vector ties with it at the top,
	    # so slicing off element 0 could discard a real match and leave the query
	    # itself in the results.
	    ranked = ranked[ranked != idx][:max(result_count - 1, 0)]

	    return [(movies['title'].iloc[i], scores[i]) for i in ranked]

	def recommend_movies_cb(movie):
	    """Gradio callback: format content-based recommendations as plain text.

	    Args:
	        movie: Title chosen in the dropdown; may be empty or ``None``.

	    Returns:
	        A user-facing message when nothing is selected or the title is
	        unknown; otherwise a "Score Title" header followed by one
	        ``score title`` line per recommendation.
	    """
	    if not movie:
	        return "No movie selected. Please select one from the dropdown."

	    # Same guard as the collaborative-filtering callbacks; without it an
	    # unknown title surfaces as a KeyError from the title lookup.
	    if movie not in movies['title'].values:
	        return f"Movie '{movie}' not found in the dataset."

	    recommendations = get_cb_recommendations(movie)
	    format_string = "{:>5.2f} {:<20}"
	    rows = [format_string.format(score, title) for title, score in recommendations]
	    return "Score Title\n" + "\n".join(rows)


	######################################
	#
	# Collaborative Filtering (Item-based)
	#
	######################################
	# Movie-by-user rating table: one row per movieId, one column per userId,
	# with 0 standing in for "not rated".
	movie_user_matrix = (
	    ratings
	    .pivot(index='movieId', columns='userId', values='rating')
	    .fillna(0)
	)

	# Pairwise cosine similarity between the movie rows.
	movie_similarity = cosine_similarity(movie_user_matrix)

	# Same matrix, labelled on both axes by movieId.
	movie_similarity_df = pd.DataFrame(
	    data=movie_similarity,
	    index=movie_user_matrix.index,
	    columns=movie_user_matrix.index,
	)

	def get_cf_recommendations(movie_title, movie_similarity_df=movie_similarity_df, movies=movies, n=result_count):
	    """Return the ``n`` movies most similar to ``movie_title`` by rating pattern.

	    Args:
	        movie_title: Exact title to look up in ``movies``.
	        movie_similarity_df: Square similarity frame labelled by movieId on
	            both axes.
	        movies: Frame with at least ``movieId`` and ``title`` columns.
	        n: Maximum number of recommendations to return.

	    Returns:
	        A list of at most ``n`` ``(title, score)`` tuples, highest score first.
	        The list is empty when the title is unknown or the movie has no row in
	        ``movie_similarity_df``. The queried movie is never included.

	    Raises:
	        KeyError: If a movieId in ``movie_similarity_df`` is missing from
	            ``movies``.
	    """
	    matching_ids = movies.loc[movies['title'] == movie_title, 'movieId']
	    if matching_ids.empty:
	        return []
	    movie_id = matching_ids.iloc[0]

	    if movie_id not in movie_similarity_df.index:
	        return []

	    # Drop the queried movie by label rather than skipping the first sorted
	    # entry: another movie can tie with (or, through rounding, edge past) the
	    # self-similarity, in which case position 0 is not the query.
	    scores = movie_similarity_df.loc[movie_id].drop(labels=movie_id)

	    # Stable sort so equal scores come back in a deterministic order.
	    top_scores = scores.sort_values(ascending=False, kind='mergesort').head(max(n, 0))

	    # Build the movieId -> title map once instead of scanning `movies` for
	    # every recommendation; the first title wins if an id is repeated.
	    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']

	    return [(title_by_id.loc[other_id], score) for other_id, score in top_scores.items()]

	def recommend_movies_cf(movie_title):
	    """Gradio callback: format item-based collaborative-filtering results.

	    Returns a user-facing message when no title is given or the title is not
	    in ``movies``; otherwise a "Score Title" header followed by one
	    ``score title`` line per recommendation.
	    """
	    if not movie_title:
	        return "No movie selected. Please select one from the dropdown."
	    if movie_title not in movies['title'].values:
	        return f"Movie '{movie_title}' not found in the dataset."

	    rows = []
	    for title, score in get_cf_recommendations(movie_title):
	        rows.append("{:>5.2f} {:<20}".format(score, title))
	    return "Score Title\n" + "\n".join(rows)


	######################################
	#
	# Collaborative Filtering with Neural Network (Item-based)
	#
	######################################

	# Rescale each user column to [0, 1] (MinMaxScaler's default range) so the
	# sigmoid output layer can reproduce the values.
	scaler = MinMaxScaler()
	movie_user_matrix_scaled = scaler.fit_transform(movie_user_matrix)

	# Autoencoder layout: users -> 64 -> 32 -> 64 -> users.
	input_dim = len(movie_user_matrix.columns)
	encoding_dim = 32

	input_layer = Input(shape=(input_dim,))
	encoded = Dense(units=64, activation='relu')(input_layer)
	encoded = Dense(units=encoding_dim, activation='relu')(encoded)
	decoded = Dense(units=64, activation='relu')(encoded)
	decoded = Dense(units=input_dim, activation='sigmoid')(decoded)

	autoencoder = Model(inputs=input_layer, outputs=decoded)
	autoencoder.compile(loss='mean_squared_error', optimizer='adam')

	# Fit the network to reproduce its own input; one fifth of the rows is held
	# out for validation and progress output is silenced.
	autoencoder.fit(
	    x=movie_user_matrix_scaled,
	    y=movie_user_matrix_scaled,
	    epochs=50,
	    batch_size=64,
	    shuffle=True,
	    validation_split=0.2,
	    verbose=0,
	)

	# Reconstruct the full matrix and undo the scaling to return to rating units.
	predicted_matrix_scaled = autoencoder.predict(movie_user_matrix_scaled)
	predicted_matrix = scaler.inverse_transform(predicted_matrix_scaled)

	# Reconstructed ratings, labelled like the original movie-user matrix.
	predicted_matrix_df = pd.DataFrame(
	    data=predicted_matrix,
	    index=movie_user_matrix.index,
	    columns=movie_user_matrix.columns,
	)

	# Movie-to-movie cosine similarity computed on the autoencoder's
	# reconstructed ratings.
	movie_similarity_cfnn = cosine_similarity(predicted_matrix)

	# Label the matrix by movieId on both axes. The frame has to wrap
	# movie_similarity_cfnn: wrapping movie_similarity (the plain
	# collaborative-filtering matrix) makes the neural-network recommender
	# return exactly the same results as the non-neural one.
	movie_similarity_cfnn_df = pd.DataFrame(
	    movie_similarity_cfnn,
	    index=movie_user_matrix.index,
	    columns=movie_user_matrix.index,
	)

	def get_cfnn_recommendations(movie_title, movie_similarity_df=movie_similarity_cfnn_df, movies=movies, n=result_count):
	    """Return the ``n`` movies most similar to ``movie_title``.

	    By default the similarities come from ``movie_similarity_cfnn_df``.

	    Args:
	        movie_title: Exact title to look up in ``movies``.
	        movie_similarity_df: Square similarity frame labelled by movieId on
	            both axes.
	        movies: Frame with at least ``movieId`` and ``title`` columns.
	        n: Maximum number of recommendations to return.

	    Returns:
	        A list of at most ``n`` ``(title, score)`` tuples, highest score first.
	        The list is empty when the title is unknown or the movie has no row in
	        ``movie_similarity_df``. The queried movie is never included.

	    Raises:
	        KeyError: If a movieId in ``movie_similarity_df`` is missing from
	            ``movies``.
	    """
	    title_rows = movies[movies['title'] == movie_title]
	    if title_rows.empty:
	        return []

	    movie_id = title_rows['movieId'].iloc[0]
	    if movie_id not in movie_similarity_df.index:
	        return []

	    # Exclude the queried movie by label, then rank. Skipping "the first
	    # sorted entry" is not reliable: a movie with an equal (or, through
	    # rounding, marginally higher) score can occupy that slot. The stable
	    # sort keeps equal scores in a deterministic order.
	    neighbours = (
	        movie_similarity_df.loc[movie_id]
	        .drop(labels=movie_id)
	        .sort_values(ascending=False, kind='mergesort')
	        .head(max(n, 0))
	    )

	    # One movieId -> title map per call instead of a scan of `movies` per
	    # recommendation; the first title wins if an id is repeated.
	    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']

	    return [(title_by_id.loc[other_id], score) for other_id, score in neighbours.items()]

	def recommend_movies_cfnn(movie_title):
	    """Gradio callback: format recommendations from get_cfnn_recommendations.

	    Returns a user-facing message when no title is given or the title is not
	    in ``movies``; otherwise a "Score Title" header followed by one
	    ``score title`` line per recommendation.
	    """
	    if not movie_title:
	        return "No movie selected. Please select one from the dropdown."

	    title_known = movie_title in movies['title'].values
	    if not title_known:
	        return f"Movie '{movie_title}' not found in the dataset."

	    row_template = "{:>5.2f} {:<20}"
	    body = "\n".join(
	        row_template.format(score, title)
	        for title, score in get_cfnn_recommendations(movie_title)
	    )
	    return "Score Title\n" + body


	######################################
	#
	# Gradio interface
	#
	######################################

	# Titles offered in the dropdowns: a random subset of the distinct titles.
	# Deduplicating keeps a repeated title from being listed twice, and capping
	# the sample size stops random.sample from raising ValueError when the
	# dataset holds fewer than input_count titles.
	unique_titles = movies['title'].drop_duplicates().tolist()
	movie_list = random.sample(unique_titles, min(input_count, len(unique_titles)))
	total_movies = len(movies)

	def _recommender_tab(tab_label, explanation_md, callback, description):
	    """Add one tab: an explanatory Markdown block plus a dropdown-to-textbox interface."""
	    with gr.Tab(tab_label):
	        gr.Markdown(explanation_md)
	        gr.Interface(
	            fn=callback,
	            inputs=gr.Dropdown(movie_list, label=f"Select a Movie (Total movies: {total_movies}, randomly list {input_count} for demo purpose.)"),
	            outputs=[gr.Textbox(label="Recommended Movies:")],
	            description=description,
	        )


	# One tab per recommender; all three share the same dropdown/textbox layout.
	with gr.Blocks() as iface:
	    _recommender_tab(
	        "Content-Based Filtering",
	        """## Movie Recommender - Content-Based Filtering
	How it works:
	* Use the 'genres' feature of movies, and convert genres into numerical vectors.
	* For a given movie, find the most similar movies based on the genre similarity.
	* This approach uses genres of movies only, without considering user preferences or viewing history.
	* Simple to implement and computationally efficient.
	""",
	        recommend_movies_cb,
	        "Select a movie to get recommendations based on content filtering.",
	    )

	    _recommender_tab(
	        "Collaborative Filtering",
	        """## Movie Recommender - Item-Based Collaborative Filtering
	How it works:
	* Create a movie-user matrix where rows represent movies and columns represent users, each cell contains the rating a user gave to a movie, or 0 if no rating exists.
	* Calculate the cosine similarity between movies based on their rating patterns, results in a movie-movie similarity matrix.
	* For a given movie, find the most similar movies based on this similarity matrix, and recommend these movies.
	* Simple to implement and computationally efficient, but doesn't handle sparsity well (when many missing ratings).
	""",
	        recommend_movies_cf,
	        "Select a movie to get recommendations based on collaborative filtering.",
	    )

	    _recommender_tab(
	        "Collaborative Filtering with Neural Network",
	        """## Movie Recommender - Item-Based Collaborative Filtering with Neural Network
	How it works:
	* Use a Neural Network to predict the missing values in the movie-user matrix to improve the collaborative filtering recommendations.
	* The NN model learns to reconstruct the movie-user matrix, effectively predicting missing ratings. This results in a dense, predicted movie-user matrix.
	* Calculate movie-movie similarities using the predicted matrix. And use this similarity matrix to find and recommend similar movies.
	* This approach often provides more accurate recommendations especially with large sparse datasets. But more complex to implement and require more computational resources.
	""",
	        recommend_movies_cfnn,
	        "Select a movie to get recommendations based on collaborative filtering.",
	    )

	# Start the Gradio server.
	iface.launch()