# jchen8000's picture
# Update app.py
# 274400d verified
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import gradio as gr
import zipfile
import random
# Demo sizing: input_count is the number of titles sampled for the dropdowns,
# result_count is the cut-off used when slicing recommendation lists.
input_count = 300
result_count = 21

# Read the MovieLens tables directly out of the archive
with zipfile.ZipFile('ml-latest-small.zip') as archive:
    with archive.open('ml-latest-small/movies.csv') as movies_csv:
        movies = pd.read_csv(movies_csv)
    with archive.open('ml-latest-small/ratings.csv') as ratings_csv:
        ratings = pd.read_csv(ratings_csv)
######################################
#
# Content-based Filtering
#
######################################
# Replace NaN with an empty string so the vectorizer only ever sees text
movies['genres'] = movies['genres'].fillna('')

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute the cosine similarity matrix. TfidfVectorizer L2-normalises each row
# by default, so the plain dot product (linear_kernel) equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from movie title to the movie's row label in `movies`.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique) and leave repeated titles in the index, so a lookup for
# such a title would return a Series instead of a single label. De-duplicate
# on the index instead, keeping the first row for each title.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]
# Function that takes in movie title as input and outputs most similar movies
def get_cb_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies whose genres are most similar to those of ``title``.

    Args:
        title: Movie title to look up in the module-level ``indices`` map.
        cosine_sim: Square matrix of pairwise similarity scores, indexed by
            row position in ``movies``.

    Returns:
        A list of ``(title, score)`` tuples sorted by descending score and
        holding at most ``result_count - 1`` entries. The queried movie
        itself is never part of the list.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title shared by several rows makes the lookup return a Series rather
    # than a single label; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* movie with its similarity score. The queried movie is
    # removed explicitly: movies with identical genres tie with it on score,
    # so it is not guaranteed to sort into first place.
    sim_scores = [
        (row, score)
        for row, score in enumerate(cosine_sim[idx])
        if row != idx
    ]

    # Sort the movies based on the similarity scores (stable, so ties keep
    # their original row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the same number of results as before (result_count - 1)
    top_scores = sim_scores[:result_count - 1]

    # Return the most similar movies with their scores
    return [(movies['title'].iloc[row], score) for row, score in top_scores]
# Gradio interface
def recommend_movies_cb(movie):
    """Format content-based recommendations for ``movie`` as display text.

    Args:
        movie: Title selected in the dropdown; may be empty or ``None``.

    Returns:
        A "Score Title" header followed by one line per recommendation, or a
        short explanatory message when nothing is selected or the title is
        not in the dataset.
    """
    if not movie:
        return "No movie selected. Please select one from the dropdown."
    # Same guard as the collaborative-filtering handlers: an unknown title
    # would otherwise surface as a KeyError from the title lookup.
    if movie not in movies['title'].values:
        return f"Movie '{movie}' not found in the dataset."
    recommendations = get_cb_recommendations(movie)
    format_string = "{:>5.2f} {:<20}"
    rows = [format_string.format(score, title) for title, score in recommendations]
    return "Score Title\n" + "\n".join(rows)
######################################
#
# Collaborative Filtering (Item-based)
#
######################################
# Movie-user rating table: one row per movieId, one column per userId.
# Pairs without a rating come out of the pivot as NaN and are stored as 0.
movie_user_matrix = ratings.pivot(index='movieId', columns='userId', values='rating')
movie_user_matrix = movie_user_matrix.fillna(0)

# Pairwise cosine similarity between the movie rows
movie_similarity = cosine_similarity(movie_user_matrix)

# Label both axes of the similarity matrix with movieId so rows can be
# looked up by id
movie_similarity_df = pd.DataFrame(
    movie_similarity,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)
# Function to get movie recommendations using item-based collaborative filtering
def get_cf_recommendations(movie_title, movie_similarity_df=movie_similarity_df, movies=movies, n=result_count):
    """Return up to ``n`` movies most similar to ``movie_title``.

    Args:
        movie_title: Title to look up in ``movies['title']``.
        movie_similarity_df: Square DataFrame of movie-to-movie similarity
            scores whose index and columns are movieIds.
        movies: DataFrame with at least ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score. The
        list is empty when the title is unknown or none of its movieIds has a
        row in ``movie_similarity_df``. Recommended ids that have no entry in
        ``movies`` are skipped.
    """
    # All movieIds carrying this title (titles are not guaranteed unique)
    candidate_ids = movies.loc[movies['title'] == movie_title, 'movieId']

    # Use the first candidate present in the similarity matrix; an unknown
    # title yields no candidates and therefore no recommendations.
    movie_id = next(
        (mid for mid in candidate_ids if mid in movie_similarity_df.index),
        None,
    )
    if movie_id is None:
        return []

    # Drop the queried movie by id instead of assuming it sorts first: other
    # movies can tie with it on score, in which case position 0 is arbitrary.
    similar_scores = movie_similarity_df.loc[movie_id].drop(labels=movie_id)

    # n highest scores in descending order; ties keep their index order
    top_scores = similar_scores.nlargest(n)

    # Build the id -> title lookup once rather than scanning `movies` for
    # every recommended id
    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
    return [
        (title_by_id[mid], score)
        for mid, score in top_scores.items()
        if mid in title_by_id.index
    ]
# Function for Gradio interface
def recommend_movies_cf(movie_title):
    """Format item-based collaborative-filtering recommendations as text.

    Args:
        movie_title: Title selected in the dropdown; may be empty or ``None``.

    Returns:
        A "Score Title" header followed by one line per recommendation, or a
        short explanatory message when nothing is selected, the title is not
        in the dataset, or no recommendations could be produced.
    """
    if not movie_title:
        return "No movie selected. Please select one from the dropdown."
    if movie_title not in movies['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."
    recommendations = get_cf_recommendations(movie_title)
    # An empty list means the movie has no row in the similarity matrix;
    # say so instead of printing a header with nothing under it.
    if not recommendations:
        return f"No recommendations available for '{movie_title}'."
    format_string = "{:>5.2f} {:<20}"
    rows = [format_string.format(score, title) for title, score in recommendations]
    return "Score Title\n" + "\n".join(rows)
######################################
#
# Collaborative Filtering with Neural Network (Item-based)
#
######################################
# Normalize the ratings: MinMaxScaler rescales each column of the movie-user
# matrix independently
scaler = MinMaxScaler()
movie_user_matrix_scaled = scaler.fit_transform(movie_user_matrix)

# Autoencoder layout: input_dim -> 64 -> encoding_dim -> 64 -> input_dim
encoding_dim = 32
input_dim = movie_user_matrix.shape[1]

input_layer = Input(shape=(input_dim,))
encoder_hidden = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoder_hidden)
decoder_hidden = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoder_hidden)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder to reproduce its own input
autoencoder.fit(
    movie_user_matrix_scaled,
    movie_user_matrix_scaled,
    epochs=50,
    batch_size=64,
    shuffle=True,
    validation_split=0.2,
    verbose=0,
)
# Use the trained autoencoder to predict the complete matrix
predicted_matrix_scaled = autoencoder.predict(movie_user_matrix_scaled)
# Undo the min-max scaling applied before training
predicted_matrix = scaler.inverse_transform(predicted_matrix_scaled)

# Create a DataFrame with the predicted matrix (same labels as the input matrix)
predicted_matrix_df = pd.DataFrame(
    predicted_matrix,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.columns,
)

# Compute the cosine similarity between movies using the predicted matrix
movie_similarity_cfnn = cosine_similarity(predicted_matrix)

# Create a DataFrame with movie similarities. This must wrap the matrix
# computed from the autoencoder's predictions (movie_similarity_cfnn); wrapping
# the plain collaborative-filtering matrix here would make the neural-network
# recommendations identical to the non-neural ones.
movie_similarity_cfnn_df = pd.DataFrame(
    movie_similarity_cfnn,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)
# Function to get movie recommendations using item-based collaborative filtering
def get_cfnn_recommendations(movie_title, movie_similarity_df=movie_similarity_cfnn_df, movies=movies, n=result_count):
    """Return up to ``n`` movies most similar to ``movie_title``.

    By default the similarity scores come from ``movie_similarity_cfnn_df``.

    Args:
        movie_title: Title to look up in ``movies['title']``.
        movie_similarity_df: Square DataFrame of movie-to-movie similarity
            scores whose index and columns are movieIds.
        movies: DataFrame with at least ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score. The
        list is empty when the title is unknown or none of its movieIds has a
        row in ``movie_similarity_df``. Recommended ids that have no entry in
        ``movies`` are skipped.
    """
    # All movieIds carrying this title (titles are not guaranteed unique)
    candidate_ids = movies.loc[movies['title'] == movie_title, 'movieId']

    # Use the first candidate present in the similarity matrix; an unknown
    # title yields no candidates and therefore no recommendations.
    movie_id = next(
        (mid for mid in candidate_ids if mid in movie_similarity_df.index),
        None,
    )
    if movie_id is None:
        return []

    # Drop the queried movie by id instead of assuming it sorts first: other
    # movies can tie with it on score, in which case position 0 is arbitrary.
    similar_scores = movie_similarity_df.loc[movie_id].drop(labels=movie_id)

    # n highest scores in descending order; ties keep their index order
    top_scores = similar_scores.nlargest(n)

    # Build the id -> title lookup once rather than scanning `movies` for
    # every recommended id
    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
    return [
        (title_by_id[mid], score)
        for mid, score in top_scores.items()
        if mid in title_by_id.index
    ]
# Function for Gradio interface
def recommend_movies_cfnn(movie_title):
    """Format neural-network collaborative-filtering recommendations as text.

    Args:
        movie_title: Title selected in the dropdown; may be empty or ``None``.

    Returns:
        A "Score Title" header followed by one line per recommendation, or a
        short explanatory message when nothing is selected, the title is not
        in the dataset, or no recommendations could be produced.
    """
    if not movie_title:
        return "No movie selected. Please select one from the dropdown."
    if movie_title not in movies['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."
    recommendations = get_cfnn_recommendations(movie_title)
    # An empty list means the movie has no row in the similarity matrix;
    # say so instead of printing a header with nothing under it.
    if not recommendations:
        return f"No recommendations available for '{movie_title}'."
    format_string = "{:>5.2f} {:<20}"
    rows = [format_string.format(score, title) for title, score in recommendations]
    return "Score Title\n" + "\n".join(rows)
######################################
#
# Gradio interface
#
######################################
# Size of the full catalogue, shown in the dropdown label
total_movies = len(movies)
# Random sample of input_count titles offered in the dropdowns
movie_list = random.sample(movies['title'].tolist(), input_count)
# Label shown above the movie dropdown; identical on every tab
dropdown_label = f"Select a Movie (Total movies: {total_movies}, randomly list {input_count} for demo purpose.)"

# Explanatory Markdown rendered at the top of each tab
cb_intro_md = """## Movie Recommender - Content-Based Filtering
How it works:
* Use the 'genres' feature of movies, and convert genres into numerical vectors.
* For a given movie, find the most similar movies based on the genre similarity.
* This approach uses genres of movies only, without considering user preferences or viewing history.
* Simple to implement and computationally efficient.
"""

cf_intro_md = """## Movie Recommender - Item-Based Collaborative Filtering
How it works:
* Create a movie-user matrix where rows represent movies and columns represent users, each cell contains the rating a user gave to a movie, or 0 if no rating exists.
* Calculate the cosine similarity between movies based on their rating patterns, results in a movie-movie similarity matrix.
* For a given movie, find the most similar movies based on this similarity matrix, and recommend these movies.
* Simple to implement and computationally efficient, but doesn't handle sparsity well (when many missing ratings).
"""

cfnn_intro_md = """## Movie Recommender - Item-Based Collaborative Filtering with Neural Network
How it works:
* Use a Neural Network to predict the missing values in the movie-user matrix to improve the collaborative filtering recommendations.
* The NN model learns to reconstruct the movie-user matrix, effectively predicting missing ratings. This results in a dense, predicted movie-user matrix.
* Calculate movie-movie similarities using the predicted matrix. And use this similarity matrix to find and recommend similar movies.
* This approach often provides more accurate recommendations especially with large sparse datasets. But more complex to implement and require more computational resources.
"""

# One tab per recommender; each tab gets its own dropdown and output textbox
with gr.Blocks() as iface:
    with gr.Tab("Content-Based Filtering"):
        gr.Markdown(cb_intro_md)
        gr.Interface(
            fn=recommend_movies_cb,
            inputs=gr.Dropdown(movie_list, label=dropdown_label),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            # title="Movie Recommender - Content-Based Filtering",
            description="Select a movie to get recommendations based on content filtering.",
        )
    with gr.Tab("Collaborative Filtering"):
        gr.Markdown(cf_intro_md)
        gr.Interface(
            fn=recommend_movies_cf,
            inputs=gr.Dropdown(movie_list, label=dropdown_label),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            # title="Movie Recommender - Item-Based Collaborative Filtering",
            description="Select a movie to get recommendations based on collaborative filtering.",
        )
    with gr.Tab("Collaborative Filtering with Neural Network"):
        gr.Markdown(cfnn_intro_md)
        gr.Interface(
            fn=recommend_movies_cfnn,
            inputs=gr.Dropdown(movie_list, label=dropdown_label),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            # title="Movie Recommender - Item-Based Collaborative Filtering",
            description="Select a movie to get recommendations based on collaborative filtering.",
        )

# Launch the app
iface.launch()