Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
from scipy.sparse import csr_matrix | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sklearn.metrics.pairwise import linear_kernel | |
from sklearn.preprocessing import MinMaxScaler | |
import tensorflow as tf | |
from tensorflow.keras.models import Model | |
from tensorflow.keras.layers import Input, Dense | |
import gradio as gr | |
import zipfile | |
import random | |
# Number of titles offered in each dropdown of the demo UI.
input_count = 300
# Size parameter shared by the recommenders defined further down.
result_count = 21

# Read the two MovieLens tables straight out of the zip archive,
# without unpacking it to disk.
with zipfile.ZipFile('ml-latest-small.zip') as archive:
    with archive.open('ml-latest-small/movies.csv') as movies_file:
        movies = pd.read_csv(movies_file)
    with archive.open('ml-latest-small/ratings.csv') as ratings_file:
        ratings = pd.read_csv(ratings_file)
######################################
#
# Content-based Filtering
#
######################################
# Turn each movie's genre string into a TF-IDF vector (English stop words removed).
# NOTE(review): the vectorizer's default token pattern splits on punctuation, so a
# hyphenated genre such as "Sci-Fi" becomes two tokens - confirm this is intended.
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string so the vectorizer only ever sees text
movies['genres'] = movies['genres'].fillna('')

# Construct the TF-IDF matrix (one row per movie)
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# TF-IDF rows are L2-normalised by default, so the plain dot product computed by
# linear_kernel is the cosine similarity between movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse map: movie title -> row position in `movies`.
# De-duplicate on the *title* (the index). Calling drop_duplicates() on the Series
# would compare the row positions, which are already unique, and therefore keep
# duplicate titles, making `indices[title]` return a Series instead of a scalar.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]
# Function that takes in movie title as input and outputs most similar movies
def get_cb_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies whose genres are most similar to *title*.

    Args:
        title: Movie title to look up in the module-level ``indices`` map.
        cosine_sim: Square movie-by-movie similarity matrix, indexed by row
            position in ``movies``.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending similarity,
        containing at most ``result_count - 1`` entries and never the query
        movie itself.

    Raises:
        KeyError: If *title* is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Stable descending sort: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. It is not guaranteed to sort first,
    # because other movies with identical genres also score 1.0.
    order = order[order != idx][:max(result_count - 1, 0)]

    return [(movies['title'].iloc[i], scores[i]) for i in order]
# Gradio interface
def recommend_movies_cb(movie):
    """Format content-based recommendations for *movie* as display text.

    Args:
        movie: Title chosen in the dropdown; may be empty or ``None``.

    Returns:
        A help message when nothing is selected or the title is unknown,
        otherwise a header line followed by one ``score title`` line per
        recommendation.
    """
    if not movie:
        return "No movie selected. Please select one from the dropdown."
    # Same guard as the collaborative-filtering handlers: without it an
    # unknown title surfaces as a KeyError from the title lookup.
    if movie not in movies['title'].values:
        return f"Movie '{movie}' not found in the dataset."

    recommendations = get_cb_recommendations(movie)
    rows = [f"{score:>5.2f} {title:<20}" for title, score in recommendations]
    return "Score Title\n" + "\n".join(rows)
######################################
#
# Collaborative Filtering (Item-based)
#
######################################
# Movie-user matrix: one row per movieId, one column per userId, each cell the
# rating given; combinations without a rating are filled with 0.
movie_user_matrix = ratings.pivot(index='movieId', columns='userId', values='rating')
movie_user_matrix = movie_user_matrix.fillna(0)

# Pairwise cosine similarity between the movie rows
movie_similarity = cosine_similarity(movie_user_matrix)

# Label both axes of the similarity matrix with movieId for lookup by id
movie_similarity_df = pd.DataFrame(
    movie_similarity,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)
# Function to get movie recommendations using item-based collaborative filtering
def get_cf_recommendations(movie_title, movie_similarity_df=movie_similarity_df, movies=movies, n=result_count):
    """Return the *n* movies whose rating patterns are most similar to *movie_title*.

    Args:
        movie_title: Title of the query movie.
        movie_similarity_df: Square DataFrame of movie-movie similarities with
            movieId on both axes.
        movies: DataFrame with at least ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples in descending score order. The list
        is empty when the title is unknown or the movie is absent from the
        similarity matrix.
    """
    # Resolve the title to a movieId; the first match wins for duplicate titles.
    matches = movies.loc[movies['title'] == movie_title, 'movieId']
    if matches.empty:
        return []
    movie_id = matches.iloc[0]

    # Movies missing from the similarity matrix cannot be scored
    if movie_id not in movie_similarity_df.index:
        return []

    # Remove the query movie by label rather than assuming it sorts first:
    # another movie with an identical rating vector ties with it at 1.0.
    similar_scores = (
        movie_similarity_df.loc[movie_id]
        .drop(labels=movie_id)
        .sort_values(ascending=False, kind='stable')
        .head(n)
    )

    # Build the id -> title map once instead of scanning `movies` per result
    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    return [(title_by_id.loc[mid], score) for mid, score in similar_scores.items()]
# Function for Gradio interface
def recommend_movies_cf(movie_title):
    """Format item-based collaborative-filtering recommendations as display text.

    Args:
        movie_title: Title chosen in the dropdown; may be empty or ``None``.

    Returns:
        A help message when nothing is selected, the title is unknown, or no
        recommendations could be produced; otherwise a header line followed by
        one ``score title`` line per recommendation.
    """
    if not movie_title:
        return "No movie selected. Please select one from the dropdown."
    if movie_title not in movies['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."

    recommendations = get_cf_recommendations(movie_title)
    # An empty list means the movie is absent from the similarity matrix;
    # say so instead of printing a bare header.
    if not recommendations:
        return f"No recommendations available for '{movie_title}'."

    rows = [f"{score:>5.2f} {title:<20}" for title, score in recommendations]
    return "Score Title\n" + "\n".join(rows)
######################################
#
# Collaborative Filtering with Neural Network (Item-based)
#
######################################
# Scale the ratings column-wise (per user) into [0, 1] so they match the
# sigmoid output range of the autoencoder.
scaler = MinMaxScaler()
movie_user_matrix_scaled = scaler.fit_transform(movie_user_matrix)

# Define the autoencoder model: input -> 64 -> 32 -> 64 -> input
input_dim = movie_user_matrix.shape[1]
encoding_dim = 32
input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder to reconstruct its own input
autoencoder.fit(movie_user_matrix_scaled, movie_user_matrix_scaled,
                epochs=50, batch_size=64, shuffle=True, validation_split=0.2,
                verbose=0)

# Use the trained autoencoder to predict the complete matrix, then map the
# reconstruction back to the original rating scale.
predicted_matrix_scaled = autoencoder.predict(movie_user_matrix_scaled)
predicted_matrix = scaler.inverse_transform(predicted_matrix_scaled)

# Create a DataFrame with the predicted matrix
predicted_matrix_df = pd.DataFrame(predicted_matrix, index=movie_user_matrix.index, columns=movie_user_matrix.columns)

# Compute the cosine similarity between movies using the predicted matrix
movie_similarity_cfnn = cosine_similarity(predicted_matrix)

# Create a DataFrame with movie similarities.
# This must wrap the autoencoder-based `movie_similarity_cfnn`; wrapping the
# plain `movie_similarity` matrix makes this recommender return exactly the
# same results as the non-neural collaborative filter.
movie_similarity_cfnn_df = pd.DataFrame(
    movie_similarity_cfnn,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)
# Function to get movie recommendations from the autoencoder-based similarities
def get_cfnn_recommendations(movie_title, movie_similarity_df=movie_similarity_cfnn_df, movies=movies, n=result_count):
    """Return the *n* movies most similar to *movie_title* per the given similarity matrix.

    Args:
        movie_title: Title of the query movie.
        movie_similarity_df: Square DataFrame of movie-movie similarities with
            movieId on both axes (defaults to the autoencoder-based matrix).
        movies: DataFrame with at least ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples in descending score order. The list
        is empty when the title is unknown or the movie is absent from the
        similarity matrix.
    """
    # Resolve the title to a movieId; the first match wins for duplicate titles.
    matches = movies.loc[movies['title'] == movie_title, 'movieId']
    if matches.empty:
        return []
    movie_id = matches.iloc[0]

    # Movies missing from the similarity matrix cannot be scored
    if movie_id not in movie_similarity_df.index:
        return []

    # Remove the query movie by label rather than assuming it sorts first:
    # ties at the top score would otherwise let it leak into the results.
    similar_scores = (
        movie_similarity_df.loc[movie_id]
        .drop(labels=movie_id)
        .sort_values(ascending=False, kind='stable')
        .head(n)
    )

    # Build the id -> title map once instead of scanning `movies` per result
    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    return [(title_by_id.loc[mid], score) for mid, score in similar_scores.items()]
# Function for Gradio interface
def recommend_movies_cfnn(movie_title):
    """Format neural-network collaborative-filtering recommendations as display text.

    Args:
        movie_title: Title chosen in the dropdown; may be empty or ``None``.

    Returns:
        A help message when nothing is selected, the title is unknown, or no
        recommendations could be produced; otherwise a header line followed by
        one ``score title`` line per recommendation.
    """
    if not movie_title:
        return "No movie selected. Please select one from the dropdown."
    if movie_title not in movies['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."

    recommendations = get_cfnn_recommendations(movie_title)
    # An empty list means the movie is absent from the similarity matrix;
    # say so instead of printing a bare header.
    if not recommendations:
        return f"No recommendations available for '{movie_title}'."

    rows = [f"{score:>5.2f} {title:<20}" for title, score in recommendations]
    return "Score Title\n" + "\n".join(rows)
######################################
#
# Gradio interface
#
######################################
# Offer a random subset of titles in the dropdowns. The sample size is capped
# at the number of available titles because random.sample raises ValueError
# when asked for more items than the population holds.
all_titles = movies['title'].tolist()
movie_list = random.sample(all_titles, min(input_count, len(all_titles)))
total_movies = len(movies)

# Same label on every tab; it reports how many titles were actually sampled.
dropdown_label = f"Select a Movie (Total movies: {total_movies}, randomly list {len(movie_list)} for demo purpose.)"

with gr.Blocks() as iface:
    with gr.Tab("Content-Based Filtering"):
        gr.Markdown("""## Movie Recommender - Content-Based Filtering
How it works:
* Use the 'genres' feature of movies, and convert genres into numerical vectors.
* For a given movie, find the most similar movies based on the genre similarity.
* This approach uses genres of movies only, without considering user preferences or viewing history.
* Simple to implement and computationally efficient.
""")
        # Each tab gets its own Dropdown/Textbox component instances.
        gr.Interface(fn=recommend_movies_cb,
                     inputs=gr.Dropdown(movie_list, label=dropdown_label),
                     outputs=[gr.Textbox(label="Recommended Movies:")],
                     description="Select a movie to get recommendations based on content filtering.")
    with gr.Tab("Collaborative Filtering"):
        gr.Markdown("""## Movie Recommender - Item-Based Collaborative Filtering
How it works:
* Create a movie-user matrix where rows represent movies and columns represent users, each cell contains the rating a user gave to a movie, or 0 if no rating exists.
* Calculate the cosine similarity between movies based on their rating patterns, results in a movie-movie similarity matrix.
* For a given movie, find the most similar movies based on this similarity matrix, and recommend these movies.
* Simple to implement and computationally efficient, but doesn't handle sparsity well (when many missing ratings).
""")
        gr.Interface(fn=recommend_movies_cf,
                     inputs=gr.Dropdown(movie_list, label=dropdown_label),
                     outputs=[gr.Textbox(label="Recommended Movies:")],
                     description="Select a movie to get recommendations based on collaborative filtering.")
    with gr.Tab("Collaborative Filtering with Neural Network"):
        gr.Markdown("""## Movie Recommender - Item-Based Collaborative Filtering with Neural Network
How it works:
* Use a Neural Network to predict the missing values in the movie-user matrix to improve the collaborative filtering recommendations.
* The NN model learns to reconstruct the movie-user matrix, effectively predicting missing ratings. This results in a dense, predicted movie-user matrix.
* Calculate movie-movie similarities using the predicted matrix. And use this similarity matrix to find and recommend similar movies.
* This approach often provides more accurate recommendations especially with large sparse datasets. But more complex to implement and require more computational resources.
""")
        gr.Interface(fn=recommend_movies_cfnn,
                     inputs=gr.Dropdown(movie_list, label=dropdown_label),
                     outputs=[gr.Textbox(label="Recommended Movies:")],
                     description="Select a movie to get recommendations based on collaborative filtering.")

# Launch the app
iface.launch()