File size: 11,799 Bytes
725c528
eceea93
 
68a2379
d3012ab
68a2379
65c83a0
 
 
 
725c528
30059ee
88a2d43
 
e28d195
35aecae
725c528
02f4def
 
30059ee
 
eceea93
 
725c528
bd9a4ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3012ab
 
725c528
d3012ab
 
725c528
d3012ab
 
725c528
d3012ab
 
 
 
 
 
 
eceea93
dd07bd3
d3012ab
 
 
 
 
eceea93
d3012ab
 
eceea93
d3012ab
eceea93
d3012ab
 
 
 
eceea93
d3012ab
eceea93
d3012ab
 
6d5ff05
c517e46
 
d3012ab
 
eceea93
d3012ab
993d08d
c2e19f2
725c528
6d5ff05
48de9e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d5ff05
c517e46
 
48de9e2
 
 
 
 
 
 
 
 
 
 
 
 
 
d3012ab
 
 
 
7dbbd88
 
e67968d
274400d
e67968d
 
 
274400d
e67968d
bd9a4ce
 
 
e67968d
bd9a4ce
7dbbd88
 
e67968d
274400d
87dfed9
 
 
e67968d
87dfed9
eceea93
d3012ab
eceea93
e67968d
d3012ab
eceea93
48de9e2
e67968d
274400d
e67968d
 
 
 
 
48de9e2
 
 
e67968d
48de9e2
 
725c528
eceea93
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import gradio as gr
import zipfile
import random

# Number of titles sampled into the UI dropdowns.
input_count = 300
# Bound the recommenders use when cutting their ranked lists.
result_count = 21

# Load the two MovieLens tables straight out of the zip archive.
with zipfile.ZipFile('ml-latest-small.zip') as archive:
    with archive.open('ml-latest-small/movies.csv') as movies_csv:
        movies = pd.read_csv(movies_csv)
    with archive.open('ml-latest-small/ratings.csv') as ratings_csv:
        ratings = pd.read_csv(ratings_csv)

######################################
#
# Content-based Filtering
#
######################################

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string so the vectorizer only ever sees text
movies['genres'] = movies['genres'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute the cosine similarity matrix. TfidfVectorizer L2-normalises each
# row by default, so the plain dot product (linear_kernel) equals the cosine.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from movie title to row label in `movies`.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removes a repeated title. De-duplicate on the
# index instead, keeping the first row for each title, so that looking up a
# title always yields a single scalar.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Function that takes in movie title as input and outputs most similar movies
def get_cb_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies whose genre profile is closest to ``title``.

    Args:
        title: Movie title; must be a label of the module-level ``indices``
            map.
        cosine_sim: Square similarity matrix addressed by row position in
            ``movies``.

    Returns:
        A list of ``(title, score)`` tuples ordered from most to least
        similar, holding at most ``result_count - 1`` entries. The query
        movie itself is never part of the result.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first row so the matrix access below stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Leave the query movie out by position. Movies with identical genres
    # all tie at the top score, so the query is not guaranteed to sort first
    # and slicing off element 0 could drop the wrong movie.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    top_scores = sim_scores[:max(result_count - 1, 0)]

    return [(movies['title'].iloc[position], score) for position, score in top_scores]

# Gradio interface
def recommend_movies_cb(movie):
    """Gradio callback for the content-based tab.

    Args:
        movie: Title chosen in the dropdown; may be empty or ``None``.

    Returns:
        A text table with one "score title" line per recommendation, or a
        short message when no movie was selected or the title is unknown.
    """
    if not movie:
        return "No movie selected. Please select one from the dropdown."

    # Same guard as the collaborative-filtering callbacks: report an unknown
    # title instead of letting the title lookup raise KeyError.
    if movie not in movies['title'].values:
        return f"Movie '{movie}' not found in the dataset."

    recommendations = get_cb_recommendations(movie)
    format_string = "{:>5.2f}       {:<20}"
    rows = [format_string.format(score, title) for title, score in recommendations]
    return "Score     Title\n" + "\n".join(rows)


######################################
#
# Collaborative Filtering (Item-based)
#
######################################
# Reshape the ratings into a movie-by-user table; cells without a rating
# are filled with 0.
movie_user_matrix = ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating',
).fillna(0)

# Pairwise cosine similarity between the movie rows
movie_similarity = cosine_similarity(movie_user_matrix)

# Label both axes of the similarity matrix with the movieId values
movie_ids = movie_user_matrix.index
movie_similarity_df = pd.DataFrame(movie_similarity, index=movie_ids, columns=movie_ids)

# Function to get movie recommendations using item-based collaborative filtering
def get_cf_recommendations(movie_title, movie_similarity_df=movie_similarity_df, movies=movies, n=result_count):
    """Return the movies most similar to ``movie_title`` by rating pattern.

    Args:
        movie_title: Title to look up in ``movies['title']``.
        movie_similarity_df: Square DataFrame of movie-to-movie similarities
            labelled by movieId on both axes.
        movies: DataFrame with at least ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations to return.

    Returns:
        A list of up to ``n`` ``(title, score)`` tuples ordered from most to
        least similar. The list is empty when the title is unknown or the
        movie has no row in ``movie_similarity_df``.
    """
    # Resolve the title to a movieId; the first match wins for repeated titles.
    matches = movies.loc[movies['title'] == movie_title, 'movieId']
    if matches.empty:
        return []
    movie_id = matches.iloc[0]

    # Movies missing from the similarity matrix cannot be compared.
    if movie_id not in movie_similarity_df.index:
        return []

    # Remove the query movie by label before ranking. Other movies can tie
    # with (or, through rounding, exceed) its self-similarity, so it is not
    # guaranteed to be the first entry after sorting.
    similar_scores = movie_similarity_df.loc[movie_id].drop(labels=movie_id)
    top_scores = similar_scores.sort_values(ascending=False).head(max(n, 0))

    # Build the movieId -> title lookup once instead of rescanning `movies`
    # for every recommended movie.
    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    return [(title_by_id[other_id], score) for other_id, score in top_scores.items()]

# Function for Gradio interface
def recommend_movies_cf(movie_title):
    """Gradio callback for the item-based collaborative-filtering tab.

    Returns a text table of scores and titles for ``movie_title``, or a
    short message when nothing was selected or the title is not in
    ``movies``.
    """
    if not movie_title:
        return "No movie selected. Please select one from the dropdown."

    known_titles = movies['title'].values
    if movie_title not in known_titles:
        return f"Movie '{movie_title}' not found in the dataset."

    # One formatted line per (title, score) pair, under a fixed header.
    row_template = "{:>5.2f}       {:<20}"
    rows = []
    for title, score in get_cf_recommendations(movie_title):
        rows.append(row_template.format(score, title))
    return "Score     Title\n" + "\n".join(rows)


######################################
#
# Collaborative Filtering with Neural Network (Item-based)
#
######################################

# Normalize the ratings. MinMaxScaler rescales each column of the movie-user
# matrix independently (one column per userId).
scaler = MinMaxScaler()
movie_user_matrix_scaled = scaler.fit_transform(movie_user_matrix)

# Define the autoencoder model: input -> 64 -> encoding_dim -> 64 -> input
input_dim = movie_user_matrix.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
# Sigmoid keeps the reconstruction in [0, 1], the range of the scaled input
decoded = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder to reproduce its own input
autoencoder.fit(movie_user_matrix_scaled, movie_user_matrix_scaled, 
                epochs=50, batch_size=64, shuffle=True, validation_split=0.2, 
                verbose=0)

# Use the trained autoencoder to predict the complete matrix, then undo the
# scaling so the values are back on the original rating scale
predicted_matrix_scaled = autoencoder.predict(movie_user_matrix_scaled)
predicted_matrix = scaler.inverse_transform(predicted_matrix_scaled)

# Create a DataFrame with the predicted matrix
predicted_matrix_df = pd.DataFrame(predicted_matrix, index=movie_user_matrix.index, columns=movie_user_matrix.columns)

# Compute the cosine similarity between movies using the predicted matrix
movie_similarity_cfnn = cosine_similarity(predicted_matrix)

# Create a DataFrame with movie similarities. This must wrap the
# autoencoder-based matrix: wrapping `movie_similarity` here made the neural
# network recommender return exactly the plain collaborative-filtering results.
movie_similarity_cfnn_df = pd.DataFrame(movie_similarity_cfnn, index=movie_user_matrix.index, columns=movie_user_matrix.index)

# Function to get movie recommendations using item-based collaborative filtering
def get_cfnn_recommendations(movie_title, movie_similarity_df=movie_similarity_cfnn_df, movies=movies, n=result_count):
    """Return the movies most similar to ``movie_title`` in the given matrix.

    By default the similarities come from ``movie_similarity_cfnn_df``.

    Args:
        movie_title: Title to look up in ``movies['title']``.
        movie_similarity_df: Square DataFrame of movie-to-movie similarities
            labelled by movieId on both axes.
        movies: DataFrame with at least ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations to return.

    Returns:
        A list of up to ``n`` ``(title, score)`` tuples ordered from most to
        least similar. The list is empty when the title is unknown or the
        movie has no row in ``movie_similarity_df``.
    """
    # Resolve the title to a movieId; the first match wins for repeated titles.
    matches = movies.loc[movies['title'] == movie_title, 'movieId']
    if matches.empty:
        return []
    movie_id = matches.iloc[0]

    # Movies missing from the similarity matrix cannot be compared.
    if movie_id not in movie_similarity_df.index:
        return []

    # Remove the query movie by label before ranking. Other movies can tie
    # with (or, through rounding, exceed) its self-similarity, so it is not
    # guaranteed to be the first entry after sorting.
    similar_scores = movie_similarity_df.loc[movie_id].drop(labels=movie_id)
    top_scores = similar_scores.sort_values(ascending=False).head(max(n, 0))

    # Build the movieId -> title lookup once instead of rescanning `movies`
    # for every recommended movie.
    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    return [(title_by_id[other_id], score) for other_id, score in top_scores.items()]

# Function for Gradio interface
def recommend_movies_cfnn(movie_title):
    """Gradio callback for the neural-network collaborative-filtering tab.

    Produces a text table of scores and titles for ``movie_title``; an empty
    selection or a title missing from ``movies`` yields a short message
    instead.
    """
    if not movie_title:
        return "No movie selected. Please select one from the dropdown."

    title_is_known = movie_title in movies['title'].values
    if not title_is_known:
        return f"Movie '{movie_title}' not found in the dataset."

    # Header line followed by one "score title" line per recommendation.
    header = "Score     Title\n"
    line_format = "{:>5.2f}       {:<20}"
    body = "\n".join(
        line_format.format(score, title)
        for title, score in get_cfnn_recommendations(movie_title)
    )
    return header + body


######################################
#
# Gradio interface
#
######################################

# Offer a random subset of the titles in the dropdowns
all_titles = movies['title'].tolist()
movie_list = random.sample(all_titles, input_count)
total_movies = len(movies)

# The same label text is shown above the dropdown on every tab
dropdown_label = f"Select a Movie (Total movies: {total_movies}, randomly list {input_count} for demo purpose.)"


def _movie_dropdown():
    """Create a new title dropdown; each tab gets its own instance."""
    return gr.Dropdown(movie_list, label=dropdown_label)


with gr.Blocks() as iface:
    # Tab 1: genre-based recommendations
    with gr.Tab("Content-Based Filtering"):
        gr.Markdown("""## Movie Recommender - Content-Based Filtering
        How it works:
        * Use the 'genres' feature of movies, and convert genres into numerical vectors.
        * For a given movie, find the most similar movies based on the genre similarity.
        * This approach uses genres of movies only, without considering user preferences or viewing history.
        * Simple to implement and computationally efficient.
        """)
        gr.Interface(
            fn=recommend_movies_cb,
            inputs=_movie_dropdown(),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            description="Select a movie to get recommendations based on content filtering.",
        )

    # Tab 2: similarity computed from the raw rating matrix
    with gr.Tab("Collaborative Filtering"):
        gr.Markdown("""## Movie Recommender - Item-Based Collaborative Filtering
        How it works:
        * Create a movie-user matrix where rows represent movies and columns represent users, each cell contains the rating a user gave to a movie, or 0 if no rating exists.
        * Calculate the cosine similarity between movies based on their rating patterns, results in a movie-movie similarity matrix.
        * For a given movie, find the most similar movies based on this similarity matrix, and recommend these movies.
        * Simple to implement and computationally efficient, but doesn't handle sparsity well (when many missing ratings).
        """)
        gr.Interface(
            fn=recommend_movies_cf,
            inputs=_movie_dropdown(),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            description="Select a movie to get recommendations based on collaborative filtering.",
        )

    # Tab 3: similarity computed from the autoencoder-reconstructed matrix
    with gr.Tab("Collaborative Filtering with Neural Network"):
        gr.Markdown("""## Movie Recommender - Item-Based Collaborative Filtering with Neural Network
        How it works:
        * Use a Neural Network to predict the missing values in the movie-user matrix to improve the collaborative filtering recommendations.
        * The NN model learns to reconstruct the movie-user matrix, effectively predicting missing ratings. This results in a dense, predicted movie-user matrix.
        * Calculate movie-movie similarities using the predicted matrix. And use this similarity matrix to find and recommend similar movies.
        * This approach often provides more accurate recommendations especially with large sparse datasets. But more complex to implement and require more computational resources.
        """)
        gr.Interface(
            fn=recommend_movies_cfnn,
            inputs=_movie_dropdown(),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            description="Select a movie to get recommendations based on collaborative filtering.",
        )

# Start the web app
iface.launch()