chrisaldikaraharja committed on
Commit
01850de
·
verified ·
1 Parent(s): e4a7fa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -31
app.py CHANGED
@@ -5,31 +5,28 @@ from surprise.model_selection import train_test_split
5
  from collections import defaultdict
6
  import kagglehub
7
 
# Step 1: Download the latest version of the dataset. The call returns the
# local directory that holds the downloaded files.
path = kagglehub.dataset_download("ashpalsingh1525/imdb-movies-dataset")

# Step 2: Use the directory reported by kagglehub instead of a hard-coded
# cache location; a fixed "/home/user/.cache/.../versions/1" path only exists
# for one home directory and one dataset version.
dataset_folder = path

# Step 3: Define the CSV file path (update if the filename is different).
dataset_path = f"{dataset_folder}/imdb_movies.csv"

# Load the dataset.
df = pd.read_csv(dataset_path)

# Ensure all categorical columns are strings.
categorical_columns = ['genre', 'orig_title', 'orig_lang', 'country', 'crew']
for col in categorical_columns:
    df[col] = df[col].astype(str)  # Convert to string explicitly

# If every genre value is a digit-only string, replace it with a readable
# "Genre_<value>" label so the UI does not show bare numbers.
if df['genre'].str.isnumeric().all():
    print("Warning: Genre column is numeric. Mapping needed.")
    genre_mapping = {i: f"Genre_{i}" for i in df['genre'].unique()}
    df['genre'] = df['genre'].map(genre_mapping)

# Prepare dataset for Surprise; the rating scale spans the observed scores.
# NOTE(review): load_from_df reads its columns as (user, item, rating), so
# orig_title is used as the user id and orig_lang as the item id here --
# confirm this is intended.
reader = Reader(rating_scale=(df['score'].min(), df['score'].max()))
data = Dataset.load_from_df(df[['orig_title', 'orig_lang', 'score']], reader)
35
 
@@ -38,45 +35,40 @@ trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
# Matrix-factorisation model: 50 latent factors, seeded via random_state,
# fitted on the training split.
svd_params = {"n_factors": 50, "random_state": 42}
model = SVD(**svd_params)
model.fit(trainset)
40
 
# Function to get movie recommendations
def get_recommendations(selected_movies, genre, top_n=5):
    """Return up to ``top_n`` titles of ``genre`` ranked by predicted score.

    Args:
        selected_movies: Titles the user already picked; they are left out of
            the result.
        genre: Value compared for equality against the ``genre`` column of
            the module-level ``df``.
        top_n: Maximum number of titles to return. Defaults to 5.

    Returns:
        A list of titles, highest predicted score first. When
        ``selected_movies`` is empty, a one-element list holding a prompt
        message is returned instead.
    """
    if not selected_movies:
        return ["Please select at least one movie."]

    # A set makes the exclusion test O(1) per candidate title.
    already_selected = set(selected_movies)
    candidates = [
        title
        for title in df.loc[df['genre'] == genre, 'orig_title'].unique()
        if title not in already_selected
    ]

    # NOTE(review): the model is fitted on (orig_title, orig_lang, score), so
    # uid='user' and a title passed as iid are presumably both unknown to it;
    # the estimates may all collapse to the same value -- confirm.
    predicted = {
        title: model.predict(uid='user', iid=title).est for title in candidates
    }

    # sorted() is stable, so titles with equal estimates keep dataset order.
    ranked = sorted(predicted, key=predicted.get, reverse=True)
    return ranked[:top_n]
 
 
 
64
 
# Streamlit UI
st.title("🎬 Movie Recommendation System")

# Genre selection: offer every distinct genre in alphabetical order.
all_genres = df['genre'].unique().tolist()
selected_genre = st.selectbox("Select a Genre", sorted(all_genres))

# Titles available for the chosen genre.
genre_mask = df['genre'] == selected_genre
movies_in_genre = df.loc[genre_mask, 'orig_title'].unique().tolist()

# Movie selection, capped at three picks.
selected_movies = st.multiselect(
    "Select Up to 3 Movies",
    movies_in_genre,
    max_selections=3,
)

# Recommendation button: list each recommended title as a bullet.
if st.button("Get Recommendations"):
    recommendations = get_recommendations(selected_movies, selected_genre)
    st.subheader("Recommended Movies:")
    for movie in recommendations:
        st.write(f"- {movie}")
 
5
  from collections import defaultdict
6
  import kagglehub
7
 
# Step 1: Download the latest version of the dataset.
# kagglehub exposes dataset downloads as `dataset_download`; there is no
# top-level `kagglehub.download`, so that call fails with AttributeError.
dataset_ref = "ashpalsingh1525/imdb-movies-dataset"
path = kagglehub.dataset_download(dataset_ref)

# Define the dataset path (update filename if needed).
dataset_path = f"{path}/imdb_movies.csv"

# Step 2: Load the dataset.
df = pd.read_csv(dataset_path)

# Ensure categorical columns are strings.
categorical_columns = ['genre', 'orig_title', 'orig_lang', 'country', 'crew']
for col in categorical_columns:
    df[col] = df[col].astype(str)

# Check if genre column needs mapping: when every value is a digit-only
# string, replace it with a readable "Genre_<value>" label.
if df['genre'].str.isnumeric().all():
    print("Warning: Genre column is numeric. Mapping needed.")
    genre_mapping = {i: f"Genre_{i}" for i in df['genre'].unique()}
    df['genre'] = df['genre'].map(genre_mapping)

# Step 3: Prepare dataset for Surprise library; the rating scale spans the
# observed scores.
# NOTE(review): load_from_df reads its columns as (user, item, rating), so
# orig_title is used as the user id and orig_lang as the item id here --
# confirm this is intended.
reader = Reader(rating_scale=(df['score'].min(), df['score'].max()))
data = Dataset.load_from_df(df[['orig_title', 'orig_lang', 'score']], reader)
32
 
 
# Matrix-factorisation model: 50 latent factors, seeded via random_state,
# fitted on the training split.
svd_params = {"n_factors": 50, "random_state": 42}
model = SVD(**svd_params)
model.fit(trainset)
37
 
# Step 4: Define functions for recommendation
def get_recommendations(selected_movies, genre, top_n=5):
    """Return up to ``top_n`` titles of ``genre`` ranked by predicted score.

    Args:
        selected_movies: Titles the user already picked; they are left out of
            the result.
        genre: Value compared for equality against the ``genre`` column of
            the module-level ``df``.
        top_n: Maximum number of titles to return. Defaults to 5.

    Returns:
        A list of titles, highest predicted score first. When
        ``selected_movies`` is empty, a one-element list holding a prompt
        message is returned instead.
    """
    if not selected_movies:
        return ["Please select at least one movie."]

    # A set makes the exclusion test O(1) per candidate title.
    already_selected = set(selected_movies)
    candidates = [
        title
        for title in df.loc[df['genre'] == genre, 'orig_title'].unique()
        if title not in already_selected
    ]

    # NOTE(review): the model is fitted on (orig_title, orig_lang, score), so
    # uid='user' and a title passed as iid are presumably both unknown to it;
    # the estimates may all collapse to the same value -- confirm.
    predicted = {
        title: model.predict(uid='user', iid=title).est for title in candidates
    }

    # sorted() is stable, so titles with equal estimates keep dataset order.
    ranked = sorted(predicted, key=predicted.get, reverse=True)
    return ranked[:top_n]  # Top 5 recommendations by default
54
+
def get_movies_by_genre(genre):
    """Return the distinct ``orig_title`` values of rows whose genre equals ``genre``."""
    in_genre = df['genre'] == genre
    titles = df.loc[in_genre, 'orig_title']
    return titles.unique().tolist()
57
 
# Step 5: Streamlit UI
st.title("🎬 Movie Recommendation System")

# Genre selection
genre_list = sorted(df['genre'].unique().tolist())
selected_genre = st.selectbox("Select a Genre", genre_list)

# Movie selection (dynamically updates based on genre)
movie_options = get_movies_by_genre(selected_genre)
# max_selections enforces the limit the label promises; without it the
# widget accepts any number of titles.
selected_movies = st.multiselect(
    "Select Up to 3 Movies",
    movie_options,
    max_selections=3,
)

# Recommendation button
if st.button("Get Recommendations"):
    recommendations = get_recommendations(selected_movies, selected_genre)
    st.write("### Recommended Movies:")
    for movie in recommendations:
        st.write(f"- {movie}")