Spaces:

sohvren
/

MovieRecommenderV2

Sleeping

File size: 9,217 Bytes

06d9388

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.2.1+cu121\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.insert(0, \"./interactive_tutorials\")\n",
    "\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import itertools\n",
    "import requests\n",
    "import sys\n",
    "\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "from torch.nn import Linear\n",
    "import torch_geometric.transforms as T\n",
    "from torch_geometric.nn import SAGEConv, to_hetero\n",
    "from torch_geometric.transforms import RandomLinkSplit, ToUndirected\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from torch_geometric.data import HeteroData\n",
    "import yaml\n",
    "\n",
    "# Record the torch build next to the outputs saved in this notebook.\n",
    "print(torch.__version__)\n",
    "\n",
    "# Use the GPU when torch can see one; otherwise everything stays on the CPU.\n",
    "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{('user',\n",
       "  'rates',\n",
       "  'movie'): tensor([[   0,    0,    0,  ...,  670,  670,  670],\n",
       "         [   0,    1,    2,  ..., 1327, 1329, 2941]], device='cuda:0'),\n",
       " ('movie',\n",
       "  'rev_rates',\n",
       "  'user'): tensor([[   0,    1,    2,  ..., 1327, 1329, 2941],\n",
       "         [   0,    0,    0,  ...,  670,  670,  670]], device='cuda:0')}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE: torch.load unpickles arbitrary Python objects - only load files you trust.\n",
    "# map_location moves the stored tensors onto `device`, so a graph that was saved\n",
    "# from a GPU session can still be opened on a CPU-only machine.\n",
    "data = torch.load(\"./PyGdata.pt\", map_location=device)\n",
    "data.edge_index_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{('user',\n",
       "  'rates',\n",
       "  'movie'): tensor([[   0,    0,    0,  ...,  670,  670,  670],\n",
       "         [   0,    1,    2,  ..., 1327, 1329, 2941]], device='cuda:0'),\n",
       " ('movie',\n",
       "  'rev_rates',\n",
       "  'user'): tensor([[   0,    1,    2,  ..., 1327, 1329, 2941],\n",
       "         [   0,    0,    0,  ...,  670,  670,  670]], device='cuda:0')}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GNNEncoder(torch.nn.Module):\n",
    "    \"\"\"Two stacked SAGEConv layers that map node features to embeddings.\n",
    "\n",
    "    Both layers are declared with ``(-1, -1)`` input sizes. The class is written\n",
    "    for a single edge type; `Model` wraps it with ``to_hetero`` to obtain the\n",
    "    per-edge-type version.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, hidden_channels, out_channels):\n",
    "        super().__init__()\n",
    "        self.conv1 = SAGEConv((-1, -1), hidden_channels)\n",
    "        self.conv2 = SAGEConv((-1, -1), out_channels)\n",
    "\n",
    "    def forward(self, x, edge_index):\n",
    "        # ReLU is applied between the two layers only, not to the final output.\n",
    "        hidden = self.conv1(x, edge_index).relu()\n",
    "        return self.conv2(hidden, edge_index)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class EdgeDecoder(torch.nn.Module):\n",
    "    \"\"\"Two-layer MLP that scores (user, movie) pairs from their embeddings.\"\"\"\n",
    "\n",
    "    def __init__(self, hidden_channels):\n",
    "        super().__init__()\n",
    "        # The input is a user embedding and a movie embedding side by side.\n",
    "        self.lin1 = Linear(2 * hidden_channels, hidden_channels)\n",
    "        self.lin2 = Linear(hidden_channels, 1)\n",
    "\n",
    "    def forward(self, z_dict, edge_label_index):\n",
    "        user_idx, movie_idx = edge_label_index\n",
    "        pair_features = torch.cat(\n",
    "            (z_dict['user'][user_idx], z_dict['movie'][movie_idx]), dim=-1\n",
    "        )\n",
    "        hidden = torch.relu(self.lin1(pair_features))\n",
    "        # Drop the trailing singleton dimension: one score per requested pair.\n",
    "        return self.lin2(hidden).view(-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model(torch.nn.Module):\n",
    "    \"\"\"Scores (user, movie) pairs: heterogeneous GNN encoder plus edge decoder.\n",
    "\n",
    "    Args:\n",
    "        hidden_channels: Width of the node embeddings and of the decoder's\n",
    "            hidden layer.\n",
    "        metadata: Optional graph metadata handed to ``to_hetero``. When omitted,\n",
    "            ``data.metadata()`` of the notebook-level graph is used, which is\n",
    "            what this class always did before the argument existed.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, hidden_channels, metadata=None):\n",
    "        super().__init__()\n",
    "        if metadata is None:\n",
    "            # Backward-compatible default: fall back to the global graph.\n",
    "            metadata = data.metadata()\n",
    "        homogeneous_encoder = GNNEncoder(hidden_channels, hidden_channels)\n",
    "        self.encoder = to_hetero(homogeneous_encoder, metadata, aggr='sum')\n",
    "        self.decoder = EdgeDecoder(hidden_channels)\n",
    "\n",
    "    def forward(self, x_dict, edge_index_dict, edge_label_index):\n",
    "        # z_dict holds the encoder output; the decoder reads its 'user' and\n",
    "        # 'movie' entries.\n",
    "        z_dict = self.encoder(x_dict, edge_index_dict)\n",
    "        return self.decoder(z_dict, edge_label_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HeteroData(\n",
      "  user={ x=[671, 671] },\n",
      "  movie={ x=[9025, 404] },\n",
      "  (user, rates, movie)={\n",
      "    edge_index=[2, 99810],\n",
      "    edge_label=[99810],\n",
      "  },\n",
      "  (movie, rev_rates, user)={ edge_index=[2, 99810] }\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): hidden_channels presumably has to match the value used when the\n",
    "# checkpoint was trained - confirm against the training notebook.\n",
    "model = Model(hidden_channels=32).to(device)\n",
    "# map_location lets a checkpoint written from a GPU session load on `device`,\n",
    "# including CPU-only machines. torch.load unpickles: only load trusted files.\n",
    "model.load_state_dict(torch.load(\"PyGTrainedModelState.pt\", map_location=device))\n",
    "# The notebook only runs inference from here on.\n",
    "model.eval()\n",
    "\n",
    "total_users = data['user'].num_nodes\n",
    "total_movies = data['movie'].num_nodes\n",
    "print(data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 671/671 [00:05<00:00, 121.64it/s]\n"
     ]
    }
   ],
   "source": [
    "movie_recs = []\n",
    "# Build the index tensors on the same device as the graph features.\n",
    "node_device = data['user'].x.device\n",
    "all_movie_ids = torch.arange(total_movies, device=node_device)\n",
    "\n",
    "# Inference only: skip autograd bookkeeping.\n",
    "with torch.no_grad():\n",
    "    # The node embeddings do not depend on which user is being scored, so the\n",
    "    # encoder runs once here instead of once per user.\n",
    "    z_dict = model.encoder(data.x_dict, data.edge_index_dict)\n",
    "\n",
    "    for user_id in tqdm(range(total_users)):\n",
    "        # Pair this user with every movie: row 0 = user index, row 1 = movie index.\n",
    "        user_row = torch.full_like(all_movie_ids, user_id)\n",
    "        edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)\n",
    "        pred = model.decoder(z_dict, edge_label_index).clamp(min=0, max=5)\n",
    "        # Keep only movies whose clamped prediction is exactly 5, i.e. the raw\n",
    "        # score was >= 5.\n",
    "        rec_movie_ids = (pred == 5).nonzero(as_tuple=True)[0]\n",
    "        # These are the first ten matches in movie-index order, not a ranking by score.\n",
    "        top_ten_recs = rec_movie_ids[:10].tolist()\n",
    "        movie_recs.append({'user': user_id, 'rec_movies': top_ten_recs})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Movie not found\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\aj\\AppData\\Local\\Temp\\ipykernel_24552\\778055959.py:2: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(metadata_path)\n"
     ]
    }
   ],
   "source": [
    "metadata_path = './sampled_movie_dataset/movies_metadata.csv'\n",
    "# low_memory=False parses each column in a single pass, which avoids the\n",
    "# mixed-type DtypeWarning pandas raised for this file.\n",
    "df = pd.read_csv(metadata_path, low_memory=False)\n",
    "\n",
    "def get_movie_title(movie_id):\n",
    "    \"\"\"Return the title of the first row whose `id` matches `movie_id`.\n",
    "\n",
    "    A row matches when its `id` equals `movie_id` directly or when both have the\n",
    "    same string form, so an integer argument still finds ids that pandas parsed\n",
    "    as text (and the other way round).\n",
    "\n",
    "    NOTE(review): the scoring loop yields graph node indices; confirm they are\n",
    "    the same values as this `id` column before using them for lookups.\n",
    "\n",
    "    Returns the string \"Movie not found\" when no row matches.\n",
    "    \"\"\"\n",
    "    id_column = df['id']\n",
    "    is_match = (id_column == movie_id) | (id_column.astype(str) == str(movie_id))\n",
    "    titles = df.loc[is_match, 'title']\n",
    "\n",
    "    if titles.empty:\n",
    "        return \"Movie not found\"\n",
    "    return titles.iloc[0]  # first matching row wins\n",
    "\n",
    "print(get_movie_title(14))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   user                                         rec_movies\n",
      "0     0   [14, 85, 101, 106, 111, 131, 132, 150, 210, 216]\n",
      "1     1    [13, 45, 95, 108, 109, 126, 130, 132, 213, 220]\n",
      "2     2  [562, 571, 894, 1013, 1169, 1289, 1378, 1405, ...\n",
      "3     3  [126, 137, 502, 571, 616, 696, 811, 966, 999, ...\n",
      "4     4  [364, 436, 493, 502, 509, 706, 781, 811, 1244,...\n",
      "Index(['user', 'rec_movies'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# One row per user; `rec_movies` is the list collected by the scoring loop.\n",
    "# Titles are not joined in here: this frame has no `id` column to join on.\n",
    "movie_recs_df = pd.DataFrame(movie_recs)\n",
    "print(movie_recs_df.head(), movie_recs_df.columns, sep='\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}