Spaces:

srchakaev
/

find_my_movie_hf

Sleeping

App Files Files Community

srchakaev committed on Sep 1, 2023

Commit

5bdc726

1 Parent(s): fe6fba6

1

Browse files

Files changed (9) hide show

.gitattributes +3 -0
README.md +5 -7
bert_movie.ipynb +178 -0
bert_movie_edited.ipynb +310 -0
clean_mail_movie.csv +3 -0
mail_embeddings.joblib +3 -0
mail_faiss_index.index +3 -0
main.py +61 -0
requirements.txt +76 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+clean_mail_movie.csv filter=lfs diff=lfs merge=lfs -text
+mail_embeddings.joblib filter=lfs diff=lfs merge=lfs -text
+mail_faiss_index.index filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,12 +1,10 @@
 ---
-title: Find My Movie Hf
-emoji: 🌍
 colorFrom: pink
-colorTo: red
 sdk: streamlit
 sdk_version: 1.26.0
-app_file: app.py
 pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Find My Movie
+emoji: 🪄
 colorFrom: pink
+colorTo: indigo
 sdk: streamlit
 sdk_version: 1.26.0
+app_file: main.py
 pinned: false
+---

bert_movie.ipynb ADDED Viewed

	@@ -0,0 +1,178 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer, AutoModel\n",
+    "import re\n",
+    "import string\n",
+    "import numpy as np\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "import streamlit as st\n",
+    "import faiss\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = '/clean_mail_movie.csv'\n",
+    "\n",
+    "df = pd.read_csv(url)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = df['concat2embedding'].tolist() # Это объединённый столбец\n",
+    "titles = df['movie_title'].tolist()\n",
+    "images = df['image_url'].tolist()\n",
+    "descr = df['description'].tolist()\n",
+    "links = df['page_url'].tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean(text):\n",
+    "    text = text.lower()  # Нижний регистр\n",
+    "    # text = re.sub(r'\\d+', ' ', text)  # Удаляем числа\n",
+    "    # text = text.translate(str.maketrans('', '', string.punctuation))  # Удаляем пунктуацию\n",
+    "    text = re.sub(r'\\s+', ' ', text)  # Удаляем лишние пробелы\n",
+    "    text = text.strip()  # Удаляем начальные и конечные пробелы\n",
+    "    # text = re.sub(r'\\b\\w{1,2}\\b', '', text)  # Удаляем слова длиной менее 3 символов\n",
+    "    # Дополнительные шаги, которые могут быть полезны в данном контексте:\n",
+    "    # text = re.sub(r'\\b\\w+\\b', '', text)  # Удаляем отдельные слова (без чисел и знаков препинания)\n",
+    "    # text = ' '.join([word for word in text.split() if word not in stop_words])  # Удаляем стоп-слова\n",
+    "    return text\n",
+    "\n",
+    "\n",
+    "cleaned_text = []\n",
+    "\n",
+    "for text in dataset:\n",
+    "    cleaned_text.append(clean(text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pip install transformers sentencepiece\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+    "model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+    "# model.cuda()  # uncomment it if you have a GPU"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Дефолтная функция, шла в комплекте с моделью\n",
+    "\n",
+    "def embed_bert_cls(text, model, tokenizer):\n",
+    "    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=1024) # Модель сама создаёт пэддинги и маску.\n",
+    "    with torch.no_grad():\n",
+    "        model_output = model(**{k: v.to(model.device) for k, v in t.items()})\n",
+    "    embeddings = model_output.last_hidden_state[:, 0, :]\n",
+    "    embeddings = torch.nn.functional.normalize(embeddings)\n",
+    "    return embeddings[0].cpu().numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Векторизация отзывов\n",
+    "text_embeddings = np.array([embed_bert_cls(text, model, tokenizer) for text in cleaned_text])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Создание FAISS индекса после определения text_embeddings\n",
+    "dimension = text_embeddings.shape[1]\n",
+    "index = faiss.IndexFlatL2(dimension)\n",
+    "index.add(text_embeddings.astype('float32'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['mail_embeddings.joblib']"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from joblib import dump, load\n",
+    "\n",
+    "# Сохранение эмбеддингов\n",
+    "dump(text_embeddings, 'mail_embeddings.joblib')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Сохранение индекса\n",
+    "faiss.write_index(index, \"mail_faiss_index.index\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pytorch_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

bert_movie_edited.ipynb ADDED Viewed

	@@ -0,0 +1,310 @@

+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "S52EVP7k-rl7"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd\n",
+        "import torch\n",
+        "import re\n",
+        "import string\n",
+        "import numpy as np\n",
+        "import streamlit as st\n",
+        "import faiss # хранение индексов\n",
+        "from tqdm import tqdm\n",
+        "from transformers import AutoTokenizer, AutoModel\n",
+        "from joblib import dump, load # Для сохранения/загрузки эмбэддингов"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "12BEEwcF-rl9"
+      },
+      "outputs": [],
+      "source": [
+        "path = '/content/movies_filtered.csv' # ИЗМЕНИ ТУТ ПУТЬ!\n",
+        "\n",
+        "df = pd.read_csv(path)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "df5lg8-m-rl-"
+      },
+      "outputs": [],
+      "source": [
+        "def clean(text):\n",
+        "    text = text.lower()  # Нижний регистр\n",
+        "    text = re.sub(r'\\d+', ' ', text)  # Удаляем числа\n",
+        "    # text = text.translate(str.maketrans('', '', string.punctuation))  # Удаляем пунктуацию\n",
+        "    text = re.sub(r'\\s+', ' ', text)  # Удаляем лишние пробелы\n",
+        "    text = text.strip()  # Удаляем начальные и конечные пробелы\n",
+        "    text = re.sub(r'\\s+|\\n', ' ', text) # Удаляет \\n и \\xa0\n",
+        "    # text = re.sub(r'\\b\\w{1,2}\\b', '', text)  # Удаляем слова длиной менее 3 символов\n",
+        "    # Дополнительные шаги, которые могут быть полезны в данном контексте:\n",
+        "    # text = re.sub(r'\\b\\w+\\b', '', text)  # Удаляем отдельные слова (без чисел и знаков препинания)\n",
+        "    # text = ' '.join([word for word in text.split() if word not in stop_words])  # Удаляем стоп-слова\n",
+        "    return text\n",
+        "\n",
+        "for i, row in df.iterrows():\n",
+        "    df.at[i, 'description'] = clean(row['description'])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 19,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0huKeMs4-rl_",
+        "outputId": "8659997c-9b8a-45bb-e2d7-fcc05422b92a"
+      },
+      "outputs": [],
+      "source": [
+        "# pip install transformers sentencepiece\n",
+        "\n",
+        "tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+        "model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+        "# model.cuda()  # uncomment it if you have a GPU"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 20,
+      "metadata": {
+        "id": "Xsxq-Ohx-rmA"
+      },
+      "outputs": [],
+      "source": [
+        "# применяем токенизатор:\n",
+        "# -≥ add_special_tokens = добавляем служебные токены (CLS=101, EOS=102)\n",
+        "# -≥ truncation = обрезаем по максимальной длине\n",
+        "# -≥ max_length = максимальная длина последовательности\n",
+        "tokenized = df['description'].apply((lambda x: tokenizer.encode(x,\n",
+        "                                                                      add_special_tokens=True,\n",
+        "                                                                      truncation=True,\n",
+        "                                                                      max_length=1024)))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 21,
+      "metadata": {
+        "id": "OuaXqHNj-rmB"
+      },
+      "outputs": [],
+      "source": [
+        "max_len = 1024\n",
+        "# Делаю пэддинг чтобы добить до max_len последовательности\n",
+        "padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])\n",
+        "# И маску чтобы не применять self-attention на pad\n",
+        "attention_mask = np.where(padded != 0, 1, 0)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 22,
+      "metadata": {
+        "id": "h3bfQh2o-rmC"
+      },
+      "outputs": [],
+      "source": [
+        "# Датасет для массивов\n",
+        "class BertInputs(torch.utils.data.Dataset):\n",
+        "    def __init__(self, tokenized_inputs, attention_masks):\n",
+        "        super().__init__()\n",
+        "        self.tokenized_inputs = tokenized_inputs\n",
+        "        self.attention_masks = attention_masks\n",
+        "\n",
+        "    def __len__(self):\n",
+        "        return self.tokenized_inputs.shape[0]\n",
+        "\n",
+        "    def __getitem__(self, idx):\n",
+        "        ids = self.tokenized_inputs[idx]\n",
+        "        ams = self.attention_masks[idx]\n",
+        "\n",
+        "        return ids, ams\n",
+        "\n",
+        "dataset = BertInputs(padded, attention_mask)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 23,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Q7yYgEP3-rmC",
+        "outputId": "76047d40-f793-4cef-fc02-b98b232661f8"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "torch.Size([100, 1024]) torch.Size([100, 1024])\n"
+          ]
+        }
+      ],
+      "source": [
+        "# DataLoader that feeds fixed-size batches to the model.\n",
+        "# shuffle stays off: the embedding loop collects vectors in iteration order,\n",
+        "# so shuffling would leave vector i no longer corresponding to df row i.\n",
+        "loader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=False)\n",
+        "sample_ids, sample_ams = next(iter(loader))\n",
+        "print(sample_ids.shape, sample_ams.shape)\n",
+        "\n",
+        "# shape BATCH_SIZE x MAX_LEN - what goes into BERT"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 25,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "r1h0BNy1-rmD",
+        "outputId": "adea19c9-a0f2-418c-9a21-ebe8daa00077"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "100%|██████████| 94/94 [01:13<00:00,  1.28it/s]"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "CPU times: user 1min 10s, sys: 145 ms, total: 1min 10s\n",
+            "Wall time: 1min 13s\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "%%time\n",
+        "\n",
+        "# One array of CLS vectors (batch_size x hidden_dim) per batch, in loader order\n",
+        "vectors_in_batch = []\n",
+        "\n",
+        "# Send inputs to whatever device the model is on instead of calling .cuda()\n",
+        "# unconditionally: model.cuda() is commented out in the model-loading cell,\n",
+        "# so a hard-coded .cuda() would put inputs and weights on different devices.\n",
+        "for inputs, attention_masks in tqdm(loader):\n",
+        "    with torch.no_grad():\n",
+        "        last_hidden_states = model(inputs.to(model.device),\n",
+        "                                   attention_mask=attention_masks.to(model.device))\n",
+        "        # Output item 0 is the last hidden state; position 0 is the CLS token\n",
+        "        vector = last_hidden_states[0][:,0,:].detach().cpu().numpy()\n",
+        "\n",
+        "    vectors_in_batch.append(vector)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import itertools\n",
+        "\n",
+        "# Open the file and load the nested list\n",
+        "vectors_in_batch = load('vectors_in_batch.joblib')\n",
+        "\n",
+        "# Convert the nested list to an unnested list\n",
+        "text_embeddings = list(itertools.chain.from_iterable(vectors_in_batch))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Сохранение эмбеддингов\n",
+        "dump(vectors_in_batch, 'vectors_in_batch.joblib')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 17,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "94"
+            ]
+          },
+          "execution_count": 17,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "len(vectors_in_batch)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "9366"
+            ]
+          },
+          "execution_count": 9,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "len(text_embeddings)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.4"
+    },
+    "orig_nbformat": 4
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

clean_mail_movie.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:057369f23a3dd85ab0cc93d9e24b3669067e1023346f40ae7d0d6dc846613d86
+size 46078303

mail_embeddings.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7275e4c9f962ec2e50e02f876716f0de3f75c2548d7615a59dfc14a883fe2f2e
+size 15097281

mail_faiss_index.index ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f1ae5c60728b9d5d7f610dc02c8978a5802b5456ab93e55cb28da8f4cb0bc56
+size 15097101

main.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import faiss
+import streamlit as st
+from transformers import AutoTokenizer, AutoModel
+import torch
+import joblib
+import pandas as pd
+# Загрузка сохраненных данных и индекса
+text_embeddings = joblib.load('mail_embeddings.joblib')
+index = faiss.read_index('mail_faiss_index.index')
+# Датасет
+df = pd.read_csv('clean_mail_movie.csv')
+titles = df['movie_title'].tolist()
+images = df['image_url'].tolist()
+descr = df['description'].tolist()
+links = df['page_url'].tolist()
+# Загрузка модели и токенизатора
+tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
+# Функция для векторизации текста
+def embed_bert_cls(text, model, tokenizer):
+    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=1024)
+    with torch.no_grad():
+        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
+    embeddings = model_output.last_hidden_state[:, 0, :]
+    embeddings = torch.nn.functional.normalize(embeddings)
+    return embeddings[0].cpu().numpy()
+# Streamlit интерфейс
+st.title("Умный поиск фильмов")
+user_input = st.text_area("Введите описание фильма:")
+num_recs = st.selectbox("Количество рекомендаций:", [1, 3, 5, 10])
+if st.button("Найти"):
+    if user_input:
+        user_embedding = embed_bert_cls(user_input, model, tokenizer).astype('float32').reshape(1, -1)
+        distances, top_indices = index.search(user_embedding, num_recs)  # Здесь добавляем переменную distances
+        st.write(f"Рекомендованные фильмы (Топ-{num_recs}):")
+        for i, index in enumerate(top_indices[0]):
+            col1, col2, col3 = st.columns([1, 4, 1])  # Добавляем ещё одну колонку для уверенности
+            with col1:
+                try:
+                    st.image(images[index])  # Загружаем обложку фильма
+                except Exception as e:
+                    st.write(f"Could not display image at index {index}. Error: {e}")  # Это на случай отсутствия обложки
+            with col2:
+                st.markdown(f"[{titles[index]}]({links[index]})")  # Название фильма сделано кликабельным
+                st.write(descr[index])  # Выводим описание фильма
+            with col3:
+                st.write(f"Уверенность: {1 / (1 + distances[0][i]):.2f}")  # Выводим уверенность

requirements.txt ADDED Viewed

	@@ -0,0 +1,76 @@

+altair==5.1.1
+attrs==23.1.0
+blinker==1.6.2
+cachetools==5.3.1
+certifi==2023.7.22
+charset-normalizer==3.2.0
+click==8.1.7
+cmake==3.27.2
+faiss-gpu==1.7.2
+filelock==3.12.3
+fsspec==2023.6.0
+gitdb==4.0.10
+GitPython==3.1.33
+huggingface-hub==0.16.4
+idna==3.4
+importlib-metadata==6.8.0
+Jinja2==3.1.2
+joblib==1.3.2
+jsonschema==4.19.0
+jsonschema-specifications==2023.7.1
+lit==16.0.6
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.1
+numpy==1.25.2
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+packaging==23.1
+pandas==2.1.0
+Pillow==9.5.0
+protobuf==4.24.2
+pyarrow==13.0.0
+pydeck==0.8.0
+Pygments==2.16.1
+Pympler==1.0.1
+python-dateutil==2.8.2
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0.1
+referencing==0.30.2
+regex==2023.8.8
+requests==2.31.0
+rich==13.5.2
+rpds-py==0.10.0
+safetensors==0.3.3
+six==1.16.0
+smmap==5.0.0
+streamlit==1.26.0
+sympy==1.12
+tenacity==8.2.3
+tokenizers==0.13.3
+toml==0.10.2
+toolz==0.12.0
+torch==2.0.1
+tornado==6.3.3
+tqdm==4.66.1
+transformers==4.32.1
+triton==2.0.0
+typing_extensions==4.7.1
+tzdata==2023.3
+tzlocal==4.3.1
+urllib3==2.0.4
+validators==0.21.2
+watchdog==3.0.0
+zipp==3.16.2