{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "7f817eb8-9b0a-4340-9c36-8d3eb7f55f97", "metadata": {}, "outputs": [], "source": [ "import gensim\n", "\n", "# Load the model\n", "ft_model = gensim.models.FastText.load(\"ft_model1.model\")\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "c91f18bf-21e1-4bca-aad6-a4441c079f8f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['vana', 'vakomana']\n", "['vasikana']\n" ] } ], "source": [ "expression = 'vana + vakomana - vasikana'\n", "\n", "positive = []\n", "negative = []\n", "\n", "parts = expression.replace('-', '+-').split('+')\n", "\n", "for part in parts:\n", " part = part.strip()\n", " if part.startswith('-'):\n", " negative.append(part[1:].strip())\n", " elif part:\n", " positive.append(part)\n", "\n", "print(positive)\n", "print(negative)" ] }, { "cell_type": "code", "execution_count": 7, "id": "a9ffbb58-f536-49e5-8e06-f7a5c9b52326", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "output array is read-only", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[7], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m positive[\u001b[38;5;241m1\u001b[39m:]:\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m ft_model\u001b[38;5;241m.\u001b[39mwv:\n\u001b[0;32m----> 6\u001b[0m result_vector \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m ft_model\u001b[38;5;241m.\u001b[39mwv\u001b[38;5;241m.\u001b[39mget_vector(word)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWord \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m not in vocabulary\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[0;31mValueError\u001b[0m: output array is read-only" ] } ], "source": [ "\n", "result_vector = ft_model.wv.get_vector(positive[0]) \n", "\n", "# Add vectors for positive words\n", "for word in positive[1:]:\n", " if word in ft_model.wv:\n", " result_vector += ft_model.wv.get_vector(word)\n", " else:\n", " raise KeyError(f\"Word '{word}' not in vocabulary\")\n", "\n", "# Subtract vectors for negative words\n", "for word in negative:\n", " if word in ft_model.wv:\n", " result_vector -= ft_model.wv.get_vector(word)\n", " else:\n", " raise KeyError(f\"Word '{word}' not in vocabulary\")\n", "\n", "# Find similar words\n", "result = ft_model.wv.most_similar(positive=[result_vector], topn=top_n)\n", "\n", "# Format the results\n", "response = [{'word': word, 'similarity': similarity} for word, similarity in result]\n", "\n", "\n", "print(response)" ] }, { "cell_type": "code", "execution_count": 9, "id": "a9c823fa-d1f8-4124-8e60-418986eea4a3", "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 11, "id": "1edd0ca0-38b9-44c8-beb3-25ebc2793223", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'word': 'vana', 'similarity': 0.92390615}, {'word': 'nhevana', 'similarity': 0.7112871}, {'word': 'twevana', 'similarity': 0.6789861}, {'word': 'vanajib', 'similarity': 0.66515744}, {'word': 'svana', 'similarity': 0.6477824}]\n" ] } ], "source": [ "top_n = 5\n", "if not positive:\n", " raise ValueError(\"Positive word list is empty.\")\n", "if not isinstance(positive, list) or not isinstance(negative, list):\n", " raise TypeError(\"Positive and negative should be lists.\")\n", "\n", "# Initialize result_vector with a writable copy of the first positive word vector\n", "result_vector = ft_model.wv.get_vector(positive[0]).copy() \n", "\n", "# Add vectors for positive words\n", "for word in positive[1:]:\n", " if word in ft_model.wv:\n", " result_vector += ft_model.wv.get_vector(word)\n", " else:\n", " raise KeyError(f\"Word '{word}' not in vocabulary\")\n", "\n", "# Subtract vectors for negative words\n", "for word in negative:\n", " if word in ft_model.wv:\n", " result_vector -= ft_model.wv.get_vector(word)\n", " else:\n", " raise KeyError(f\"Word '{word}' not in vocabulary\")\n", "\n", "# Function to compute cosine similarity between vectors\n", "def cosine_similarity(vec1, vec2):\n", " return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))\n", "\n", "# Compute cosine similarities between result_vector and all words in the model's vocabulary\n", "cosine_similarities = []\n", "for word in ft_model.wv.index_to_key:\n", " word_vector = ft_model.wv.get_vector(word)\n", " similarity = cosine_similarity(result_vector, word_vector)\n", " cosine_similarities.append((word, similarity))\n", "\n", "# Sort by similarity and return top_n results\n", "sorted_similarities = sorted(cosine_similarities, key=lambda x: x[1], reverse=True)\n", "top_results = sorted_similarities[:top_n]\n", "\n", "# Format the results\n", "response = [{'word': word, 'similarity': similarity} for word, similarity in top_results]\n", "print(response)" ] }, { "cell_type": "code", "execution_count": null, "id": "7dbc6092-275c-45cf-b137-29908f3d2b73", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }