File size: 6,828 Bytes

7b7289a

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f817eb8-9b0a-4340-9c36-8d3eb7f55f97",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "\n",
    "# Relative path of the saved FastText model file.\n",
    "MODEL_PATH = \"ft_model1.model\"\n",
    "\n",
    "# Restore the trained model from disk.\n",
    "# NOTE(review): gensim's load() unpickles the file, so only point this at\n",
    "# model files that come from a trusted source.\n",
    "ft_model = gensim.models.FastText.load(MODEL_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c91f18bf-21e1-4bca-aad6-a4441c079f8f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['vana', 'vakomana']\n",
      "['vasikana']\n"
     ]
    }
   ],
   "source": [
    "expression = 'vana + vakomana - vasikana'\n",
    "\n",
    "# Rewriting each '-' as '+-' lets a single split on '+' produce every term,\n",
    "# with subtracted terms still carrying their leading minus sign.\n",
    "terms = [term.strip() for term in expression.replace('-', '+-').split('+')]\n",
    "\n",
    "# Terms without a leading '-' are added; blank terms are dropped.\n",
    "positive = [term for term in terms if term and not term.startswith('-')]\n",
    "# Terms with a leading '-' are subtracted; they are stored without the sign.\n",
    "negative = [term[1:].strip() for term in terms if term.startswith('-')]\n",
    "\n",
    "print(positive)\n",
    "print(negative)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a9ffbb58-f536-49e5-8e06-f7a5c9b52326",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of neighbours to return. Defined in this cell so it does not rely\n",
    "# on a later cell having been run first.\n",
    "top_n = 5\n",
    "\n",
    "if not positive:\n",
    "    raise ValueError(\"Positive word list is empty.\")\n",
    "\n",
    "# get_vector() hands back a read-only array (an in-place += on it raises\n",
    "# \"ValueError: output array is read-only\"), so accumulate into a copy.\n",
    "result_vector = ft_model.wv.get_vector(positive[0]).copy()\n",
    "\n",
    "# Add vectors for the remaining positive words\n",
    "# NOTE(review): a FastText model may build vectors for unseen words from\n",
    "# character n-grams, in which case these membership checks may never fail - confirm.\n",
    "for word in positive[1:]:\n",
    "    if word in ft_model.wv:\n",
    "        result_vector += ft_model.wv.get_vector(word)\n",
    "    else:\n",
    "        raise KeyError(f\"Word '{word}' not in vocabulary\")\n",
    "\n",
    "# Subtract vectors for negative words\n",
    "for word in negative:\n",
    "    if word in ft_model.wv:\n",
    "        result_vector -= ft_model.wv.get_vector(word)\n",
    "    else:\n",
    "        raise KeyError(f\"Word '{word}' not in vocabulary\")\n",
    "\n",
    "# Find the words whose vectors are closest to the combined vector\n",
    "result = ft_model.wv.most_similar(positive=[result_vector], topn=top_n)\n",
    "\n",
    "# Format the results as a list of {'word', 'similarity'} records\n",
    "response = [{'word': word, 'similarity': similarity} for word, similarity in result]\n",
    "\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a9c823fa-d1f8-4124-8e60-418986eea4a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1edd0ca0-38b9-44c8-beb3-25ebc2793223",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'word': 'vana', 'similarity': 0.92390615}, {'word': 'nhevana', 'similarity': 0.7112871}, {'word': 'twevana', 'similarity': 0.6789861}, {'word': 'vanajib', 'similarity': 0.66515744}, {'word': 'svana', 'similarity': 0.6477824}]\n"
     ]
    }
   ],
   "source": [
    "top_n = 5\n",
    "\n",
    "# Check types before emptiness so a non-list (e.g. None) is reported as a\n",
    "# TypeError instead of being mistaken for an empty list.\n",
    "if not isinstance(positive, list) or not isinstance(negative, list):\n",
    "    raise TypeError(\"Positive and negative should be lists.\")\n",
    "if not positive:\n",
    "    raise ValueError(\"Positive word list is empty.\")\n",
    "\n",
    "\n",
    "def cosine_similarity(vec1, vec2):\n",
    "    \"\"\"Return the cosine similarity of two 1-D vectors.\n",
    "\n",
    "    A zero-length vector has no direction, so 0.0 is returned in that case\n",
    "    instead of dividing by zero; the resulting NaN would otherwise make the\n",
    "    sort order of the results meaningless.\n",
    "    \"\"\"\n",
    "    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)\n",
    "    if denominator == 0:\n",
    "        return 0.0\n",
    "    return np.dot(vec1, vec2) / denominator\n",
    "\n",
    "\n",
    "# Validate every word up front, including the first positive word.\n",
    "# NOTE(review): a FastText model may build vectors for unseen words from\n",
    "# character n-grams, in which case this check may never fail - confirm.\n",
    "for word in positive + negative:\n",
    "    if word not in ft_model.wv:\n",
    "        raise KeyError(f\"Word '{word}' not in vocabulary\")\n",
    "\n",
    "# Start from a writable copy of the first positive word vector; the in-place\n",
    "# arithmetic below must not write into the array returned by get_vector().\n",
    "result_vector = ft_model.wv.get_vector(positive[0]).copy()\n",
    "\n",
    "# Add the remaining positive words, then subtract the negative ones\n",
    "for word in positive[1:]:\n",
    "    result_vector += ft_model.wv.get_vector(word)\n",
    "for word in negative:\n",
    "    result_vector -= ft_model.wv.get_vector(word)\n",
    "\n",
    "# Score every vocabulary word against the combined vector\n",
    "cosine_similarities = [\n",
    "    (word, cosine_similarity(result_vector, ft_model.wv.get_vector(word)))\n",
    "    for word in ft_model.wv.index_to_key\n",
    "]\n",
    "\n",
    "# Highest similarity first; keep only the top_n entries\n",
    "sorted_similarities = sorted(cosine_similarities, key=lambda x: x[1], reverse=True)\n",
    "top_results = sorted_similarities[:top_n]\n",
    "\n",
    "# Format the results as a list of {'word', 'similarity'} records\n",
    "response = [{'word': word, 'similarity': similarity} for word, similarity in top_results]\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dbc6092-275c-45cf-b137-29908f3d2b73",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}