{ "cells": [ { "cell_type": "code", "execution_count": 53, "id": "408df710-efb3-45e0-94e1-5c4bdac72c06", "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders import TextLoader\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.vectorstores import FAISS\n", "from langchain.embeddings import OllamaEmbeddings\n", "from langchain.chains import RetrievalQA\n", "from langchain.chat_models import ChatOllama\n", "\n" ] }, { "cell_type": "code", "execution_count": 54, "id": "b0c1e9a7-85c6-48fb-bc81-b5903b67c044", "metadata": {}, "outputs": [], "source": [ "from langchain.schema import Document\n", "with open(\"untitled.txt\", 'r') as f:\n", " doc = f.read()\n", "\n", "docs = [Document(page_content=doc)]\n" ] }, { "cell_type": "code", "execution_count": 55, "id": "b15e44da-6beb-489c-b847-d3276915ce8d", "metadata": {}, "outputs": [], "source": [ "splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50)\n", "chunks = splitter.split_documents(docs)" ] }, { "cell_type": "code", "execution_count": 56, "id": "4f6a18b4-cb85-481a-b43f-f38c60959155", "metadata": {}, "outputs": [], "source": [ "embeddings = OllamaEmbeddings(model='llama3.2')" ] }, { "cell_type": "code", "execution_count": 57, "id": "7a7709bc-48a3-4771-af21-5c48e5ae9296", "metadata": {}, "outputs": [], "source": [ "vector_store = []\n", "for i in range(len(chunks)):\n", " em = embeddings.embed_query(chunks[i].page_content)\n", " vector_store.append(em)\n", " " ] }, { "cell_type": "code", "execution_count": 96, "id": "f95b010f-c384-4111-af98-43ba51568b08", "metadata": {}, "outputs": [], "source": [ "def similarity_search(te):\n", " result_list = [] # create a list to store all the embeddings\n", " emb = embeddings.embed_query(te) # create an embedding for our \"te\"\n", " for i in range(8): # we have created 8 chunks\n", " result = 0 # initialize the result for each chunk\n", " for j in range(3072): # we have 3072 dimentional vector as a representation for each chunk and out text\n", " result += emb[j] * vector_store[i][j]# we then take the dot product\n", " result = result / 55.42\n", " result_list.append({f'chunk {i+1}':result}) # and finally append the dot product in our return_list\n", " return result_list\n", " " ] }, { "cell_type": "code", "execution_count": 97, "id": "e63c7e94-56a7-4333-804d-e7c2b8697c77", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'chunk 1': 84.98757149361835}, {'chunk 2': 75.60246478296749}, {'chunk 3': 79.17318761328006}, {'chunk 4': 80.69328472997623}, {'chunk 5': 68.01708598133246}, {'chunk 6': 67.64770328462416}, {'chunk 7': 87.4843210948032}, {'chunk 8': 78.75659878926277}]\n" ] } ], "source": [ "print(similarity_search(\"Do not share passwords or access credentials. 7. Performance Reviews Formal reviews conducted every 6 months.\"))" ] }, { "cell_type": "code", "execution_count": 84, "id": "5418bed3-8cd4-403d-ab6b-8f476cac7f41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Use company devices for official work only.\\n\\nKeep systems updated and report any security incidents.\\n\\nDo not share passwords or access credentials.\\n\\n7. Performance Reviews\\n\\nFormal reviews conducted every 6 months.\\n\\nFocus on personal growth, goals, and team contributions.\\n\\n8. 
Learning & Development'" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunks[5].page_content\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "fe2342b3-3f82-4ffa-9ad4-c19f12dded21", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 5 }