{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "nb-title",
   "metadata": {},
   "source": [
    "# PDF question answering: retrieval plus web search\n",
    "\n",
    "Downloads a book as PDF, indexes it in a persisted Chroma vector store, and answers questions with a LangChain pipeline that feeds the retrieved passages and a Google Serper search result into an OpenAI chat model.\n",
    "\n",
    "Run the cells top to bottom. Credentials are loaded from a `.env` file via `load_dotenv()` (presumably the OpenAI, Serper and LangSmith keys; confirm against the respective client documentation)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ced23bcbc0e28e5",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pathlib\n",
    "import warnings\n",
    "\n",
    "import gdown\n",
    "from dotenv import load_dotenv\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "from langchain_community.utilities import GoogleSerperAPIWrapper\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.prompts import PromptTemplate\n",
    "from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableParallel\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nb-config-header",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-04-09T13:31:50.992692600Z",
     "start_time": "2024-04-09T13:31:50.975349700Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "True"
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6de359e6f0e68ac",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T13:31:50.996689600Z",
     "start_time": "2024-04-09T13:31:50.989345500Z"
    }
   },
   "outputs": [],
   "source": [
    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nb-constants",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Paths are relative to the notebook's working directory.\n",
    "DATA_DIR = \"../data\"  # folder that holds the source PDF\n",
    "PDF_NAME = \"book.pdf\"\n",
    "DB_DIR = \"db\"  # Chroma persistence directory\n",
    "BOOK_URL = \"https://drive.google.com/file/d/1CwhFM4gInp9xV4G4sdnYE_rN0StmqQ2z/view?usp=drive_link\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b45ee8734cc3396",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T13:31:51.750122200Z",
     "start_time": "2024-04-09T13:31:50.996689600Z"
    }
   },
   "outputs": [],
   "source": [
    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0125\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nb-download-header",
   "metadata": {},
   "source": [
    "## Download the book\n",
    "\n",
    "The PDF has to exist before the vector store is built, so the download runs first. It is skipped when the file is already present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0a54b5e476b46e0",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def embed_pdf(folder: str = 'data', name: str = 'book.pdf', url: str = BOOK_URL) -> pathlib.Path:\n",
    "    \"\"\"Make sure the book PDF is available locally, downloading it when missing.\n",
    "\n",
    "    Args:\n",
    "        folder: Directory that holds the PDF; created if it does not exist.\n",
    "        name: File name of the PDF inside ``folder``.\n",
    "        url: Google Drive share link the PDF is downloaded from.\n",
    "\n",
    "    Returns:\n",
    "        Path of the PDF (``folder / name``).\n",
    "    \"\"\"\n",
    "    target_dir = pathlib.Path(folder)\n",
    "    # parents=True so a nested folder can be created on a fresh checkout.\n",
    "    target_dir.mkdir(parents=True, exist_ok=True)\n",
    "    path = target_dir.joinpath(name)\n",
    "    if not path.exists():\n",
    "        print('Downloading book PDF.')\n",
    "        # fuzzy=True lets gdown extract the file id from a share-page link.\n",
    "        gdown.download(url, str(path), fuzzy=True)\n",
    "    return path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nb-download-call",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading book PDF.\n"
     ]
    }
   ],
   "source": [
    "pdf_path = embed_pdf(DATA_DIR, PDF_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nb-vectorstore-header",
   "metadata": {},
   "source": [
    "## Build or load the vector store\n",
    "\n",
    "Embedding the whole book is the expensive step, so the index is persisted to `DB_DIR` and reused on later runs. Delete that directory to force a rebuild."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ecda08560566442",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "embeddings = OpenAIEmbeddings()\n",
    "\n",
    "if pathlib.Path(DB_DIR).exists():\n",
    "    # Reuse the index persisted by an earlier run instead of re-embedding the book.\n",
    "    vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)\n",
    "else:\n",
    "    loader = PyPDFLoader(str(pdf_path))\n",
    "    documents = loader.load()\n",
    "    splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=1000,\n",
    "        chunk_overlap=100,\n",
    "    )\n",
    "    texts = splitter.split_documents(documents)\n",
    "    vectordb = Chroma.from_documents(\n",
    "        documents=texts,\n",
    "        embedding=embeddings,\n",
    "        persist_directory=DB_DIR,\n",
    "    )\n",
    "    vectordb.persist()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nb-prompt-header",
   "metadata": {},
   "source": [
    "## Prompt\n",
    "\n",
    "The template (written in Ukrainian) has three placeholders: `{context}` for the retrieved passages, `{internet}` for the web-search answer, and `{question}` for the user's question."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1827a7ad093fa60a",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "CUSTOM_RAG_TEMPLATE = \"\"\"\n",
    "Використай наступні **надійні** елементи, для того, щоб відповісти на питання в кінці. \n",
    "Якщо вони не містять відповіді, зверни увагу на відповідь з інтернету, хоча вона може бути не надійною. \n",
    "Якщо ти не знаєш відповіді, використаши всі свої джерела, то просто скажи про це, не потрібно вигадувати відповідь.\n",
    "Використовуй не більше трьох речень, та намагайся відповісти коротко та чітко.\n",
    "\n",
    "{context}\n",
    "\n",
    "Відповідь з інтернету: {internet}\n",
    "\n",
    "Питання: {question}\n",
    "\n",
    "Корисна відповідь:\"\"\"\n",
    "\n",
    "custom_rag_prompt = PromptTemplate.from_template(CUSTOM_RAG_TEMPLATE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nb-chain-header",
   "metadata": {},
   "source": [
    "## RAG chain\n",
    "\n",
    "The question is fanned out to the retriever and to the web search, and is also passed through unchanged; the three results fill the prompt placeholders."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64cb22281c854513",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "retriever = vectordb.as_retriever()\n",
    "search = GoogleSerperAPIWrapper()\n",
    "\n",
    "\n",
    "def use_google_search(query):\n",
    "    \"\"\"Return the web-search answer for ``query``, or 'NONE' if the lookup fails.\n",
    "\n",
    "    The web result is only a supplementary source for the prompt, so a failed\n",
    "    lookup must not abort the chain. The failure is reported as a warning\n",
    "    instead of being swallowed silently.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return search.run(query)\n",
    "    except Exception as ex:\n",
    "        warnings.warn(f\"Google search failed, continuing without it: {ex!r}\")\n",
    "        return 'NONE'\n",
    "\n",
    "\n",
    "def documents_parser(docs):\n",
    "    \"\"\"Join the ``page_content`` of the given documents, separated by blank lines.\"\"\"\n",
    "    return \"\\n\\n\".join(doc.page_content for doc in docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nb-chain-build",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rag_chain = (\n",
    "    {\n",
    "        \"context\": retriever | documents_parser,\n",
    "        \"internet\": RunnableLambda(use_google_search),\n",
    "        \"question\": RunnablePassthrough(),\n",
    "    }\n",
    "    | custom_rag_prompt\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2fe5487662fc6f0",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T18:59:09.631586300Z",
     "start_time": "2024-04-09T18:59:09.610952200Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "langchain_core.runnables.base.RunnableSequence"
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(rag_chain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a36756422b7544",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T13:38:50.033577300Z",
     "start_time": "2024-04-09T13:38:45.864222300Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "'До конституційних засад сучасної політичної системи України входять демократія, принцип верховенства права, гарантії прав та свобод громадян, розділення влади на виконавчу, законодавчу та судову.'"
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag_chain.invoke(\"Які конституційні засади сучасної політичної системи України ви знаєте?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c2ec151bf629265",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T13:38:33.477869300Z",
     "start_time": "2024-04-09T13:38:32.098891500Z"
    }
   },
   "outputs": [],
   "source": [
    "stuff = search.run('Які конституційні засади сучасної політичної системи України ви знаєте?')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nb-scratch-header",
   "metadata": {},
   "source": [
    "## Scratch: merging a parallel branch into the input dict\n",
    "\n",
    "Small experiment: run an extra branch next to a passthrough and flatten the result so the new key sits beside the original input keys."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "720b1320bc0fb7d8",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def flatten_input(d):\n",
    "    \"\"\"Merge the nested 'a' mapping with the sibling keys of ``d`` into one new dict.\n",
    "\n",
    "    Args:\n",
    "        d: Mapping with an 'a' entry (itself a mapping) plus any number of extra keys.\n",
    "\n",
    "    Returns:\n",
    "        A new dict holding the items of ``d['a']`` followed by the other items of\n",
    "        ``d``; on a key clash the sibling value wins.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If ``d`` has no 'a' entry.\n",
    "    \"\"\"\n",
    "    # Build a fresh dict rather than updating d['a'] in place: the passthrough\n",
    "    # branch forwards the caller's input object, so an in-place update would\n",
    "    # leak the extra keys into the caller's dict.\n",
    "    flat = dict(d['a'])\n",
    "    flat.update((key, value) for key, value in d.items() if key != 'a')\n",
    "    return flat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nb-scratch-chain",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "a = (\n",
    "    RunnableParallel(a=RunnablePassthrough(), b=RunnableLambda(lambda x: \"abracadabra\"))\n",
    "    | RunnableLambda(flatten_input)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "465e8521af889ff6",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T19:14:33.976773500Z",
     "start_time": "2024-04-09T19:14:33.918734500Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "{'xx': 'yy', 'zz': 11, 'b': 'abracadabra'}"
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.invoke({\"xx\" : \"yy\", \"zz\" : 11})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}