from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import pathlib
import gdown
from .chains import PdfAndGoogleChain


def embed_pdf(folder: str = 'data', name: str = 'book.pdf'):
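    """Download the book PDF into `folder` if it is not already cached, then load it
    and split it into 1000-character chunks with 100-character overlap."""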
    pathlib.Path(folder).mkdir(exist_ok=True)
    path = pathlib.Path(folder).joinpath(name)
    if not path.exists():
        print('Downloading book PDF.')
        gdown.download('https://drive.google.com/file/d/1CwhFM4gInp9xV4G4sdnYE_rN0StmqQ2z/view?usp=drive_link',
                       str(path), fuzzy=True)
    loader = PyPDFLoader(str(path))
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100)
    return splitter.split_documents(
        documents
    )


class AcademicClient:
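    """Answers queries against the book PDF: loads or builds a Chroma vector store
    and routes questions through PdfAndGoogleChain."""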

    def create_vectordb(self):
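        """Reuse a persisted Chroma DB from 'db' or 'src/db' if one exists;
        otherwise split the PDF and build a new store in 'db'."""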
        if pathlib.Path('db').exists():
            self.vectordb = Chroma(persist_directory='db', embedding_function=OpenAIEmbeddings())
        elif pathlib.Path('src/db').exists():
            self.vectordb = Chroma(persist_directory='src/db', embedding_function=OpenAIEmbeddings())
        else:
            print('Cached DB not found. Rebuilding the vector store; this calls the OpenAI embeddings API and may incur costs.')
            texts = embed_pdf()
            self.vectordb = Chroma.from_documents(
                documents=texts,
                embedding=OpenAIEmbeddings(),
                persist_directory="db"
            )
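            # Newer chromadb versions persist automatically when persist_directory is set;
            # the explicit persist() call below is kept for compatibility with older setups.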
            self.vectordb.persist()

    def __init__(self):
        self.create_vectordb()
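        # MMR retrieval: fetch the 30 nearest chunks, then return the 6 most diverse of them.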
        self.chain = PdfAndGoogleChain(
            self.vectordb.as_retriever(search_type="mmr", search_kwargs={"fetch_k": 30, "k": 6}))

    def answer(self, query, options):
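        """Forward the query and answer options to the underlying PDF + Google chain."""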
        return self.chain.answer(query, options)
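

# Minimal usage sketch (an assumption, not part of this module): it requires
# OPENAI_API_KEY in the environment and the sibling .chains.PdfAndGoogleChain;
# the question and options below are purely illustrative.
#
#   client = AcademicClient()
#   print(client.answer(
#       'Which data structure gives O(1) average-time lookups?',
#       ['stack', 'hash table', 'linked list', 'binary heap'],
#   ))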