|
from langchain_openai import OpenAIEmbeddings |
|
from langchain_community.vectorstores import Chroma |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.document_loaders import PyPDFLoader |
|
import pathlib |
|
import gdown |
|
from .chains import PdfAndGoogleChain |
|
|
|
|
|
def embed_pdf(folder: str = 'data', name: str = 'book.pdf', *,
              chunk_size: int = 1000, chunk_overlap: int = 100):
    """Load the book PDF and split it into overlapping text chunks.

    The PDF is expected at ``folder/name``. If it is not there yet, it is
    downloaded from a fixed Google Drive link first.

    Args:
        folder: Directory that holds (or will hold) the PDF. It is created,
            including any missing parent directories, when absent.
        name: File name of the PDF inside ``folder``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``splitter.split_documents`` returns for the loaded PDF
        (presumably a list of LangChain documents -- confirm against the
        installed LangChain version).
    """
    target_dir = pathlib.Path(folder)
    # parents=True so a nested folder such as "data/books" does not fail.
    target_dir.mkdir(parents=True, exist_ok=True)
    path = target_dir / name

    if not path.exists():
        print('Downloading book PDF.')
        # fuzzy=True is passed because the URL is a Drive "view" link.
        gdown.download(
            'https://drive.google.com/file/d/1CwhFM4gInp9xV4G4sdnYE_rN0StmqQ2z/view?usp=drive_link',
            str(path),
            fuzzy=True,
        )

    documents = PyPDFLoader(str(path)).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
|
|
|
|
|
class AcademicClient:
    """Question-answering client backed by a Chroma vector store.

    On construction the vector store is opened from a cached directory
    (``db`` or ``src/db``) or rebuilt from the book PDF, and a
    ``PdfAndGoogleChain`` is created on top of its retriever.
    """

    def __init__(self):
        """Prepare the vector store and wire the answering chain to it."""
        self.create_vectordb()
        retriever = self.vectordb.as_retriever(
            search_type="mmr",
            search_kwargs={"fetch_k": 30, "k": 6},
        )
        self.chain = PdfAndGoogleChain(retriever)

    def create_vectordb(self):
        """Set ``self.vectordb``, reusing a cached store when one exists.

        ``db`` is checked before ``src/db``; the first path that exists is
        opened as the persist directory. When neither exists, the PDF is
        embedded from scratch and the new store is persisted under ``db``.
        """
        cached_dir = next(
            (candidate for candidate in ('db', 'src/db')
             if pathlib.Path(candidate).exists()),
            None,
        )
        if cached_dir is not None:
            self.vectordb = Chroma(
                persist_directory=cached_dir,
                embedding_function=OpenAIEmbeddings(),
            )
            return

        # No cache found: rebuilding calls the OpenAI embeddings API.
        print('Not found cached DB. Rebuilding DB state, could use money from OPENAI!!!!')
        chunks = embed_pdf()
        self.vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=OpenAIEmbeddings(),
            persist_directory="db",
        )
        self.vectordb.persist()

    def answer(self, query, options):
        """Delegate to ``self.chain.answer`` and return its result."""
        chain = self.chain
        return chain.answer(query, options)
|
|