File size: 1,881 Bytes
ed74064 bed851f ed74064 bc332e6 ed74064 bc332e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import pathlib
import gdown
from .chains import PdfAndGoogleChain
def embed_pdf(folder: str = 'data', name: str = 'book.pdf'):
    """Load the book PDF and split it into overlapping text chunks.

    If ``folder/name`` does not exist yet, the PDF is first downloaded from
    Google Drive into that location. Despite the function name, no
    embeddings are computed here: the function only loads the PDF and
    splits it.

    Args:
        folder: Directory that holds (or will hold) the PDF. It is created
            if missing, including any missing parent directories.
        name: File name of the PDF inside ``folder``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the loaded PDF, using chunks of 1000 characters with a 100-character
        overlap.
    """
    target_dir = pathlib.Path(folder)
    # parents=True lets callers pass a nested folder such as "cache/data";
    # without it mkdir raises FileNotFoundError when a parent is missing.
    target_dir.mkdir(parents=True, exist_ok=True)
    path = target_dir.joinpath(name)

    if not path.exists():
        print('Downloading book PDF.')
        # fuzzy=True is needed because this is a Drive "view" link,
        # not a direct download URL.
        gdown.download(
            'https://drive.google.com/file/d/1CwhFM4gInp9xV4G4sdnYE_rN0StmqQ2z/view?usp=drive_link',
            str(path),
            fuzzy=True,
        )

    loader = PyPDFLoader(str(path))
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    return splitter.split_documents(documents)
class AcademicClient:
    """Question-answering client backed by a Chroma store of the book PDF.

    On construction the client opens a persisted Chroma database if one is
    found on disk (``db`` first, then ``src/db``); otherwise it builds one
    from the PDF chunks. It then wraps an MMR retriever over that store in
    a ``PdfAndGoogleChain``.
    """

    def __init__(self):
        """Prepare the vector store and the answering chain."""
        self.create_vectordb()
        retriever = self.vectordb.as_retriever(
            search_type="mmr",
            search_kwargs={"fetch_k": 30, "k": 6},
        )
        self.chain = PdfAndGoogleChain(retriever)

    def create_vectordb(self):
        """Set ``self.vectordb`` from a cached store, or rebuild it.

        Candidate directories are probed in order; the first one that
        exists is opened as-is. If none exists, the PDF chunks are
        embedded with OpenAI embeddings and persisted under ``db``.
        """
        for cached_dir in ('db', 'src/db'):
            if pathlib.Path(cached_dir).exists():
                self.vectordb = Chroma(
                    persist_directory=cached_dir,
                    embedding_function=OpenAIEmbeddings(),
                )
                return

        # No cached store found: rebuilding calls the OpenAI embeddings API.
        print('Not found cached DB. Rebuilding DB state, could use money from OPENAI!!!!')
        chunks = embed_pdf()
        store = Chroma.from_documents(
            documents=chunks,
            embedding=OpenAIEmbeddings(),
            persist_directory="db",
        )
        self.vectordb = store
        store.persist()

    def answer(self, query, options):
        """Delegate to the chain's ``answer`` and return its result."""
        reply = self.chain.answer(query, options)
        return reply
|