# Spaces: Runtime error
# (Appears to be a page status banner captured along with the source; it is not part of the program.)
from langchain_mongodb import MongoDBAtlasVectorSearch | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import PyPDFLoader, TextLoader | |
from embed_with_db import embeddings, config, client | |
from tqdm import tqdm | |
class VectorDataBase:
    """Split a document into chunks and store them, with embeddings, in a database collection.

    The collection handle is looked up as ``client[config['DB_NAME']][db_collection]``
    and the embedding model is the module-level ``embeddings`` object; both come
    from ``embed_with_db``.
    """

    def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
        """Store the input settings and resolve the target collection.

        Args:
            file_path: Path handed to the loader. For any ``file_type`` other
                than ``'pdf'`` / ``'text'`` this value is itself split as raw
                text (see ``load_docs_split``).
            db_collection: Name of the collection inside ``config['DB_NAME']``.
            file_type: ``'pdf'`` or ``'text'`` (compared case-insensitively).
            page_start: Offset coerced with ``int()``. It is only consulted by
                ``embedding_with_loop``, where it indexes the list of split
                chunks rather than source pages.
        """
        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512,
            chunk_overlap=32,
        )
        database = client[config['DB_NAME']]
        self.db_collection = database[db_collection]
        self.start_page = int(page_start)

    def load_docs_split(self):
        """Load the source and return it split into chunks.

        Returns:
            The result of ``split_documents`` on the loaded documents for
            ``'pdf'`` and ``'text'`` inputs; otherwise the result of
            ``create_documents`` applied to ``file_path`` itself.
        """
        kind = str(self.file_type).lower()
        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            # No loader for this type: the ``file_path`` value is chunked as
            # literal text instead of being opened as a file.
            return self.text_splitter.create_documents([self.file_path])

        loaded = loader.load()
        return self.text_splitter.split_documents(loaded)

    def docs_embeddings(self):
        """Embed every chunk in a single ``MongoDBAtlasVectorSearch.from_documents`` call.

        ``start_page`` is not applied here; all chunks are passed on.

        Returns:
            The object returned by ``from_documents``, or the string
            ``'Some issues'`` when there are no chunks.
        """
        chunks = self.load_docs_split()
        if not chunks:
            print('documents is not embedded')
            return 'Some issues'

        store = MongoDBAtlasVectorSearch.from_documents(
            chunks,
            embeddings,
            collection=self.db_collection,
            index_name=config['VECTOR_SEARCH_INDEX'],
        )
        print('done!')
        return store

    def add_collection_database(self, doc):
        """Insert one chunk into the collection together with its embedding.

        Args:
            doc: Object exposing ``page_content`` and a ``metadata`` mapping.
                Missing ``source`` / ``page`` metadata fall back to
                ``'Unknown'`` and ``0``.
        """
        content = doc.page_content
        record = {
            'text': content,
            'embedding': embeddings.embed_query(doc.page_content),
            'source': doc.metadata.get('source', 'Unknown'),
            'page': doc.metadata.get('page', 0),
        }
        self.db_collection.insert_one(record)

    def embedding_with_loop(self):
        """Insert chunks one at a time, showing progress with ``tqdm``.

        The first ``start_page`` chunks are skipped.

        Raises:
            Exception: If ``load_docs_split`` returns nothing.
        """
        chunks = self.load_docs_split()
        if not chunks:
            raise Exception('Some issue with it')

        # The offset is applied to the chunk list, not to page numbers.
        pending = chunks[self.start_page:]
        for chunk in tqdm(pending):
            self.add_collection_database(chunk)
        print('Done')