File size: 2,369 Bytes
7d6888a
 
 
4850728
7d6888a
 
 
4850728
7d6888a
 
 
 
4850728
7d6888a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4850728
4c04658
7d6888a
 
 
 
 
 
 
 
 
 
80871cf
7d6888a
4850728
 
 
7d6888a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from embed_with_db import embeddings, config, client
from tqdm import tqdm

class VectorDataBase():
    """Split a document into chunks and store their embeddings in MongoDB.

    Two ingestion paths are provided:

    * ``docs_embeddings`` hands every chunk to
      ``MongoDBAtlasVectorSearch.from_documents`` in a single call.
    * ``embedding_with_loop`` embeds and inserts the chunks one at a time,
      skipping the first ``start_page`` chunks.
    """

    def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
        """Configure the source document and the destination collection.

        Args:
            file_path: Path handed to the loader. When ``file_type`` is
                neither ``'pdf'`` nor ``'text'`` this value is itself passed
                to the splitter as the text to chunk.
            db_collection: Name of the collection inside
                ``client[config['DB_NAME']]`` that receives the chunks.
            file_type: ``'pdf'`` or ``'text'`` (compared case-insensitively).
            page_start: Number of leading chunks ``embedding_with_loop``
                skips. Converted with ``int()``.

        Raises:
            ValueError: If ``page_start`` is negative.
        """
        # Validate before touching the database client so that a bad argument
        # fails fast. A negative value would otherwise make the slice in
        # ``embedding_with_loop`` wrap around and ingest only the tail.
        start_page = int(page_start)
        if start_page < 0:
            raise ValueError(f'page_start must be non-negative, got {page_start!r}')

        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]
        self.start_page = start_page

    def load_docs_split(self):
        """Load the source and return it split into chunks.

        Returns:
            Whatever the text splitter produces: ``split_documents`` of the
            loaded documents for ``'pdf'``/``'text'``, otherwise
            ``create_documents`` applied to ``file_path`` itself.
        """
        kind = str(self.file_type).lower()
        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            # No loader for this type: file_path is treated as the raw text.
            # NOTE(review): presumably intentional (inline text input) --
            # confirm, since a typo such as 'txt' also lands here.
            return self.text_splitter.create_documents([self.file_path])

        return self.text_splitter.split_documents(loader.load())

    def docs_embeddings(self):
        """Embed and store every chunk through ``MongoDBAtlasVectorSearch``.

        Returns:
            The vector store returned by ``from_documents``, or the string
            ``'Some issues'`` when the source produced no chunks. The string
            return is kept for backward compatibility with existing callers.
        """
        texts = self.load_docs_split()
        if not texts:
            print('documents is not embedded')
            return 'Some issues'

        docsearch = MongoDBAtlasVectorSearch.from_documents(
            texts,
            embeddings,
            collection=self.db_collection,
            index_name=config['VECTOR_SEARCH_INDEX'],
        )
        print('done!')
        return docsearch

    def add_collection_database(self, doc):
        """Embed one chunk and insert it into the collection.

        Args:
            doc: Object exposing ``page_content`` and a ``metadata`` mapping.
                ``source`` defaults to ``'Unknown'`` and ``page`` to ``0``
                when missing from the metadata.
        """
        text = doc.page_content
        self.db_collection.insert_one(
            {
                'text': text,
                'embedding': embeddings.embed_query(text),
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0),
            }
        )

    def embedding_with_loop(self):
        """Embed and insert the chunks one by one, with a progress bar.

        The first ``start_page`` chunks are skipped. The offset indexes the
        chunk list returned by ``load_docs_split``, not the page numbers of
        the original document.

        Raises:
            ValueError: If the source produced no chunks.
        """
        docs = self.load_docs_split()
        if not docs:
            raise ValueError('Some issue with it')

        for doc in tqdm(docs[self.start_page:]):
            self.add_collection_database(doc)
        print('Done')