itachi-ai committed on
Commit
4850728
·
verified ·
1 Parent(s): 4c04658

updated vectorize

Browse files
Files changed (1) hide show
  1. vectorize.py +8 -5
vectorize.py CHANGED
@@ -1,15 +1,16 @@
1
  from langchain_mongodb import MongoDBAtlasVectorSearch
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
4
- from embed_with_db import embeddings, config, VECTORDB_STORE, client
5
  from tqdm import tqdm
6
 
7
  class VectorDataBase():
8
- def __init__(self, file_path, db_collection, file_type='pdf', ):
9
  self.file_path = file_path
10
  self.file_type= file_type
11
  self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
12
  self.db_collection = client[config['DB_NAME']][db_collection]
 
13
  def load_docs_split(self):
14
  if str(self.file_type).lower() == 'pdf':
15
  loader = PyPDFLoader(self.file_path)
@@ -36,8 +37,7 @@ class VectorDataBase():
36
  else:
37
  print('documents is not embedded')
38
  return 'Some issues'
39
- @staticmethod
40
- def add_collection_database(doc):
41
  self.db_collection.insert_one(
42
  {
43
  'text': doc.page_content,
@@ -49,6 +49,9 @@ class VectorDataBase():
49
  def embedding_with_loop(self):
50
  docs = self.load_docs_split()
51
  if docs:
52
- for doc in tqdm(docs):
53
  self.add_collection_database(doc)
 
 
 
54
 
 
1
  from langchain_mongodb import MongoDBAtlasVectorSearch
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
4
+ from embed_with_db import embeddings, config, client
5
  from tqdm import tqdm
6
 
7
  class VectorDataBase():
8
+ def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
9
  self.file_path = file_path
10
  self.file_type= file_type
11
  self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
12
  self.db_collection = client[config['DB_NAME']][db_collection]
13
+ self.start_page = int(page_start)
14
  def load_docs_split(self):
15
  if str(self.file_type).lower() == 'pdf':
16
  loader = PyPDFLoader(self.file_path)
 
37
  else:
38
  print('documents is not embedded')
39
  return 'Some issues'
40
+ def add_collection_database(self,doc):
 
41
  self.db_collection.insert_one(
42
  {
43
  'text': doc.page_content,
 
49
  def embedding_with_loop(self):
50
  docs = self.load_docs_split()
51
  if docs:
52
+ for doc in tqdm(docs[self.start_page:20]):
53
  self.add_collection_database(doc)
54
+ print('Done')
55
+ else:
56
+ raise Exception('Some issue with it')
57