Spaces:
Runtime error
Runtime error
updated vectorize
Browse files- vectorize.py +8 -5
vectorize.py
CHANGED
@@ -1,15 +1,16 @@
|
|
1 |
from langchain_mongodb import MongoDBAtlasVectorSearch
|
2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
4 |
-
from embed_with_db import embeddings, config,
|
5 |
from tqdm import tqdm
|
6 |
|
7 |
class VectorDataBase():
|
8 |
-
def __init__(self, file_path, db_collection, file_type='pdf', ):
|
9 |
self.file_path = file_path
|
10 |
self.file_type= file_type
|
11 |
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
|
12 |
self.db_collection = client[config['DB_NAME']][db_collection]
|
|
|
13 |
def load_docs_split(self):
|
14 |
if str(self.file_type).lower() == 'pdf':
|
15 |
loader = PyPDFLoader(self.file_path)
|
@@ -36,8 +37,7 @@ class VectorDataBase():
|
|
36 |
else:
|
37 |
print('documents is not embedded')
|
38 |
return 'Some issues'
|
39 |
-
|
40 |
-
def add_collection_database(doc):
|
41 |
self.db_collection.insert_one(
|
42 |
{
|
43 |
'text': doc.page_content,
|
@@ -49,6 +49,9 @@ class VectorDataBase():
|
|
49 |
def embedding_with_loop(self):
|
50 |
docs = self.load_docs_split()
|
51 |
if docs:
|
52 |
-
for doc in tqdm(docs):
|
53 |
self.add_collection_database(doc)
|
|
|
|
|
|
|
54 |
|
|
|
1 |
from langchain_mongodb import MongoDBAtlasVectorSearch
|
2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
4 |
+
from embed_with_db import embeddings, config, client
|
5 |
from tqdm import tqdm
|
6 |
|
7 |
class VectorDataBase():
|
8 |
+
def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
    """Record the input-file settings and bind the target collection.

    Args:
        file_path: Path handed to the document loader later on.
        db_collection: Name of the collection looked up inside the
            database named by ``config['DB_NAME']``.
        file_type: Kind of input file; compared case-insensitively
            against ``'pdf'`` by ``load_docs_split``.
        page_start: Start index used when slicing the loaded documents;
            coerced with ``int()``.
    """
    self.file_path = file_path
    self.file_type = file_type
    # Splitter settings: chunks of 512 with an overlap of 32.
    self.text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=32,
    )
    # Resolve the database first, then the collection inside it.
    database = client[config['DB_NAME']]
    self.db_collection = database[db_collection]
    self.start_page = int(page_start)
|
14 |
def load_docs_split(self):
|
15 |
if str(self.file_type).lower() == 'pdf':
|
16 |
loader = PyPDFLoader(self.file_path)
|
|
|
37 |
else:
|
38 |
print('documents is not embedded')
|
39 |
return 'Some issues'
|
40 |
+
def add_collection_database(self,doc):
|
|
|
41 |
self.db_collection.insert_one(
|
42 |
{
|
43 |
'text': doc.page_content,
|
|
|
49 |
def embedding_with_loop(self, page_end=20):
    """Insert a slice of the loaded documents into the collection.

    Loads the documents with ``self.load_docs_split()`` and hands every
    item of ``docs[self.start_page:page_end]`` to
    ``self.add_collection_database``, showing a tqdm progress bar.

    Args:
        page_end: Exclusive end index of the slice to insert. Defaults to
            20, the bound that used to be hard-coded here, so existing
            callers behave as before. Pass ``None`` to process everything
            from ``self.start_page`` onwards.

    Raises:
        Exception: If ``load_docs_split()`` returns a falsy value.
    """
    docs = self.load_docs_split()
    if not docs:
        raise Exception('Some issue with it')

    # NOTE(review): the indices address whatever items load_docs_split()
    # returns (pages or split chunks) -- confirm against that method.
    # An empty slice (e.g. start_page >= page_end) inserts nothing and
    # still reports 'Done', matching the previous behaviour.
    for doc in tqdm(docs[self.start_page:page_end]):
        self.add_collection_database(doc)
    print('Done')
|
57 |
|