import glob
import os

import pandas as pd
from langchain_chroma import Chroma
from langchain_core.documents import Document  # current home of Document (langchain.docstore.document is the legacy path)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from PyPDF2 import PdfReader  # Ensure PyPDF2 is installed (the pypdf package is its maintained successor)
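# Likely install set for the imports above (an assumption, not verified against the
# original environment; pypdf can replace PyPDF2 via `from pypdf import PdfReader`):
#   pip install langchain-text-splitters langchain-huggingface langchain-chroma langchain-core pandas PyPDF2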

# Define a function to process CSV files
def process_csv_files(csv_files):
    documents = []
    for file_path in csv_files:
        df = pd.read_csv(file_path)
        for _, row in df.iterrows():
            # Combine every column value in the row into one space-separated string
            row_content = " ".join(row.astype(str))
            documents.append(Document(page_content=row_content))
    return documents
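
# For example (hypothetical data): a CSV row with columns name="Widget" and
# price=9.99 yields a Document whose page_content is "Widget 9.99".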

# Define a function to process PDF files
def process_pdf_files(pdf_files):
    documents = []
    for file_path in pdf_files:
        reader = PdfReader(file_path)
        # One Document per PDF page; skip pages with no extractable text
        for page in reader.pages:
            text = page.extract_text()
            if text:
                documents.append(Document(page_content=text))
    return documents
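
# Note: extract_text() returns an empty string for pages without a text layer
# (e.g. scanned/image-only PDFs), so such pages are skipped rather than OCRed.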

# Define a function to perform vectorization for CSV and PDF files
def vectorize_documents():
    # Uses the shared module-level `embeddings` model (defined once, below)

    # Directory containing files
    data_directory = "Data"  # Replace with your folder name
    csv_files = glob.glob(os.path.join(data_directory, "*.csv"))
    pdf_files = glob.glob(os.path.join(data_directory, "*.pdf"))

    # Process CSV and PDF files
    documents = process_csv_files(csv_files) + process_pdf_files(pdf_files)

    # Split the documents into overlapping chunks. Note that CharacterTextSplitter
    # splits on "\n\n" by default, so text with no such separator may produce
    # chunks longer than chunk_size (LangChain logs a warning when that happens).
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )

    text_chunks = text_splitter.split_documents(documents)

    # Open (or create) the persistent Chroma store once, then insert in batches:
    # Chroma's insert batch size limit is 5461, so use a slightly smaller size for safety
    vectordb = Chroma(
        embedding_function=embeddings,
        persist_directory="vector_db_dir"
    )
    batch_size = 5000
    for i in range(0, len(text_chunks), batch_size):
        vectordb.add_documents(text_chunks[i:i + batch_size])

    print("Documents Vectorized and saved in VectorDB")

# Shared embedding model: loaded once here, used by vectorize_documents() above,
# and importable by other modules that need the same embeddings
embeddings = HuggingFaceEmbeddings()
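
# A minimal sketch of how a downstream consumer might query the persisted store.
# It assumes the same embedding model and "vector_db_dir" directory used above;
# the function name and the example query are illustrative, not from any library.
def query_vector_db(query, k=3):
    vectordb = Chroma(
        embedding_function=embeddings,
        persist_directory="vector_db_dir"
    )
    return vectordb.similarity_search(query, k=k)

# Example usage (hypothetical query):
#   for doc in query_vector_db("summarize the pricing data"):
#       print(doc.page_content[:200])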

# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()