from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document  # canonical home; langchain.docstore.document is a legacy alias
import pandas as pd
import os
import glob
from PyPDF2 import PdfReader  # Ensure PyPDF2 is installed (its maintained successor is pypdf)
# Define a function to process CSV files: every row becomes one Document
def process_csv_files(csv_files):
    documents = []
    for file_path in csv_files:
        df = pd.read_csv(file_path)
        for _, row in df.iterrows():
            # Combine all column values in the row into a single string
            row_content = " ".join(row.astype(str))
            documents.append(Document(page_content=row_content))
    return documents
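
# For illustration: a CSV row with values {"name": "Alice", "age": 30}
# becomes a Document whose page_content is "Alice 30" — values joined by
# single spaces, column names dropped.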
# Define a function to process PDF files: every page becomes one Document
def process_pdf_files(pdf_files):
    documents = []
    for file_path in pdf_files:
        reader = PdfReader(file_path)
        for page in reader.pages:
            text = page.extract_text()
            if text:  # Only add non-empty text (scanned image pages may yield none)
                documents.append(Document(page_content=text))
    return documents
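
# Optional refinement (a sketch, not used above): attach the source path as
# metadata so retrieved chunks can be traced back to their file, e.g.
#     documents.append(Document(page_content=text, metadata={"source": file_path}))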
# Define a function to perform vectorization for CSV and PDF files
def vectorize_documents():
    embeddings = HuggingFaceEmbeddings()

    # Directory containing the source files
    data_directory = "Data"  # Replace with your folder name
    csv_files = glob.glob(os.path.join(data_directory, "*.csv"))
    pdf_files = glob.glob(os.path.join(data_directory, "*.pdf"))

    # Process CSV and PDF files into Documents
    documents = process_csv_files(csv_files) + process_pdf_files(pdf_files)

    # Split the text and create overlapping chunks of these documents
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )
    text_chunks = text_splitter.split_documents(documents)

    # Create the persistent Chroma store once, then add chunks in batches;
    # Chroma's insert batch limit is 5461, so stay slightly below for safety
    vectordb = Chroma(
        embedding_function=embeddings,
        persist_directory="vector_db_dir"
    )
    batch_size = 5000
    for i in range(0, len(text_chunks), batch_size):
        vectordb.add_documents(text_chunks[i:i + batch_size])

    print("Documents vectorized and saved in the vector DB")
# Expose embeddings for importers of this module
# (note: this loads the embedding model at import time)
embeddings = HuggingFaceEmbeddings()
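
# Sketch of how a retriever elsewhere might reopen the persisted store
# (the function name and query below are illustrative; nothing above
# depends on them). The same embedding model used for indexing must be
# supplied again at query time.
def load_vectordb():
    return Chroma(
        persist_directory="vector_db_dir",
        embedding_function=embeddings,
    )

# Example usage:
#     docs = load_vectordb().similarity_search("your query here", k=4)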
# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()