# NOTE(review): the following header lines ("Spaces:", "Runtime error") were
# page-scrape residue from the hosting platform, not Python — kept as a comment.
import logging
import os

from langchain.docstore.document import Document
from langchain_community.document_loaders import (
    Docx2txtLoader,
    PDFMinerLoader,
    TextLoader,
)
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader

# Number of worker threads available for ingestion (falls back to 8 when
# os.cpu_count() returns None, e.g. in restricted containers).
INGEST_THREADS = os.cpu_count() or 8

# Maps a lowercase file extension to the LangChain loader class used for it.
# BUG FIX: the original dict listed ".csv" twice (CSVLoader, then
# UnstructuredCSVLoader); the second key silently overwrote the first, so
# UnstructuredCSVLoader was the one actually in effect. The duplicate entry is
# removed, keeping the effective behavior. CSVLoader remains imported so it can
# be swapped back in deliberately if the `unstructured` dependency is unwanted.
DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".md": TextLoader,
    ".pdf": PDFMinerLoader,
    ".csv": UnstructuredCSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".docx": Docx2txtLoader,
    # Add additional file types here if necessary
}

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
class DocumentLoader:
    """Loads documents from a directory tree using extension-specific loaders.

    Supported extensions and their loader classes come from the module-level
    DOCUMENT_MAP; files with unmapped extensions are skipped during bulk loads.
    """

    def __init__(self, source_dir: str):
        """
        Initializes the loader with the directory path from which to load documents.

        Args:
            source_dir (str): Root directory to scan for documents
                (subdirectories are included).
        """
        self.source_dir = source_dir
        logger.info(f"DocumentLoader initialized with source directory: {self.source_dir}")

    def load_single_document(self, file_path: str) -> list[Document]:
        """
        Loads a single document based on its file extension using the appropriate loader.

        Args:
            file_path (str): Path to the document file.

        Returns:
            List[Document]: Loaded document(s) as LangChain Document instances.

        Raises:
            ValueError: If the file extension is not listed in DOCUMENT_MAP.
            Exception: Re-raises any error produced by the underlying loader.
        """
        file_extension = os.path.splitext(file_path)[1]
        loader_class = DOCUMENT_MAP.get(file_extension)
        # Guard clause: fail fast on unsupported types before constructing a loader.
        if loader_class is None:
            logger.warning(f"Unsupported document type for file: {file_path}")
            raise ValueError(f"Unsupported document type: {file_extension}")
        loader = loader_class(file_path)
        logger.info(f"Loading document: {file_path}")
        try:
            # Only loader.load() can raise here; keep the try body minimal.
            documents = loader.load()
        except Exception as e:
            logger.error(f"Error loading document {file_path}: {e}", exc_info=True)
            raise
        logger.info(f"Successfully loaded document: {file_path}")
        return documents

    def load_all_documents(self) -> list[Document]:
        """
        Loads all documents from the source directory, including documents in subdirectories.

        Files that fail to load are logged and skipped rather than aborting the
        whole run (best-effort bulk ingestion).

        Returns:
            List[Document]: List of all loaded documents from the source directory.
        """
        paths = self._gather_file_paths()  # Gather file paths of documents to load
        all_docs: list[Document] = []
        logger.info(f"Loading all documents from directory: {self.source_dir}")
        for file_path in paths:
            try:
                documents = self.load_single_document(file_path)
                all_docs.extend(documents)  # Append loaded documents to the result list
            except ValueError as e:
                # Unsupported extension slipped through — skip, don't abort.
                logger.error(f"Skipping file {file_path}: {e}")
            except Exception as e:
                logger.error(f"An unexpected error occurred while loading {file_path}: {e}", exc_info=True)
        logger.info(f"Finished loading documents. Total documents loaded: {len(all_docs)}")
        return all_docs

    def _gather_file_paths(self) -> list[str]:
        """
        Walks through the source directory and gathers file paths of documents
        that match the supported file types in DOCUMENT_MAP.

        Returns:
            List[str]: List of file paths for documents to load.
        """
        file_paths: list[str] = []
        logger.debug(f"Scanning for files in directory: {self.source_dir}")
        for root, _, files in os.walk(self.source_dir):
            for file_name in files:
                file_extension = os.path.splitext(file_name)[1]
                if file_extension in DOCUMENT_MAP:
                    full_path = os.path.join(root, file_name)
                    file_paths.append(full_path)
                    logger.debug(f"Found document: {full_path}")
        logger.info(f"Total files found for loading: {len(file_paths)}")
        return file_paths
# if __name__ == "__main__":
#     source_directory = os.path.join(os.path.dirname(__file__), '..', 'Data')
#     document_loader = DocumentLoader(source_directory)
#     documents = document_loader.load_all_documents()
#     from langchain_community.embeddings import OpenAIEmbeddings
#     from langchain_community.vectorstores import FAISS
#     directory_path = os.path.join(os.path.dirname(__file__), '..', 'Data')
#     documents = load_documents(directory_path)
#     print(documents)
#     print(os.path.join(os.path.dirname(__file__), '..', 'Data'))