File size: 4,981 Bytes
abb6f94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
from langchain_community.document_loaders import PDFMinerLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain.docstore.document import Document

import logging

# Worker count for potential parallel ingestion; falls back to 8 when
# os.cpu_count() returns None (e.g. on some restricted platforms).
INGEST_THREADS = os.cpu_count() or 8

# Maps a lowercase file extension to the LangChain loader class used for it.
# BUG FIX: the original dict listed ".csv" twice (CSVLoader, then
# UnstructuredCSVLoader). Duplicate dict keys are silently collapsed in
# Python, so UnstructuredCSVLoader was the loader actually in effect;
# it is kept here explicitly and the dead first entry removed.
DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".md": TextLoader,
    ".pdf": PDFMinerLoader,
    ".csv": UnstructuredCSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".docx": Docx2txtLoader,
    # Add additional file types here if necessary
}

# Module-level logger per stdlib convention (named after this module).
logger = logging.getLogger(__name__)

class DocumentLoader:
    """Loads documents from a directory tree using extension-specific loaders.

    Supported file types are those registered in ``DOCUMENT_MAP``; files with
    any other extension are skipped (and logged) rather than loaded.
    """

    def __init__(self, source_dir: str) -> None:
        """
        Initializes the loader with the directory path from which to load documents.

        Args:
            source_dir (str): Root directory to scan (recursively) for documents.
        """
        self.source_dir = source_dir
        logger.info(f"DocumentLoader initialized with source directory: {self.source_dir}")

    def load_single_document(self, file_path: str) -> list[Document]:
        """
        Loads a single document based on its file extension using the appropriate loader.

        Args:
            file_path (str): Path to the document file.

        Returns:
            List[Document]: Loaded document(s) as LangChain Document instances.

        Raises:
            ValueError: If the file's extension has no loader in DOCUMENT_MAP.
            Exception: Re-raises whatever the underlying loader raises on failure.
        """
        file_extension = os.path.splitext(file_path)[1]
        loader_class = DOCUMENT_MAP.get(file_extension)

        if loader_class is None:
            logger.warning(f"Unsupported document type for file: {file_path}")
            raise ValueError(f"Unsupported document type: {file_extension}")

        loader = loader_class(file_path)
        logger.info(f"Loading document: {file_path}")
        try:
            documents = loader.load()
        except Exception as e:
            logger.error(f"Error loading document {file_path}: {e}", exc_info=True)
            raise
        logger.info(f"Successfully loaded document: {file_path}")
        return documents

    def load_all_documents(self) -> list[Document]:
        """
        Loads all documents from the source directory, including documents in subdirectories.

        Files that fail to load (unsupported type or loader error) are logged
        and skipped so one bad file does not abort the whole ingestion run.

        Returns:
            List[Document]: List of all loaded documents from the source directory.
        """
        paths = self._gather_file_paths()  # Gather file paths of documents to load
        all_docs: list[Document] = []

        logger.info(f"Loading all documents from directory: {self.source_dir}")

        for file_path in paths:
            try:
                documents = self.load_single_document(file_path)
                all_docs.extend(documents)  # Append loaded documents to the result list
            except ValueError as e:
                # Unsupported extension — skip, keep going.
                logger.error(f"Skipping file {file_path}: {e}")
            except Exception as e:
                # Loader failure — best-effort ingestion: log and continue.
                logger.error(f"An unexpected error occurred while loading {file_path}: {e}", exc_info=True)

        logger.info(f"Finished loading documents. Total documents loaded: {len(all_docs)}")
        return all_docs

    def _gather_file_paths(self) -> list[str]:
        """
        Walks through the source directory and gathers file paths of documents
        that match the supported file types in DOCUMENT_MAP.

        Returns:
            List[str]: List of file paths for documents to load.
        """
        file_paths: list[str] = []
        logger.debug(f"Scanning for files in directory: {self.source_dir}")
        for root, _, files in os.walk(self.source_dir):
            for file_name in files:
                file_extension = os.path.splitext(file_name)[1]
                if file_extension in DOCUMENT_MAP:
                    full_path = os.path.join(root, file_name)
                    file_paths.append(full_path)
                    logger.debug(f"Found document: {full_path}")

        logger.info(f"Total files found for loading: {len(file_paths)}")
        return file_paths



# if __name__ == "__main__":
#     source_directory = os.path.join(os.path.dirname(__file__),'..','Data')
#     document_loader = DocumentLoader(source_directory)
    
#     documents = document_loader.load_all_documents()






















# from langchain_community.embeddings import OpenAIEmbeddings
# from langchain_community.vectorstores import FAISS
# directory_path = os.path.join(os.path.dirname(__file__),'..','Data')
# documents = load_documents(directory_path)
# print(documents)

# print(os.path.join(os.path.dirname(__file__),'..','Data'))