# NOTE(review): removed non-Python page-scrape artifacts ("Spaces:", "Runtime error" x2)
# that preceded the module and would break parsing.
"""Embedding pipeline helpers: split documents, embed them, and persist to Chroma."""

import logging
import os
from typing import List

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app.settings import Config

# Project configuration: the API key selects the embedding backend (see
# initialize_embedding_model); directory/collection locate the Chroma store.
conf = Config()
OPENAI_API_KEY = conf.API_KEY
PERSIST_DIRECTORY = conf.PERSIST_DIRECTORY
COLLECTION_NAME = conf.COLLECTION_NAME

# Module-level logger, named after the module (standard logging convention).
# (The original imported logging twice; the duplicate import was dropped.)
logger = logging.getLogger(__name__)
def initialize_embedding_model():
    """Return an embedding model, preferring OpenAI when an API key is configured.

    Falls back to a local Hugging Face model whose name and kwargs come from
    ``conf`` when ``OPENAI_API_KEY`` is empty/unset.

    Returns:
        A LangChain embeddings object (OpenAIEmbeddings or HuggingFaceEmbeddings).

    Raises:
        Exception: re-raised after logging if model construction fails.
    """
    try:
        if OPENAI_API_KEY:
            logger.info("Using OpenAI embedding model.")
            embedding_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
        else:
            # Plain string: the original used an f-string with no placeholders.
            logger.info("Using Hugging Face embedding model.")
            embedding_model = HuggingFaceEmbeddings(
                model_name=conf.MODEL_NAME,
                model_kwargs=conf.MODEL_KWARGS,
                encode_kwargs=conf.ENCODE_KWARGS,
            )
        return embedding_model
    except Exception as e:
        # Lazy %-formatting so interpolation only happens when the record is emitted.
        logger.error("Error initializing embedding model: %s", e)
        raise
def split_text(documents: List) -> List:
    """Split loaded documents into overlapping chunks for embedding.

    Note: despite the original ``List[str]`` annotation, this calls
    ``split_documents``, which operates on LangChain ``Document`` objects,
    not raw strings — annotations relaxed to ``List`` accordingly.

    Args:
        documents: LangChain Document objects to split.

    Returns:
        The resulting list of Document chunks (size/overlap from ``conf``).

    Raises:
        Exception: re-raised after logging if splitting fails.
    """
    try:
        logger.info("Splitting documents into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=conf.CHUNK_SIZE,
            chunk_overlap=conf.CHUNK_OVERLAP,
        )
        chunks = text_splitter.split_documents(documents)
        logger.info("Document splitting completed.")
        return chunks
    except Exception as e:
        logger.error("Error splitting text: %s", e)
        raise
def get_chroma_client(collection_name: str, embedding_function, persist_directory: str):
    """Initialize and return a Chroma vector store bound to one collection.

    Args:
        collection_name: Name of the Chroma collection to open/create.
        embedding_function: LangChain embeddings object used to embed queries/documents.
        persist_directory: Filesystem path where Chroma persists the collection.

    Returns:
        A ``Chroma`` instance for the given collection.

    Raises:
        Exception: re-raised after logging if client construction fails.
    """
    try:
        # Lazy %-args avoid f-string interpolation when INFO is disabled.
        logger.info("Creating Chroma client for collection: %s", collection_name)
        return Chroma(
            collection_name=collection_name,
            embedding_function=embedding_function,
            persist_directory=persist_directory,
        )
    except Exception as e:
        logger.error("Error creating Chroma client: %s", e)
        raise
def create_and_store_embeddings(chunks: List, collection_name: str, embedding_function, persist_directory: str):
    """Embed document chunks and persist them into a Chroma collection.

    Args:
        chunks: Document chunks (LangChain Documents — ``add_documents`` is
            called on them, so the original ``List[str]`` annotation was relaxed).
        collection_name: Target Chroma collection name.
        embedding_function: LangChain embeddings object used for the vectors.
        persist_directory: Filesystem path of the persisted store.

    Raises:
        Exception: re-raised after logging if storage fails.
    """
    try:
        vector_db = get_chroma_client(collection_name, embedding_function, persist_directory)
        vector_db.add_documents(chunks)
        logger.info(
            "Embeddings created for collection %s and saved to %s.",
            collection_name,
            persist_directory,
        )
    except Exception as e:
        # BUG FIX: the original passed a plain string containing the literal
        # text "{e}" (missing f-prefix), so the actual error was never logged.
        logger.error("Error creating and storing embeddings: %s", e)
        raise
# NOTE(review): example entry point kept commented out. ``DocumentLoader`` is
# not imported anywhere in this file — confirm its module before enabling.
# def main():
#     source_directory = conf.DATA_DIRECTORY
#     document_loader = DocumentLoader(source_directory)
#     try:
#         documents = document_loader.load_all_documents()
#         logger.info(f"Loaded {len(documents)} documents.")
#     except Exception as e:
#         logger.error(f"Error loading documents: {e}")
#         return
#     # Split documents into chunks
#     try:
#         chunks = split_text(documents)
#         logger.info(f"Processed {len(chunks)} chunks for embedding.")
#     except Exception as e:
#         logger.error(f"Error processing documents: {e}")
#         return
#     # Initialize embedding model
#     try:
#         embedding_function = initialize_embedding_model()
#     except Exception:
#         return  # Stop execution if embedding model fails
#     # Create and store embeddings
#     create_and_store_embeddings(chunks, COLLECTION_NAME, embedding_function, PERSIST_DIRECTORY)
# if __name__ == "__main__":
#     main()