import ray
import logging
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from faiss import IndexFlatL2  # L2 (Euclidean) distance index, kept for simplicity
from langchain_community.docstore.in_memory import InMemoryDocstore
# Initialize Ray (this script runs serially; a parallel variant is sketched below)
ray.init()
# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Load documents with logging
logging.info("Loading documents...")
loader = DirectoryLoader('data', glob="./*.txt")
documents = loader.load()
# Extract text from documents and split into manageable chunks, with logging
logging.info("Extracting and splitting texts from documents...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = []
for document in documents:
    # LangChain loaders return Document objects; the text lives in .page_content
    text_content = getattr(document, "page_content", "")
    # Only split non-empty string content
    if text_content and isinstance(text_content, str):
        texts.extend(text_splitter.split_text(text_content))
    else:
        logging.warning(f"Invalid document or empty content encountered: {document}")
# Define the embedding function; instantiate the model once instead of on every call
embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")

def embedding_function(text):
    # Ensure the input is a non-empty string before embedding
    if not text or not isinstance(text, str):
        raise ValueError(f"Invalid text for embedding: {text}")
    return embeddings_model.embed_query(text)
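# Quick sanity check (assumption: InLegalBERT is BERT-base, so its embeddings
# should be 768-dimensional, matching the IndexFlatL2(768) created below):
# assert len(embedding_function("sample text")) == 768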
# Create a FAISS index for the embeddings
index = IndexFlatL2(768)  # InLegalBERT (BERT-base) produces 768-dimensional vectors

# Start from an empty in-memory docstore and id mapping; the LangChain FAISS
# wrapper fills both as texts are added, keeping the index and docstore in sync
docstore = InMemoryDocstore({})
index_to_docstore_id = {}

# Initialize the LangChain FAISS vector store
faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
# Store embeddings in FAISS
logging.info("Storing embeddings in FAISS...")
for i, text in enumerate(texts):
    try:
        if text:  # Skip None or empty chunks
            # add_texts embeds the chunk via embedding_function and stores
            # both the vector and the original text
            faiss_db.add_texts([text])
        else:
            logging.warning(f"Skipping invalid or empty text at index {i}.")
    except Exception as e:
        logging.error(f"Error while processing text at index {i}: {text}, Error: {e}")
# Export the vector embeddings database, with logging
logging.info("Exporting the vector embeddings database...")
faiss_db.save_local("ipc_embed_db")
# Log a message to indicate the completion of the process
logging.info("Process completed successfully.")
# Shut down Ray after the process
ray.shutdown()
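# Minimal usage sketch: reload the saved index and query it. The
# allow_dangerous_deserialization flag is required by recent langchain_community
# releases when loading a locally pickled docstore; the query is illustrative.
#
# db = FAISS.load_local("ipc_embed_db", embeddings_model,
#                       allow_dangerous_deserialization=True)
# for doc in db.similarity_search("punishment for theft", k=3):
#     print(doc.page_content[:200])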