File size: 3,784 Bytes
6dd04d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6cfeff
 
 
 
 
 
 
 
 
 
 
 
 
6dd04d7
 
 
 
 
 
 
 
 
c6cfeff
 
 
 
 
 
 
 
 
 
 
 
 
 
6dd04d7
 
 
 
c6cfeff
 
 
 
 
6dd04d7
 
c6cfeff
 
6dd04d7
 
 
 
 
 
 
 
 
 
 
c6cfeff
 
 
 
 
 
 
6dd04d7
 
c6cfeff
 
 
 
 
 
 
 
 
 
6dd04d7
 
 
 
 
 
 
 
 
c6cfeff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import ray
import logging
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from faiss import IndexFlatL2  # Assuming using L2 distance for simplicity

# Start the Ray runtime (required before any Ray-backed work).
ray.init()

# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load every .txt file from the local 'data' directory.
logging.info("Loading documents...")
documents = DirectoryLoader('data', glob="./*.txt").load()

# Extract text from documents and split into manageable texts with logging
logging.info("Extracting and splitting texts from documents...")

# BUG FIX: the active code previously used `text_splitter` without ever
# defining it (the definition lived only in a commented-out draft), which
# raised NameError at runtime. Define the splitter before the loop.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)

texts = []
for document in documents:
    # LangChain Document objects expose their content as `.page_content`;
    # the old `get_text()` probe never matched, so every document collapsed
    # to "". Check `.page_content` first, keep `get_text()` as a fallback
    # for loader implementations that provide it.
    if hasattr(document, 'page_content'):
        text_content = document.page_content
    elif hasattr(document, 'get_text'):
        text_content = document.get_text()
    else:
        text_content = ""  # Default to empty string if no text method is available

    # Check if text_content is valid before splitting
    if text_content and isinstance(text_content, str):
        texts.extend(text_splitter.split_text(text_content))
    else:
        logging.warning(f"Invalid document or empty content encountered: {document}")




# Cache the embedding model at module level: constructing
# HuggingFaceEmbeddings loads model weights, which is far too expensive
# to repeat on every call (the old code rebuilt it per invocation).
_embeddings_model = None


def embedding_function(text):
    """Return the embedding vector for ``text`` using law-ai/InLegalBERT.

    Args:
        text: Non-empty string to embed.

    Returns:
        The embedding produced by ``HuggingFaceEmbeddings.embed_query``.

    Raises:
        ValueError: If ``text`` is empty or not a string.
    """
    # Validate the input BEFORE touching the (expensive) model.
    if not text or not isinstance(text, str):
        raise ValueError(f"Invalid text for embedding: {text}")

    global _embeddings_model
    if _embeddings_model is None:
        _embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
    return _embeddings_model.embed_query(text)


# Build and populate the FAISS vector store.
#
# BUG FIX: the previous code hand-wired an IndexFlatL2 with a plain dict
# as the docstore and then called `faiss_db.add_documents([embedding])`
# with a raw embedding vector. LangChain's `FAISS.add_documents` expects
# `Document` objects and a `Docstore` instance (a plain dict lacks the
# required interface), so that path fails at runtime. `FAISS.from_texts`
# builds the index, docstore, and id mapping correctly and embeds each
# text itself.
logging.info("Storing embeddings in FAISS...")

# Filter invalid entries up front so one bad chunk cannot abort the run,
# preserving the old per-item warning behavior.
valid_texts = []
for i, text in enumerate(texts):
    if text and isinstance(text, str):
        valid_texts.append(text)
    else:
        logging.warning(f"Skipping invalid or empty text at index {i}.")

if not valid_texts:
    # from_texts cannot build an index from nothing; fail loudly rather
    # than saving an empty store later.
    raise ValueError("No valid texts available to build the FAISS index.")

embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
faiss_db = FAISS.from_texts(valid_texts, embeddings_model)



# Persist the populated vector store to disk for later reuse.
logging.info("Exporting the vector embeddings database...")
faiss_db.save_local("ipc_embed_db")

# Final status message for the pipeline.
logging.info("Process completed successfully.")

# Release Ray resources now that the pipeline has finished.
ray.shutdown()