Shiv22419 committed
Commit c6cfeff · verified · 1 Parent(s): 470d90e

Update Ingest.py

Files changed (1)
  1. Ingest.py +52 -5
Ingest.py CHANGED
@@ -17,9 +17,21 @@ logging.info("Loading documents...")
 loader = DirectoryLoader('data', glob="./*.txt")
 documents = loader.load()

+# Extract text from documents and split into manageable texts with logging
+#logging.info("Extracting and splitting texts from documents...")
+#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
+#texts = []
+#for document in documents:
+#    if hasattr(document, 'get_text'):
+#        text_content = document.get_text()  # Adjust according to actual method
+#    else:
+#        text_content = ""  # Default to empty string if no text method is available
+#
+#    texts.extend(text_splitter.split_text(text_content))
+
+
 # Extract text from documents and split into manageable texts with logging
 logging.info("Extracting and splitting texts from documents...")
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
 texts = []
 for document in documents:
     if hasattr(document, 'get_text'):
@@ -27,13 +39,33 @@ for document in documents:
     else:
         text_content = ""  # Default to empty string if no text method is available

-    texts.extend(text_splitter.split_text(text_content))
+    # Check if text_content is valid before splitting
+    if text_content and isinstance(text_content, str):
+        valid_chunks = text_splitter.split_text(text_content)
+        texts.extend(valid_chunks)
+    else:
+        logging.warning(f"Invalid document or empty content encountered: {document}")
+
+
+
+
+# Define embedding function
+#def embedding_function(text):
+#    embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
+#    return embeddings_model.embed_query(text)

 # Define embedding function
 def embedding_function(text):
     embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
+
+    # Ensure input is valid
+    if not text or not isinstance(text, str):
+        raise ValueError(f"Invalid text for embedding: {text}")
+
     return embeddings_model.embed_query(text)

+
+
 # Create FAISS index for embeddings
 index = IndexFlatL2(768)  # Dimension of embeddings, adjust as needed

@@ -45,10 +77,25 @@ index_to_docstore_id = {i: i for i in range(len(texts))}
 faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)

 # Process and store embeddings
+#logging.info("Storing embeddings in FAISS...")
+#for i, text in enumerate(texts):
+#    embedding = embedding_function(text)
+#    faiss_db.add_documents([embedding])
+
+
+# Store embeddings in FAISS
 logging.info("Storing embeddings in FAISS...")
 for i, text in enumerate(texts):
-    embedding = embedding_function(text)
-    faiss_db.add_documents([embedding])
+    try:
+        if text:  # Check that the text is not None or empty
+            embedding = embedding_function(text)
+            faiss_db.add_documents([embedding])
+        else:
+            logging.warning(f"Skipping invalid or empty text at index {i}.")
+    except Exception as e:
+        logging.error(f"Error while processing text at index {i}: {text}, Error: {e}")
+
+

 # Exporting the vector embeddings database with logging
 logging.info("Exporting the vector embeddings database...")
@@ -58,4 +105,4 @@ faiss_db.save_local("ipc_embed_db")
 logging.info("Process completed successfully.")

 # Shutdown Ray after the process
-ray.shutdown()
+ray.shutdown()
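Note on the updated file: the active splitting loop still calls text_splitter even though its definition now only appears in the commented-out block above it, and the storage loop passes raw embedding vectors to faiss_db.add_documents, while LangChain's FAISS wrapper expects Document objects there rather than vectors. A minimal sketch of the same ingestion flow that keeps the splitter defined and lets the vector store compute embeddings itself is shown below. Only the InLegalBERT model name, the data folder, and the ipc_embed_db output path come from the script; import paths vary across LangChain versions, and everything else is an illustrative assumption, not part of this commit.

    # Minimal sketch (not part of the commit): split the .txt files and let
    # FAISS.from_texts handle embedding, index, and docstore construction.
    import logging

    from langchain_community.document_loaders import DirectoryLoader
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    logging.basicConfig(level=logging.INFO)

    logging.info("Loading documents...")
    documents = DirectoryLoader("data", glob="./*.txt").load()

    logging.info("Extracting and splitting texts from documents...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    texts = [chunk
             for doc in documents
             for chunk in text_splitter.split_text(doc.page_content)  # loaders expose text as page_content
             if chunk]

    logging.info("Storing embeddings in FAISS...")
    embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
    faiss_db = FAISS.from_texts(texts, embeddings)

    logging.info("Exporting the vector embeddings database...")
    faiss_db.save_local("ipc_embed_db")
    logging.info("Process completed successfully.")

In this form FAISS.from_texts builds the flat L2 index, the in-memory docstore, and the index_to_docstore_id mapping internally, so the manual bookkeeping done in the script is not needed.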