import os
import pickle

import faiss
import numpy as np
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


def preprocess(legislation_dir="./legislation"):
    chunks_file = "chunks.pkl"
    index_file = "index.faiss"

    # Check if precomputed files already exist
    if os.path.exists(chunks_file) and os.path.exists(index_file):
        print("Precomputed files found. Skipping preprocessing.")
        return

    print("Precomputed files not found. Running preprocessing...")

    # Load documents
    def load_documents(directory):
        documents = []
        if not os.path.exists(directory):
            raise FileNotFoundError(
                f"Directory '{directory}' not found. Please upload legislation files."
            )
        for filename in os.listdir(directory):
            if filename.endswith(".html"):
                file_path = os.path.join(directory, filename)
                with open(file_path, "r", encoding="utf-8") as f:
                    soup = BeautifulSoup(f, "html.parser")
                    text = soup.get_text(separator=" ", strip=True)
                    documents.append(text)
        return documents

    documents = load_documents(legislation_dir)

    # Split texts
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = []
    for doc in documents:
        chunks.extend(text_splitter.split_text(doc))

    # Create embeddings and FAISS index
    print("Generating embeddings...")
    embedding_model = SentenceTransformer(
        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))

    # Save precomputed data
    print("Saving precomputed data...")
    with open(chunks_file, "wb") as f:
        pickle.dump(chunks, f)
    faiss.write_index(index, index_file)

    print("Preprocessing complete!")


if __name__ == "__main__":
    preprocess()