import os
import pickle

import faiss
import numpy as np
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


# Function to load documents
def load_documents(directory):
    """Read every .html file in the directory and extract its plain text."""
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".html"):
            file_path = os.path.join(directory, filename)
            with open(file_path, "r", encoding="latin-1") as f:
                soup = BeautifulSoup(f, "html.parser")
                text = soup.get_text(separator=" ", strip=True)
                documents.append(text)
    return documents


# Load and split documents
print("Loading and splitting documents...")
documents = load_documents("./legislation")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = []
for doc in documents:
    chunks.extend(text_splitter.split_text(doc))

# Create embeddings and FAISS index
print("Generating embeddings...")
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
embeddings = embedding_model.encode(chunks, show_progress_bar=True)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # exact nearest-neighbour search on L2 distance
index.add(np.array(embeddings))

# Save chunks and index so retrieval can run without re-embedding
print("Saving precomputed data...")
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)
faiss.write_index(index, "index.faiss")

print("Preprocessing complete!")
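
# --- Retrieval sketch (illustrative) ---
# A minimal sketch of the query side that consumes the files saved above,
# assuming the same embedding model is used at query time. The query string
# and k value below are hypothetical placeholders, not part of the original
# pipeline.
with open("chunks.pkl", "rb") as f:
    loaded_chunks = pickle.load(f)
loaded_index = faiss.read_index("index.faiss")

query = "example legal question"  # hypothetical placeholder query
query_embedding = embedding_model.encode([query])  # shape: (1, dimension)

# FAISS returns the k nearest chunks by L2 distance (smaller = closer).
distances, indices = loaded_index.search(np.array(query_embedding), 3)
for dist, idx in zip(distances[0], indices[0]):
    print(f"distance={dist:.4f}  chunk preview: {loaded_chunks[idx][:100]}")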