Spaces:

ohalkhateeb
/

Test3

Sleeping

App Files Files Community

ohalkhateeb committed 15 days ago

Commit

903802f

verified ·

1 Parent(s): bf19ec0

Create preprocess.py

Browse files

Files changed (1) hide show

preprocess.py +43 -0

preprocess.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import os
+from bs4 import BeautifulSoup
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+import pickle
def load_documents(directory):
    """Load the visible text of every ``.html`` file in *directory*.

    Files are opened in binary mode so that BeautifulSoup can work out each
    file's encoding on its own (byte-order mark, ``<meta charset>``
    declaration, or content sniffing). Decoding everything as latin-1 never
    raises, but it silently turns UTF-8 or other non-Latin text into
    mojibake, which then pollutes every downstream chunk and embedding.

    Args:
        directory: Path of the folder to scan. Sub-folders are not visited.

    Returns:
        A list with one string per HTML file: the tag-stripped text with
        fragments joined by single spaces. Files are processed in sorted
        filename order so repeated runs produce the same document order.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be opened.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        file_path = os.path.join(directory, filename)
        # The parser consumes the whole handle inside the constructor, so the
        # file can be closed before the text is extracted.
        with open(file_path, "rb") as f:
            soup = BeautifulSoup(f, "html.parser")
        documents.append(soup.get_text(separator=" ", strip=True))
    return documents
# Preprocessing pipeline: read the HTML corpus, cut it into overlapping text
# chunks, embed every chunk, and persist both the chunks and a FAISS index.

# Load and split documents
print("Loading and splitting documents...")
documents = load_documents("./legislation")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = []
for doc in documents:
    chunks.extend(text_splitter.split_text(doc))

# Without chunks there is nothing to embed; stop here with a clear message
# instead of failing later when the embedding dimension is read.
if not chunks:
    raise RuntimeError(
        "No text chunks were produced: './legislation' contains no .html "
        "files with extractable text."
    )

# Create embeddings and FAISS index
print("Generating embeddings...")
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
# FAISS expects a C-contiguous float32 matrix; coerce explicitly rather than
# relying on whatever dtype/layout the encoder happens to return.
embeddings = np.ascontiguousarray(
    embedding_model.encode(chunks, show_progress_bar=True),
    dtype=np.float32,
)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# Vectors are added in the same order as `chunks`, so row i of the index
# corresponds to chunks[i] in the pickle written below.
index.add(embeddings)

# Save chunks and index
print("Saving precomputed data...")
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)
faiss.write_index(index, "index.faiss")
print("Preprocessing complete!")