ohalkhateeb committed
Commit 903802f · verified · 1 Parent(s): bf19ec0

Create preprocess.py

Files changed (1)
  1. preprocess.py +43 -0
preprocess.py ADDED
@@ -0,0 +1,43 @@
+ import os
+ from bs4 import BeautifulSoup
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ import pickle
+
+ # Function to load documents
+ def load_documents(directory):
+     documents = []
+     for filename in os.listdir(directory):
+         if filename.endswith(".html"):
+             file_path = os.path.join(directory, filename)
+             with open(file_path, "r", encoding="latin-1") as f:
+                 soup = BeautifulSoup(f, "html.parser")
+                 text = soup.get_text(separator=" ", strip=True)
+                 documents.append(text)
+     return documents
+
+ # Load and split documents
+ print("Loading and splitting documents...")
+ documents = load_documents("./legislation")
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ chunks = []
+ for doc in documents:
+     chunks.extend(text_splitter.split_text(doc))
+
+ # Create embeddings and FAISS index
+ print("Generating embeddings...")
+ embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+ embeddings = embedding_model.encode(chunks, show_progress_bar=True)
+ dimension = embeddings.shape[1]
+ index = faiss.IndexFlatL2(dimension)
+ index.add(np.array(embeddings))
+
+ # Save chunks and index
+ print("Saving precomputed data...")
+ with open("chunks.pkl", "wb") as f:
+     pickle.dump(chunks, f)
+ faiss.write_index(index, "index.faiss")
+
+ print("Preprocessing complete!")