ohalkhateeb committed
Commit aa8c5f5 · verified · 1 Parent(s): 046c9e2

Rename app.py_ to preprocess.py

Files changed (2):
  1. app.py_ +0 -62
  2. preprocess.py +59 -0
app.py_ DELETED
@@ -1,62 +0,0 @@
- import os
- import gradio as gr
- import faiss
- import numpy as np
- import pickle
- from sentence_transformers import SentenceTransformer
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # Load precomputed chunks and FAISS index
- print("Loading precomputed data...")
- with open("chunks.pkl", "rb") as f:
-     chunks = pickle.load(f)
- index = faiss.read_index("index.faiss")
-
- # Load embedding model (for queries only)
- embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
-
- # Load Jais model and tokenizer
- model_name = "inceptionai/jais-13b"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name)
-
- # RAG function
- def get_response(query, k=3):
-     query_embedding = embedding_model.encode([query])
-     distances, indices = index.search(np.array(query_embedding), k)
-     retrieved_chunks = [chunks[i] for i in indices[0]]
-     context = " ".join(retrieved_chunks)
-     prompt = f"استنادًا إلى الوثائق التالية: {context}، أجب على السؤال: {query}"
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=200,
-         do_sample=True,
-         temperature=0.7,
-         top_p=0.9
-     )
-     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return response.split(query)[-1].strip()
-
- # Gradio interface
- with gr.Blocks(title="Dubai Legislation Chatbot") as demo:
-     gr.Markdown("# Dubai Legislation Chatbot\nاسأل أي سؤال حول تشريعات دبي")
-     chatbot = gr.Chatbot()
-     msg = gr.Textbox(placeholder="اكتب سؤالك هنا...", rtl=True)
-     clear = gr.Button("مسح")
-
-     def user(user_message, history):
-         return "", history + [[user_message, None]]
-
-     def bot(history):
-         user_message = history[-1][0]
-         bot_message = get_response(user_message)
-         history[-1][1] = bot_message
-         return history
-
-     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-         bot, chatbot, chatbot
-     )
-     clear.click(lambda: None, None, chatbot, queue=False)
-
- demo.launch()
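
Note: the deleted app.py_ loaded chunks.pkl and index.faiss at startup and only embedded the incoming query at request time. The retrieval step can be exercised on its own, without loading the 13B Jais model, with a minimal sketch along these lines (file and model names are taken from the code above; the query string is only illustrative):

import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the artifacts produced by preprocess.py
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)
index = faiss.read_index("index.faiss")

# Same embedding model the app used for queries
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

query = "ما هي الجهة المختصة بإصدار التشريعات في دبي؟"  # illustrative query
query_embedding = embedding_model.encode([query])
distances, indices = index.search(np.array(query_embedding, dtype="float32"), 3)

# Print the top-3 nearest chunks and their L2 distances
for rank, (dist, i) in enumerate(zip(distances[0], indices[0]), start=1):
    print(rank, round(float(dist), 3), chunks[i][:120])
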
preprocess.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ from bs4 import BeautifulSoup
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ import pickle
+
+ def preprocess(legislation_dir="./legislation"):
+     chunks_file = "chunks.pkl"
+     index_file = "index.faiss"
+
+     # Check if precomputed files already exist
+     if os.path.exists(chunks_file) and os.path.exists(index_file):
+         print("Precomputed files found. Skipping preprocessing.")
+         return
+
+     print("Precomputed files not found. Running preprocessing...")
+
+     # Load documents
+     def load_documents(directory):
+         documents = []
+         if not os.path.exists(directory):
+             raise FileNotFoundError(f"Directory '{directory}' not found. Please upload legislation files.")
+         for filename in os.listdir(directory):
+             if filename.endswith(".html"):
+                 file_path = os.path.join(directory, filename)
+                 with open(file_path, "r", encoding="utf-8") as f:
+                     soup = BeautifulSoup(f, "html.parser")
+                 text = soup.get_text(separator=" ", strip=True)
+                 documents.append(text)
+         return documents
+
+     documents = load_documents(legislation_dir)
+
+     # Split texts
+     print("Splitting documents into chunks...")
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     chunks = []
+     for doc in documents:
+         chunks.extend(text_splitter.split_text(doc))
+
+     # Create embeddings and FAISS index
+     print("Generating embeddings...")
+     embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+     embeddings = embedding_model.encode(chunks, show_progress_bar=True)
+     dimension = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dimension)
+     index.add(np.array(embeddings))
+
+     # Save precomputed data
+     print("Saving precomputed data...")
+     with open(chunks_file, "wb") as f:
+         pickle.dump(chunks, f)
+     faiss.write_index(index, index_file)
+     print("Preprocessing complete!")
+
+ if __name__ == "__main__":
+     preprocess()
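
Usage note: preprocess.py is intended to be run once (for example `python preprocess.py`) with the HTML legislation files placed in ./legislation; it writes chunks.pkl and index.faiss, which the Gradio app then loads at startup. A quick sanity check after it finishes could look like this (a sketch, assuming the default file names above):

import pickle
import faiss

index = faiss.read_index("index.faiss")
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

# The number of indexed vectors should equal the number of text chunks,
# and the vector dimension should match the embedding model
# (768 for paraphrase-multilingual-mpnet-base-v2).
print(index.ntotal, len(chunks), index.d)
assert index.ntotal == len(chunks)
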