ohalkhateeb committed
Commit aa8c5f5 · verified · 1 Parent(s): 046c9e2

Rename app.py_ to preprocess.py

Files changed (2):
  1. app.py_ +0 -62
  2. preprocess.py +59 -0
app.py_ DELETED
@@ -1,62 +0,0 @@
- import os
- import gradio as gr
- import faiss
- import numpy as np
- import pickle
- from sentence_transformers import SentenceTransformer
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # Load precomputed chunks and FAISS index
- print("Loading precomputed data...")
- with open("chunks.pkl", "rb") as f:
-     chunks = pickle.load(f)
- index = faiss.read_index("index.faiss")
-
- # Load embedding model (for queries only)
- embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
-
- # Load Jais model and tokenizer
- model_name = "inceptionai/jais-13b"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name)
-
- # RAG function
- def get_response(query, k=3):
-     query_embedding = embedding_model.encode([query])
-     distances, indices = index.search(np.array(query_embedding), k)
-     retrieved_chunks = [chunks[i] for i in indices[0]]
-     context = " ".join(retrieved_chunks)
-     prompt = f"استنادًا إلى الوثائق التالية: {context}، أجب على السؤال: {query}"
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=200,
-         do_sample=True,
-         temperature=0.7,
-         top_p=0.9
-     )
-     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return response.split(query)[-1].strip()
-
- # Gradio interface
- with gr.Blocks(title="Dubai Legislation Chatbot") as demo:
-     gr.Markdown("# Dubai Legislation Chatbot\nاسأل أي سؤال حول تشريعات دبي")
-     chatbot = gr.Chatbot()
-     msg = gr.Textbox(placeholder="اكتب سؤالك هنا...", rtl=True)
-     clear = gr.Button("مسح")
-
-     def user(user_message, history):
-         return "", history + [[user_message, None]]
-
-     def bot(history):
-         user_message = history[-1][0]
-         bot_message = get_response(user_message)
-         history[-1][1] = bot_message
-         return history
-
-     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-         bot, chatbot, chatbot
-     )
-     clear.click(lambda: None, None, chatbot, queue=False)
-
- demo.launch()
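
Note: the deleted app.py_ loaded chunks.pkl and index.faiss at startup and only embedded the incoming query at request time. The retrieval step can be exercised on its own, without loading the 13B Jais model, with a minimal sketch along these lines (file and model names are taken from the code above; the query string is only illustrative):

import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the artifacts produced by preprocess.py
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)
index = faiss.read_index("index.faiss")

# Same embedding model the app used for queries
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

query = "ما هي الجهة المختصة بإصدار التشريعات في دبي؟"  # illustrative query
query_embedding = embedding_model.encode([query])
distances, indices = index.search(np.array(query_embedding, dtype="float32"), 3)

# Print the top-3 nearest chunks and their L2 distances
for rank, (dist, i) in enumerate(zip(distances[0], indices[0]), start=1):
    print(rank, round(float(dist), 3), chunks[i][:120])
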
preprocess.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ from bs4 import BeautifulSoup
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ import pickle
+
+ def preprocess(legislation_dir="./legislation"):
+     chunks_file = "chunks.pkl"
+     index_file = "index.faiss"
+
+     # Check if precomputed files already exist
+     if os.path.exists(chunks_file) and os.path.exists(index_file):
+         print("Precomputed files found. Skipping preprocessing.")
+         return
+
+     print("Precomputed files not found. Running preprocessing...")
+
+     # Load documents
+     def load_documents(directory):
+         documents = []
+         if not os.path.exists(directory):
+             raise FileNotFoundError(f"Directory '{directory}' not found. Please upload legislation files.")
+         for filename in os.listdir(directory):
+             if filename.endswith(".html"):
+                 file_path = os.path.join(directory, filename)
+                 with open(file_path, "r", encoding="utf-8") as f:
+                     soup = BeautifulSoup(f, "html.parser")
+                 text = soup.get_text(separator=" ", strip=True)
+                 documents.append(text)
+         return documents
+
+     documents = load_documents(legislation_dir)
+
+     # Split texts
+     print("Splitting documents into chunks...")
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     chunks = []
+     for doc in documents:
+         chunks.extend(text_splitter.split_text(doc))
+
+     # Create embeddings and FAISS index
+     print("Generating embeddings...")
+     embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+     embeddings = embedding_model.encode(chunks, show_progress_bar=True)
+     dimension = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dimension)
+     index.add(np.array(embeddings))
+
+     # Save precomputed data
+     print("Saving precomputed data...")
+     with open(chunks_file, "wb") as f:
+         pickle.dump(chunks, f)
+     faiss.write_index(index, index_file)
+     print("Preprocessing complete!")
+
+ if __name__ == "__main__":
+     preprocess()
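
Usage note: preprocess.py is intended to be run once (for example `python preprocess.py`) with the HTML legislation files placed in ./legislation; it writes chunks.pkl and index.faiss, which the Gradio app then loads at startup. A quick sanity check after it finishes could look like this (a sketch, assuming the default file names above):

import pickle
import faiss

index = faiss.read_index("index.faiss")
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

# The number of indexed vectors should equal the number of text chunks,
# and the vector dimension should match the embedding model
# (768 for paraphrase-multilingual-mpnet-base-v2).
print(index.ntotal, len(chunks), index.d)
assert index.ntotal == len(chunks)
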