Update app.py
app.py
CHANGED

@@ -2,22 +2,24 @@
 import os
 import re
 import json
-import time
 import io
+import time
+import concurrent.futures
 import fitz  # PyMuPDF
 import requests
 import google.generativeai as genai
 import mysql.connector
+from mysql.connector import pooling
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from tqdm import tqdm
 from sentence_transformers import SentenceTransformer
 import gradio as gr
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin, urlparse, unquote
 from collections import deque
-import pandas as pd
+import pandas as pd
 
-# ---
+# --- OCR and DOCX Libraries ---
 try:
     import pytesseract
     from PIL import Image
@@ -35,9 +37,8 @@ except ImportError:
     DOCX_ENABLED = False
     print("⚠️ تحذير: مكتبة python-docx غير موجودة. لن يتم قراءة ملفات وورد.")
 
-
 # ============================================================
-# 1.
+# 1. Settings and Global Variables
 # ============================================================
 print("⏳ جاري تحميل الإعدادات...")
 if os.path.exists('.env'):
@@ -48,8 +49,21 @@ DB_CONFIG = {
     "user": os.getenv("DB_USER"),
     "password": os.getenv("DB_PASSWORD"),
     "database": os.getenv("DB_NAME"),
+    "charset": "utf8mb4"
 }
 
+try:
+    print("⏳ جاري إنشاء مجمع الاتصالات بقاعدة البيانات...")
+    connection_pool = mysql.connector.pooling.MySQLConnectionPool(
+        pool_name="scraper_pool",
+        pool_size=10,
+        **DB_CONFIG
+    )
+    print("✅ تم إنشاء مجمع الاتصالات بنجاح.")
+except mysql.connector.Error as err:
+    print(f"❌ فشل فادح في إنشاء مجمع الاتصالات: {err}")
+    exit()
+
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 if GOOGLE_API_KEY:
     try:
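A note on the pooling hunk above: the `MySQLConnectionPool` is created once at import time, and every worker later borrows a connection with `connection_pool.get_connection()` instead of opening its own session. A minimal, self-contained sketch of that pattern — the `DB_CONFIG` values here are placeholders, not the app's real credentials, which come from `.env`:

```python
# Minimal sketch of the pooling pattern introduced above.
# DB_CONFIG values are placeholders; app.py reads them from .env.
import mysql.connector
from mysql.connector import pooling

DB_CONFIG = {"host": "localhost", "user": "app", "password": "secret",
             "database": "scraper", "charset": "utf8mb4"}

pool = pooling.MySQLConnectionPool(pool_name="scraper_pool", pool_size=10, **DB_CONFIG)

# Leaving the `with` block returns the connection to the pool
# rather than tearing down the underlying session.
with pool.get_connection() as conn:
    with conn.cursor(buffered=True) as cursor:
        cursor.execute("SELECT 1")
        print(cursor.fetchone())  # (1,)
```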
@@ -63,103 +77,100 @@ else:
 
 SCRAPE_URL = os.getenv("SCRAPE_URL", "https://fra.gov.eg/")
 REQUEST_TIMEOUT = 60
-PAGE_LIMIT =
+PAGE_LIMIT = 500
+MAX_WORKERS = 5
 
-print("⏳ جاري تحميل نموذج Embedding المحلي... (قد يستغرق بعض الوقت أول مرة)")
+print(f"⏳ جاري تحميل نموذج Embedding المحلي... (قد يستغرق بعض الوقت أول مرة)")
 LOCAL_EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2")
 print("✅ تم تحميل نموذج Embedding المحلي.")
 
 # ============================================================
-# 2.
+# 2. Database and Network Helpers
 # ============================================================
 def setup_database():
+    """Checks if the database connection from the pool is valid."""
     try:
-
-
-
-
-
-        conn = mysql.connector.connect(**DB_CONFIG)
-        cursor = conn.cursor()
-        print("✅ تم الاتصال بقاعدة البيانات بنجاح.")
-
-        cursor.execute("""
-            CREATE TABLE IF NOT EXISTS documents (
-                id INT AUTO_INCREMENT PRIMARY KEY,
-                source VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL UNIQUE,
-                content LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
-                summary TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
-                status VARCHAR(50) DEFAULT 'pending',
-                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-                filename VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL
-            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-        """)
-
-        cursor.execute("""
-            CREATE TABLE IF NOT EXISTS document_chunks (
-                id INT AUTO_INCREMENT PRIMARY KEY,
-                document_id INT NOT NULL,
-                content TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
-                embedding JSON,
-                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-                FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
-            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-        """)
-        print(" -> جداول قاعدة البيانات جاهزة.")
-        conn.commit()
-        cursor.close()
-        conn.close()
+        with connection_pool.get_connection() as conn:
+            with conn.cursor(buffered=True) as cursor:
+                cursor.execute("SELECT 1")
+                cursor.fetchone()
+        print("✅ الاتصال من المجمع (Pool) سليم وجاهز.")
         return True, "قاعدة البيانات جاهزة."
     except mysql.connector.Error as err:
-        error_msg = f"خطأ فادح في قاعدة البيانات: {err}"
-        print(
-        return False,
+        error_msg = f"❌ خطأ فادح في قاعدة البيانات: {err}"
+        print(error_msg)
+        return False, str(err)
+
+def get_document_count():
+    """Connects to the DB via the pool and returns the current number of documents."""
+    try:
+        with connection_pool.get_connection() as conn:
+            with conn.cursor(buffered=True) as cursor:
+                cursor.execute("SELECT COUNT(*) FROM documents")
+                count = cursor.fetchone()[0]
+        return f"### **العدد الحالي:** {count} مستند"
+    except Exception as e:
+        print(f"Error getting document count: {e}")
+        return "### **العدد الحالي:** خطأ في الاتصال بقاعدة البيانات"
+
+def requests_with_retry(url, method='get', retries=3, delay=5, **kwargs):
+    headers = kwargs.get("headers", {"User-Agent": "Mozilla/5.0"})
+    kwargs["headers"] = headers
+    for i in range(retries):
+        try:
+            if method.lower() == 'get':
+                response = requests.get(url, **kwargs)
+            elif method.lower() == 'head':
+                response = requests.head(url, **kwargs)
+            else:
+                raise ValueError("Unsupported HTTP method")
+            response.raise_for_status()
+            return response
+        except requests.exceptions.RequestException as e:
+            tqdm.write(f"⚠️ فشلت محاولة {i+1} للرابط {url}. خطأ: {e}. سيتم المحاولة مرة أخرى بعد {delay} ثانية.")
+            if i < retries - 1:
+                time.sleep(delay)
+    tqdm.write(f"❌ فشلت كل المحاولات ({retries}) للرابط {url}.")
+    return None
 
 # ============================================================
-# 3.
+# 3. Crawler Functions
 # ============================================================
-def crawl_site(start_url, page_limit=
+def crawl_site(start_url, page_limit=100):
     visited = set([start_url])
     queue = deque([start_url])
     all_links = set([start_url])
     base_domain = urlparse(start_url).netloc
-
     with tqdm(total=page_limit, desc="🔍 Crawling Progress", unit="page") as pbar:
         while queue and len(visited) < page_limit:
             url = queue.popleft()
             pbar.set_description(f"🔍 Crawling: {url[:70]}...")
-
             try:
-
-
-                content_type =
-
+                resp_head = requests_with_retry(url, method='head', timeout=15, allow_redirects=True)
+                if not resp_head: continue
+                content_type = resp_head.headers.get("Content-Type", "").lower()
                 if "text/html" in content_type:
-
-
+                    resp_get = requests_with_retry(url, timeout=REQUEST_TIMEOUT)
+                    if not resp_get: continue
+                    soup = BeautifulSoup(resp_get.text, "html.parser")
                     for a_tag in soup.find_all("a", href=True):
                         href = urljoin(url, a_tag["href"]).split("#")[0].strip()
                         if not href.startswith("http") or urlparse(href).netloc != base_domain or href in visited:
                             continue
                         if len(visited) >= page_limit: break
-
                         visited.add(href)
                         all_links.add(href)
                         pbar.update(1)
-                        # لا نضيف ملفات للزحف مرة أخرى
                         if not any(href.lower().endswith(ext) for ext in ['.pdf', '.docx', '.txt']):
                             queue.append(href)
-            except
-                tqdm.write(f"⚠️ خطأ أثناء الزحف إلى {url}: {e}")
+            except Exception as e:
+                tqdm.write(f"⚠️ خطأ غير متوقع أثناء الزحف إلى {url}: {e}")
                 continue
     return list(all_links)
 
 # ============================================================
-# 4.
+# 4. Text Extraction Functions
 # ============================================================
-
 def _perform_ocr(image_bytes):
     if not OCR_ENABLED: return ""
     try:
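Two behaviors in the hunk above are easy to miss: `requests_with_retry` returns `None` after exhausting its attempts, so every caller has to guard with `if not resp: ...`, and `crawl_site` now sends a cheap HEAD request first, downloading a page body only when the Content-Type says it is HTML. A standalone sketch of that HEAD-then-GET probe using plain `requests` (the URL is a placeholder; in app.py both calls go through the retry wrapper):

```python
# HEAD-then-GET probe, as used by crawl_site above. Placeholder URL;
# app.py wraps both calls in requests_with_retry for the retry loop.
import requests

url = "https://example.com/"  # placeholder

head = requests.head(url, timeout=15, allow_redirects=True)
head.raise_for_status()

# Download the body only for HTML; PDFs/DOCX are recorded as links
# but fetched later by the extraction stage, not by the crawler.
if "text/html" in head.headers.get("Content-Type", "").lower():
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    print(f"fetched {len(resp.text)} characters of HTML")
```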
@@ -194,63 +205,47 @@ def _extract_from_docx(content_bytes):
 def _extract_from_html(content_bytes, url):
     soup = BeautifulSoup(content_bytes, "html.parser")
     text_parts = []
-    # OCR على الصور داخل الصفحة
     if OCR_ENABLED:
         for img_tag in soup.find_all('img'):
             img_src = img_tag.get('src')
             if not img_src: continue
             try:
                 img_url = urljoin(url, img_src)
-                img_resp =
-                img_resp
-
-
+                img_resp = requests_with_retry(img_url, timeout=15)
+                if img_resp:
+                    tqdm.write(f"📝 OCR on image from URL: {img_url[:80]}...")
+                    text_parts.append(_perform_ocr(img_resp.content))
             except Exception as e:
                 tqdm.write(f"⚠️ فشل تحميل أو معالجة الصورة {img_src}: {e}")
 
-    for element in soup(["script", "style", "nav", "footer", "header", "aside", "form", "button"]):
+    for element in soup(["script", "style", "nav", "footer", "header", "aside", "form", "button", "noscript"]):
         element.decompose()
-
-
+    main_content = soup.find("main") or soup.find("article") or soup.find("body")
+    text = main_content.get_text(separator='\n', strip=True) if main_content else ""
+    return text + "\n" + "\n".join(text_parts)
 
 def _clean_text(raw_text):
     if not raw_text or not isinstance(raw_text, str):
         return None
-
-    # إزالة محارف التحكم غير القابلة للطباعة
     text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', raw_text)
-    # إزالة الروابط
     text = re.sub(r'https?://\S+', '', text)
-
-
-
-    # تقسيم السطور وتنظيفها
-    clean_lines = [line.strip() for line in text.split('\n') if len(line.strip()) > 15]
-    if not clean_lines:
-        return None
-
-    final_text = "\n".join(clean_lines)
-    # تطبيع المسافات والأسطر الجديدة
+    lines = [line.strip() for line in text.split('\n') if len(line.strip()) > 10]
+    if not lines: return None
+    final_text = "\n".join(lines)
     final_text = re.sub(r" +", " ", final_text)
     final_text = re.sub(r"(\n\s*){2,}", "\n\n", final_text)
-
     return final_text.strip()
 
 def extract_and_clean_text(url):
-    """
-    الدالة الرئيسية الموجهة: تحمل المحتوى وتختار الطريقة المناسبة للاستخلاص.
-    """
     try:
-
-        resp
-        resp.raise_for_status()
+        resp = requests_with_retry(url, timeout=REQUEST_TIMEOUT)
+        if not resp: return None, None
         content_bytes = resp.content
         content_type = resp.headers.get('Content-Type', '').lower()
-
         raw_text = ""
         if url.lower().endswith(".pdf") or "application/pdf" in content_type:
             raw_text = _extract_from_pdf(content_bytes)
-        elif url.lower().endswith(".docx") or "
+        elif url.lower().endswith(".docx") or "vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
            raw_text = _extract_from_docx(content_bytes)
         elif url.lower().endswith(".txt") or "text/plain" in content_type:
             raw_text = content_bytes.decode('utf-8', errors='ignore')
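The rewritten `_clean_text` keeps lines longer than 10 characters (previously 15) and then normalizes whitespace. The snippet below replays the URL-stripping and normalization steps on a made-up string. One caveat worth knowing: the committed first substitution, `re.sub(r'[\x00-\x1f\x7f-\x9f]', '', ...)`, also matches `\n` (0x0A), so in the app the later `split('\n')` sees text whose newlines were already removed; the demo omits that pass so the line filtering stays visible.

```python
import re

# Made-up input; replays the URL-strip + line-filter + whitespace steps
# of _clean_text (the control-character pass is omitted, see note above).
raw = "Visit https://example.com for details\nShort\nThis sentence is long enough to keep\n"

text = re.sub(r'https?://\S+', '', raw)                  # drop bare URLs
lines = [l.strip() for l in text.split('\n') if len(l.strip()) > 10]
final = "\n".join(lines)
final = re.sub(r" +", " ", final)                        # collapse space runs
final = re.sub(r"(\n\s*){2,}", "\n\n", final)            # collapse blank lines
print(final)
# Visit for details
# This sentence is long enough to keep
```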
@@ -258,20 +253,22 @@ def extract_and_clean_text(url):
             raw_text = _extract_from_html(content_bytes, url)
         else:
             tqdm.write(f"⚠️ نوع ملف غير مدعوم في {url} ({content_type})")
-            return None
-
-
-
+            return None, None
+        cleaned = _clean_text(raw_text)
+        if cleaned and len(cleaned) >= 50:
+            return cleaned, url
+        return None, None
     except Exception as e:
         tqdm.write(f"❌ فشل استخراج وتنظيف النص من {url}: {e}")
-        return None
+        return None, None
 
 # ============================================================
-# 5.
+# 5. Embeddings
 # ============================================================
-def split_into_chunks(text, chunk_size=
+def split_into_chunks(text, chunk_size=500, overlap=100):
     if not text: return []
-
+    step = max(1, chunk_size - overlap)
+    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
 
 def get_embedding(text):
     if GOOGLE_API_KEY:
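The new `split_into_chunks` slides a fixed-size window: the start index advances by `chunk_size - overlap` = 400 characters, so consecutive chunks share 100 characters of context and sentences at a chunk boundary reach the embedder twice instead of being cut. The same logic with toy sizes makes the overlap visible:

```python
# Same sliding-window logic as the diff, shrunk to toy sizes so the
# overlap between consecutive chunks is easy to see.
def split_into_chunks(text, chunk_size=10, overlap=4):
    if not text: return []
    step = max(1, chunk_size - overlap)          # window advances by 6 chars
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]

print(split_into_chunks("abcdefghijklmnopqrst"))
# ['abcdefghij', 'ghijklmnop', 'mnopqrst', 'st']
```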
@@ -282,208 +279,182 @@ def get_embedding(text):
     return LOCAL_EMBEDDER.encode([text])[0].tolist()
 
 # ============================================================
-# 6.
+# 6. Database Storage
 # ============================================================
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def process_single_link(link):
+    """Processes a single link using a connection from the pool."""
+    try:
+        with connection_pool.get_connection() as conn:
+            with conn.cursor(buffered=True) as cursor:
+                cursor.execute("SELECT id FROM documents WHERE original_path = %s", (link,))
+                if cursor.fetchone():
+                    return f"🔄 الرابط {link} موجود بالفعل، سيتم تخطيه."
+
+                clean_content, source_url = extract_and_clean_text(link)
+                if not clean_content: return None
+
+                summary = clean_content[:450] + '...' if len(clean_content) > 450 else clean_content
+
+                # --- FINAL FIX 1: Decode filename to handle Arabic in URLs ---
+                file_name_encoded = os.path.basename(urlparse(link).path)
+                file_name = unquote(file_name_encoded)
+                if not file_name:  # If the path is empty (like for the base domain)
+                    file_name = link  # Use the full link as a fallback
+                # ---
+
+                # --- FINAL FIX 2: Change status to 'completed' to match Laravel ENUM ---
+                status_to_insert = 'completed'
+                # ---
 
-                clean_content = extract_and_clean_text(link)
-                if not clean_content or len(clean_content) < 150:
-                    tqdm.write(f"⚠️ محتوى غير كافٍ أو غير نظيف في {link}، سيتم تخطيه.")
-                    continue
-
-                try:
-                    summary = clean_content[:500].rsplit(' ', 1)[0] + '...'
                 cursor.execute(
-                    "
-                    (
+                    """
+                    INSERT INTO documents (filename, original_path, summary, status, created_at, updated_at)
+                    VALUES (%s, %s, %s, %s, NOW(), NOW())
+                    """,
+                    (file_name, link, summary, status_to_insert)
                 )
                 document_id = cursor.lastrowid
 
                 chunks = split_into_chunks(clean_content)
                 for chunk in chunks:
                     embedding = get_embedding(chunk)
                     cursor.execute(
-                        "INSERT INTO document_chunks (document_id, content, embedding) VALUES (%s, %s, %s)",
-                        (document_id, chunk, json.dumps(embedding))
+                        "INSERT INTO document_chunks (document_id, content, embedding, created_at, updated_at) VALUES (%s, %s, %s, NOW(), NOW())",
+                        (document_id, chunk, json.dumps(embedding) if embedding else None)
                     )
 
-                cursor.execute("UPDATE documents SET status = 'completed' WHERE id = %s", (document_id,))
                 conn.commit()
-
-
-
-
-
-
-                conn.rollback()
-            except Exception as e:
-                tqdm.write(f"❌ خطأ غير متوقع أثناء معالجة {link}: {e}")
-                conn.rollback()
+                # Use the clean file_name in the success message for clarity
+                return f"✅ تم حفظ '{file_name}' بنجاح مع {len(chunks)} جزء."
+    except mysql.connector.Error as err:
+        return f"❌ خطأ قاعدة بيانات أثناء معالجة {link}: {err}"
+    except Exception as e:
+        return f"❌ خطأ غير متوقع أثناء معالجة {link}: {e}"
 
-
-
+def process_and_store_links(links, max_docs, stop_flag):
+    saved_docs_count = 0
+    with tqdm(total=max_docs, desc="💾 Saving Documents") as pbar:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            future_to_link = {executor.submit(process_single_link, link): link for link in links}
+            for future in concurrent.futures.as_completed(future_to_link):
+                if stop_flag["stop"] or saved_docs_count >= max_docs:
+                    for f in future_to_link: f.cancel()
+                    break
+                try:
+                    result_message = future.result()
+                    if result_message:
+                        tqdm.write(result_message)
+                        if result_message.startswith("✅"):
+                            saved_docs_count += 1
+                            pbar.update(1)
+                except Exception as exc:
+                    link = future_to_link[future]
+                    tqdm.write(f"💥 حدث خطأ فادح في معالج الرابط {link}: {exc}")
     return f"✅ اكتملت عملية المعالجة. تم حفظ {saved_docs_count} مستند جديد."
 
 # ============================================================
-# 7.
+# 7. Gradio UI and Main Execution
 # ============================================================
+stop_flag = {"stop": False}
 is_running = False
 
 def run_full_process_wrapper(max_documents_to_save, progress=gr.Progress(track_tqdm=True)):
-    global is_running
+    global is_running, stop_flag
     if is_running:
-
-
+        yield "⏳ العملية قيد التشغيل بالفعل...", "⏳ قيد التشغيل..."
+        return
     is_running = True
+    stop_flag["stop"] = False
     output_log = []
-
     def log_message(msg):
         nonlocal output_log
         output_log.append(msg)
         return "\n".join(output_log)
-
     try:
-        yield log_message("🏁 بدء عملية الإعداد والتحقق...")
-
+        yield log_message("🏁 بدء عملية الإعداد والتحقق..."), "⏳ قيد التشغيل..."
         db_ok, db_msg = setup_database()
         if not db_ok:
             is_running = False
-            yield log_message(f"❌ فشل إعداد قاعدة البيانات: {db_msg}")
+            yield log_message(f"❌ فشل إعداد قاعدة البيانات: {db_msg}"), "🔴 فشل"
             return
-
-        if not GOOGLE_API_KEY:
-            yield log_message("⚠️ تحذير: مفتاح Google API غير موجود. سيتم الاعتماد على النموذج المحلي.")
-
-        if not OCR_ENABLED:
-            yield log_message("⚠️ تحذير: ميزة OCR معطلة. لن يتم قراءة النصوص من الصور.")
-
-        if not DOCX_ENABLED:
-            yield log_message("⚠️ تحذير: ميزة قراءة DOCX معطلة.")
-
-        yield log_message(f"🚀 بدء عملية الزحف للموقع (حد وقائي {PAGE_LIMIT} صفحة)...")
-
+        yield log_message(f"🚀 بدء عملية الزحف للموقع (حد وقائي {PAGE_LIMIT} صفحة)..."), "⏳ قيد التشغيل..."
         all_links = crawl_site(SCRAPE_URL, page_limit=PAGE_LIMIT)
-
-        yield log_message(f"✅ تم العثور على {len(all_links)} رابط فريد.")
-
+        yield log_message(f"✅ تم العثور على {len(all_links)} رابط فريد."), "⏳ قيد التشغيل..."
         if not all_links:
             is_running = False
-            yield log_message("⚠️ لم يتم العثور على أي روابط. انتهت العملية.")
+            yield log_message("⚠️ لم يتم العثور على أي روابط. انتهت العملية."), "✅ مكتمل"
             return
-
         max_docs_int = int(max_documents_to_save)
-        yield log_message(f"🔄 ستبدأ الآن عملية المعالجة والحفظ (الهدف: {max_docs_int} مستند)
-
-        yield log_message(f"\n🎉 {final_message}")
-
+        yield log_message(f"🔄 ستبدأ الآن عملية المعالجة والحفظ (الهدف: {max_docs_int} مستند) باستخدام {MAX_WORKERS} معالج متوازي."), "⏳ قيد التشغيل..."
+        final_message = process_and_store_links(all_links, max_docs_int, stop_flag)
+        yield log_message(f"\n🎉 {final_message}"), "✅ مكتمل"
     except Exception as e:
-        yield log_message(f"\n💥 حدث خطأ فادح وغير متوقع: {e}")
+        yield log_message(f"\n💥 حدث خطأ فادح وغير متوقع: {e}"), "🔴 فشل"
     finally:
         is_running = False
-
-
+        if stop_flag["stop"]:
+            yield log_message("⏹️ تم إيقاف العملية."), "🛑 متوقف"
+
+def stop_process():
+    global stop_flag
+    stop_flag["stop"] = True
+    return "🛑 جاري إيقاف العملية...", "🛑 متوقف"
 
-# --- NEW --- Function to list project files
 def list_project_files():
-    """Lists files in the current directory, returning a Pandas DataFrame."""
     try:
         files = os.listdir('.')
         file_details = []
-        for f in files:
+        for f in sorted(files):
             if os.path.isfile(f):
                 size_bytes = os.path.getsize(f)
                 size_kb = f"{size_bytes / 1024:.2f} KB"
                 file_details.append([f, size_kb])
-
         if not file_details:
             return pd.DataFrame(columns=["File Name", "Size"])
-
        df = pd.DataFrame(file_details, columns=["اسم الملف", "الحجم"])
         return df
     except Exception as e:
         return pd.DataFrame([["خطأ في عرض الملفات", str(e)]], columns=["اسم الملف", "الحجم"])
 
-
 def main():
     with gr.Blocks(theme=gr.themes.Soft(), title="Web Scraper & Indexer") as demo:
         gr.Markdown(
             """
-            # 🚀 Web Scraper & Intelligent Indexer (
+            # 🚀 Web Scraper & Intelligent Indexer (V9 - Final)
             أداة متكاملة للتحكم في عملية كشط المواقع وفهرسة محتواها في قاعدة البيانات.
             """
         )
-
-        # --- NEW --- Tabbed interface
         with gr.Tabs():
-            # --- Main App Tab ---
             with gr.Tab("التطبيق الرئيسي"):
-                with gr.
-
-
-
-
-                max_docs_input = gr.Number(
-                    label="الحد الأقصى لعدد المستندات الجديدة للحفظ",
-                    value=50,
-                    minimum=1,
-                    step=10
-                )
-
-                # --- NEW --- Collapsible log
-                with gr.Accordion("عرض سجل العمليات (Log)", open=True):
-                    output_log = gr.Textbox(
-                        label="سجل العمليات",
-                        lines=20,
-                        interactive=False,
-                        autoscroll=True
-                    )
-
-
-
-
-
-
-
-
-
-
-
-
-                )
+                with gr.Row():
+                    status_indicator = gr.Textbox(label="الحالة", value="⚪ خامل", interactive=False, scale=3)
+                    doc_count_display = gr.Markdown("### العدد الحالي: جاري التحميل...")
+                    refresh_count_btn = gr.Button("🔄 تحديث العدد")
+
+                with gr.Row():
+                    run_btn = gr.Button("🚀 ابدأ الزحف والفهرسة", variant="primary", scale=2)
+                    stop_btn = gr.Button("🛑 إيقاف", variant="stop", scale=1)
+
+                max_docs_input = gr.Number(label="الحد الأقصى لعدد المستندات الجديدة للإضافة", value=50, minimum=1, step=10)
+
+                with gr.Accordion("عرض سجل العمليات (Log)", open=True):
+                    output_log = gr.Textbox(label="سجل العمليات", lines=20, interactive=False, autoscroll=True)
+
+                run_event = run_btn.click(fn=run_full_process_wrapper, inputs=[max_docs_input], outputs=[output_log, status_indicator])
+                stop_btn.click(fn=stop_process, inputs=[], outputs=[output_log, status_indicator], cancels=[run_event])
+                refresh_count_btn.click(fn=get_document_count, inputs=None, outputs=doc_count_display)
 
-            # --- Files Tab ---
             with gr.Tab("ملفات المشروع"):
-
-
-
-
-
-
-
-
-                    fn=list_project_files,
-                    inputs=[],
-                    outputs=file_display
-                )
-
-
+                gr.Markdown("## عرض الملفات الموجودة في مجلد المشروع.")
+                refresh_files_btn = gr.Button("🔄 تحديث قائمة الملفات")
+                file_display = gr.DataFrame(headers=["اسم الملف", "الحجم"], datatype=["str", "str"], label="قائمة الملفات")
+                refresh_files_btn.click(fn=list_project_files, inputs=[], outputs=file_display)
+
+        demo.load(fn=get_document_count, inputs=None, outputs=doc_count_display)
+        demo.load(fn=list_project_files, inputs=[], outputs=file_display)
+
         print("\n✅ الواجهة المحسنة جاهزة.")
-        demo.launch()
+        demo.launch(server_name="0.0.0.0", share=True)
 
 if __name__ == "__main__":
     main()
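The storage path in the last hunk is now fan-out/fan-in: `process_single_link` does the whole per-URL job (dedupe check, extraction, document insert, chunk embedding) on its own pooled connection, while `process_and_store_links` submits every link to a `ThreadPoolExecutor`, counts successes as futures complete, and cancels the not-yet-started rest once the target count or the stop flag is hit. A stripped-down sketch of that pattern, with a stand-in work function:

```python
# Fan-out/fan-in skeleton matching process_and_store_links in the diff.
# `handle` is a placeholder for the real per-link work.
import concurrent.futures

def handle(item):
    return f"✅ {item}"  # stand-in for process_single_link

def run_all(items, max_ok, stop_flag):
    ok = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(handle, it): it for it in items}
        for fut in concurrent.futures.as_completed(futures):
            if stop_flag["stop"] or ok >= max_ok:
                # cancel() only stops tasks that have not started yet;
                # already-running ones finish their current item.
                for f in futures:
                    f.cancel()
                break
            if fut.result().startswith("✅"):
                ok += 1
    return ok

print(run_all(range(10), max_ok=3, stop_flag={"stop": False}))  # 3
```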
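One UI detail that is easy to miss: the new stop button works on two levels. It sets the shared `stop_flag` dict, which `process_and_store_links` polls between completed futures, and it also passes `cancels=[run_event]` so Gradio stops consuming the still-running generator that feeds the log textbox. A minimal sketch of the `cancels=` pattern, with placeholder component names:

```python
# Minimal sketch of Gradio's `cancels=` pattern used by the stop button
# above (component names here are placeholders).
import time
import gradio as gr

def long_job():
    for i in range(100):
        time.sleep(0.5)
        yield f"step {i}"

with gr.Blocks() as demo:
    out = gr.Textbox()
    start = gr.Button("Start")
    stop = gr.Button("Stop")
    job = start.click(fn=long_job, inputs=None, outputs=out)
    # Clicking stop cancels the streaming event started by `start`.
    stop.click(fn=None, inputs=None, outputs=None, cancels=[job])

if __name__ == "__main__":
    demo.launch()
```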