FauziIsyrinApridal committed on
Commit f1150bb · 1 Parent(s): a300f9d

fix pnp pimpinan

Files changed (1)
  1. scrapping/pnp_scrap.py +286 -416
scrapping/pnp_scrap.py CHANGED
@@ -3,27 +3,23 @@ from scrapy.crawler import CrawlerProcess
 from datetime import datetime
 import re
 import os
-import tempfile
-import logging
-from typing import Optional, List, Dict, Any
 from supabase import create_client, Client
-from dotenv import load_dotenv

-# Load environment variables
-load_dotenv()

-# Environment variables with validation
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
-SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")

 class PNPContentSpider(scrapy.Spider):
     name = 'pnp_content_spider'
-    start_urls = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']

     excluded_subdomains = [
         'akt.pnp.ac.id',
-        'an.pnp.ac.id',
         'bing.pnp.ac.id',
         'elektro.pnp.ac.id',
         'me.pnp.ac.id',
@@ -32,324 +28,274 @@ class PNPContentSpider(scrapy.Spider):
     ]

     custom_settings = {
-        'DOWNLOAD_DELAY': 2,
         'RETRY_TIMES': 3,
         'HTTPCACHE_ENABLED': False,
         'ROBOTSTXT_OBEY': True,
         'CONCURRENT_REQUESTS': 1,
-        'RETRY_ENABLED': True,
-        'USER_AGENT': 'PNPBot/1.0 (+https://www.pnp.ac.id)',
-        'DOWNLOAD_TIMEOUT': 60,
-        'DEPTH_LIMIT': 3,
-        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
     }

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # Validate environment variables
-        if not all([SUPABASE_URL, SUPABASE_KEY]):
-            raise ValueError("Missing required environment variables: SUPABASE_URL and SUPABASE_KEY")
-
-        try:
-            self.supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
-        except Exception as e:
-            self.logger.error(f"Failed to initialize Supabase client: {e}")
-            raise

-        self.bucket = SUPABASE_BUCKET
-        self.processed_urls = set()
-        self.upload_stats = {'success': 0, 'failed': 0}
-
-    def should_follow_link(self, url: str) -> bool:
-        """Check if URL should be followed based on exclusion rules"""
-        if not url or url.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
-            return False

-        # Check for excluded subdomains
-        for subdomain in self.excluded_subdomains:
-            if subdomain in url:
-                return False

-        # Skip certain file types
-        excluded_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
-                               '.jpg', '.jpeg', '.png', '.gif', '.svg', '.zip', '.rar']
-        if any(url.lower().endswith(ext) for ext in excluded_extensions):
-            return False

-        return True

-    def format_paragraph(self, text: str, max_words: int = 150) -> str:
-        """Format text into well-structured paragraphs"""
-        if not text:
-            return ""
-
-        # Clean and normalize text
-        text = re.sub(r'\s+', ' ', text.strip())
-        sentences = re.split(r'(?<=[.!?])\s+', text)
-
         paragraph = ''
         word_count = 0
-
         for sentence in sentences:
             words = sentence.split()
-            if word_count + len(words) > max_words and word_count >= 50:
-                break
             word_count += len(words)
             paragraph += sentence + ' '
-
         return paragraph.strip()

     def parse(self, response):
-        """Parse main navigation and follow links"""
-        if response.status != 200:
-            self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
-            return
-
         self.logger.info(f"Processing main page: {response.url}")
-
-        # Parse navigation items
         nav_items = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
-
         for item in nav_items:
-            # Main menu item
-            main_title = self.extract_menu_title(item)
             main_link = item.css('a.wp-block-navigation-item__content::attr(href)').get()
-
-            if main_link and self.should_follow_link(main_link):
                 main_link = response.urljoin(main_link)
-                if main_link not in self.processed_urls:
-                    self.processed_urls.add(main_link)
-                    yield scrapy.Request(
-                        main_link,
-                        callback=self.parse_content,
-                        meta={'page_title': main_title, 'menu_path': main_title},
-                        errback=self.handle_error
-                    )
-
-            # Submenu items
             submenus = item.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item')
             for submenu in submenus:
-                submenu_title = self.extract_menu_title(submenu)
                 submenu_link = submenu.css('a.wp-block-navigation-item__content::attr(href)').get()
-
-                if submenu_link and self.should_follow_link(submenu_link):
                     submenu_link = response.urljoin(submenu_link)
-                    if submenu_link not in self.processed_urls:
-                        self.processed_urls.add(submenu_link)
-                        menu_path = f"{main_title} > {submenu_title}" if main_title else submenu_title
-                        yield scrapy.Request(
-                            submenu_link,
-                            callback=self.parse_content,
-                            meta={'page_title': submenu_title, 'menu_path': menu_path},
-                            errback=self.handle_error
-                        )
-
-    def extract_menu_title(self, item) -> str:
-        """Extract menu title from navigation item"""
-        title = item.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
-        if not title:
-            title = item.css('a.wp-block-navigation-item__content::text').get('').strip()
-        return title or "Unknown"
-
-    def handle_error(self, failure):
-        """Handle request errors"""
-        self.logger.error(f"Request failed: {failure.request.url} - {failure.value}")
-
-    def parse_content(self, response):
-        """Parse content from pages"""
-        if response.status != 200:
-            return
-
-        page_title = response.meta.get('page_title', 'Unknown Page')
-        menu_path = response.meta.get('menu_path', '')
-
-        # Extract page title if not provided
-        if page_title == 'Unknown Page':
-            title_selectors = ['h1.entry-title::text', 'h1.page-title::text', 'title::text', 'h1::text']
-            for selector in title_selectors:
-                title = response.css(selector).get()
-                if title:
-                    page_title = title.strip()
-                    break
-
-        self.logger.info(f"Extracting content from: {response.url} ({page_title})")
-
-        # Special case handling
-        if self.is_leadership_page(response.url):
-            content_text = self.parse_leadership_page(response, page_title)
-        else:
-            content_text = self.parse_general_content(response, page_title, menu_path)

-        if content_text:
-            result = self.upload_content(content_text, page_title, response.url, menu_path)
-            if result:
-                yield result
-
-        # Follow additional links on same domain
-        yield from self.follow_additional_links(response, menu_path)
-
-    def is_leadership_page(self, url: str) -> bool:
-        """Check if this is the leadership page"""
-        return url.strip("/") == "https://www.pnp.ac.id/pnp-profil/pimpinan-pnp"
-
-    def parse_leadership_page(self, response, page_title: str) -> str:
-        """Parse the leadership page with special handling"""
-        self.logger.info("Detected special page: Pimpinan PNP")

-        paragraphs = []

-        # Find all tables that contain leadership data
-        tables = response.css('table')

-        for table in tables:
-            # Skip nested tables (social media icons, etc.)
-            if table.css('img'):
-                continue
-
-            rows = table.css('tr')
-            leader_info = {}
-            position = ""
-
-            for i, row in enumerate(rows):
-                cells = row.css('td')

-                # First row often contains the position title
-                if len(cells) == 1 or (len(cells) == 3 and cells[0].get('colspan')):
-                    # Extract position from first row
-                    position_text = row.xpath('string(.)').get('').strip()
-                    if any(title in position_text.upper() for title in ['DIREKTUR', 'WAKIL DIREKTUR']):
-                        position = position_text
-                        # Clean up position text
-                        position = re.sub(r'<[^>]+>', '', position)  # Remove any HTML tags
-                        position = re.sub(r'\s+', ' ', position).strip()  # Normalize whitespace
                    continue

-                # Data rows should have exactly 3 cells (key : value)
-                if len(cells) == 3:
-                    key = cells[0].xpath('string(.)').get('').strip()
-                    colon = cells[1].xpath('string(.)').get('').strip()  # Should be ":"
-                    value = cells[2].xpath('string(.)').get('').strip()

-                    # Clean the value from any links but keep the text
-                    value_elem = cells[2]
-                    links = value_elem.css('a')
-                    if links:
-                        # If there's a link, extract the text content
-                        value = links[0].xpath('string(.)').get('').strip()

-                    # Validate and normalize key-value pairs
-                    if key and value and colon == ':' and key.lower() not in ['no', 'keterangan', 'foto']:
-                        # Normalize field names
-                        key_mapping = {
-                            'nama': 'Nama',
-                            'nama lengkap': 'Nama',
-                            'nidn': 'NIDN',
-                            'nip': 'NIP',
-                            'jabatan akademik': 'Jabatan Akademik',
-                            'jurusan': 'Jurusan',
-                            'program studi': 'Program Studi'
-                        }

-                        normalized_key = key_mapping.get(key.lower(), key)
-                        leader_info[normalized_key] = value

-            # If we found leadership data, create narrative
-            if leader_info and position:
-                leader_info['Jabatan'] = position
-                narrative = self.create_leader_narrative(leader_info)
-                if narrative:
-                    paragraphs.append(narrative)
-
-        # Also extract any descriptive paragraphs about leaders
-        content_paragraphs = response.css('div.entry-content p')
-        for para in content_paragraphs:
-            para_text = para.xpath('string(.)').get('').strip()
-            # Look for biographical information (text in italics often contains bio info)
-            if para.css('em') and len(para_text.split()) > 20:
-                # Extract just the italic text which usually contains the biography
-                italic_text = ' '.join(para.css('em *::text').getall()).strip()
-                if italic_text and len(italic_text.split()) > 10:
-                    paragraphs.append(f"Informasi tambahan: {italic_text}")
-
-        return self.format_final_content(page_title, response.url, paragraphs)
-
-    def create_leader_narrative(self, leader_info: Dict[str, str]) -> str:
-        """Create narrative text from leader information - improved version"""
-        # Get information with flexible field matching
-        jabatan = (leader_info.get("Jabatan") or
-                   leader_info.get("jabatan") or
-                   "Pejabat di PNP")
-
-        nama = (leader_info.get("Nama") or
-                leader_info.get("Nama Lengkap") or
-                leader_info.get("nama") or
-                "Tidak diketahui")
-
-        jabatan_akademik = (leader_info.get("Jabatan Akademik") or
-                            leader_info.get("jabatan akademik") or "")
-
-        jurusan = (leader_info.get("Jurusan") or
-                   leader_info.get("jurusan") or "")
-
-        prodi = (leader_info.get("Program Studi") or
-                 leader_info.get("program studi") or "")
-
-        nidn = (leader_info.get("NIDN") or
-                leader_info.get("nidn") or "")
-
-        nip = (leader_info.get("NIP") or
-               leader_info.get("nip") or "")
-
-        # Build narrative with better formatting
-        narrative_parts = []
-
-        # Clean up position title
-        if jabatan:
-            # Remove extra formatting and normalize
-            jabatan = re.sub(r'\s+', ' ', jabatan).strip()
-            jabatan = jabatan.replace('WAKIL DIREKTUR BIDANG', 'Wakil Direktur Bidang')
-            jabatan = jabatan.replace('DIREKTUR', 'Direktur')

-        narrative_parts.append(f"{jabatan} Politeknik Negeri Padang adalah {nama}.")
-
-        if jabatan_akademik and jabatan_akademik.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"Beliau memiliki jabatan akademik {jabatan_akademik}.")
-
-        if jurusan and jurusan.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"Berasal dari Jurusan {jurusan}.")
-
-        if prodi and prodi.lower() not in ['tidak ada', '-']:
-            # Clean up program study name
-            prodi = re.sub(r'^(D-?[34]|Diploma)\s*', '', prodi).strip()
-            narrative_parts.append(f"Program studi {prodi}.")

-        if nidn and nidn.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"NIDN: {nidn}.")
-
-        if nip and nip.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"NIP: {nip}.")

-        return " ".join(narrative_parts)

-    def parse_general_content(self, response, page_title: str, menu_path: str) -> str:
-        """Parse general page content"""
         paragraphs = []

-        # Try different content selectors
         content_selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
-            'div.content', 'div.main-content', 'div#content',
-            'div.page-content', 'div.post-content'
         ]
-
         for selector in content_selectors:
             content_area = response.css(selector)
             if content_area:
-                elements = content_area.css('p, h1, h2, h3, h4, h5, h6, li, div')
-                for elem in elements:
-                    text = self.extract_element_text(elem, response)
-                    if text and len(text.split()) >= 5:  # Filter short texts
                         paragraphs.append(text)
                 if paragraphs:
                     break
@@ -357,185 +303,109 @@ class PNPContentSpider(scrapy.Spider):
         # Fallback: extract from body
         if not paragraphs:
             body_texts = response.css('body *::text').getall()
-            paragraphs = [t.strip() for t in body_texts if t.strip() and len(t.strip().split()) >= 5]

         # Format paragraphs
         formatted_paragraphs = []
         for para in paragraphs:
-            para = re.sub(r'\s+', ' ', para.strip())
             if len(para.split()) >= 10:
-                formatted = self.format_paragraph(para)
-                if formatted and formatted not in formatted_paragraphs:
-                    formatted_paragraphs.append(formatted)
-
-        content = self.format_final_content(page_title, response.url, formatted_paragraphs)
-
-        # Add table data
-        table_content = self.extract_table_data(response)
-        if table_content:
-            content += f"\n\n# Tabel Data\n\n{table_content}"

-        return content

-    def extract_element_text(self, elem, response) -> str:
-        """Extract text from element including links"""
-        text = ' '.join(elem.css('*::text').getall()).strip()
-
-        # Add link information
-        links = elem.css('a::attr(href)').getall()
-        for link in links:
-            if link and not link.startswith('#'):
-                full_link = response.urljoin(link)
-                text += f" (Link: {full_link})"
-
-        return text
-
-    def extract_table_data(self, response) -> str:
        """Extract and format table data"""
-        table_output = []
        tables = response.css('table')

-        for i, table in enumerate(tables, 1):
            table_rows = []
            for row in table.css('tr'):
                cells = row.css('th, td')
                row_data = []
-
                for cell in cells:
-                    cell_text = ' '.join(cell.css('*::text').getall()).strip()
-                    link = cell.css('a::attr(href)').get()
-                    if link:
                        cell_text += f" (Link: {response.urljoin(link)})"
                    if cell_text:
                        row_data.append(cell_text)
-
                if row_data:
                    table_rows.append(" | ".join(row_data))

            if table_rows:
-                table_output.append(f"## Tabel {i}\n")
-                table_output.extend(table_rows)
-                table_output.append("")  # Add spacing
-
-        return "\n".join(table_output)

-    def format_final_content(self, page_title: str, url: str, paragraphs: List[str]) -> str:
-        """Format final content text"""
        return f"""# {page_title}

-Tanggal: {datetime.now().strftime('%d %B %Y')}
-URL: {url}

-{chr(10).join(paragraphs) if paragraphs else 'Tidak ada konten yang dapat diekstrak.'}"""

-    def upload_content(self, content_text: str, page_title: str, url: str, menu_path: str) -> Optional[Dict[str, Any]]:
-        """Upload content to Supabase storage"""
-        # Create safe filename
        safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
-        safe_title = re.sub(r'[-\s]+', '-', safe_title)[:50]  # Limit length
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{safe_title}_{timestamp}.txt"

        try:
-            with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False, suffix='.txt') as temp_file:
-                temp_file.write(content_text)
-                temp_path = temp_file.name
-
-            # Upload to Supabase
-            result = self.supabase.storage.from_(self.bucket).upload(
                path=filename,
-                file=temp_path,
                file_options={"content-type": "text/plain; charset=utf-8"}
            )
-
-            self.upload_stats['success'] += 1
-            self.logger.info(f"✅ Uploaded {filename} successfully.")
-
-            return {
-                'url': url,
-                'title': page_title,
-                'menu_path': menu_path,
-                'uploaded_as': filename,
-                'timestamp': datetime.now().isoformat(),
-                'content_length': len(content_text)
-            }
-
        except Exception as e:
-            self.upload_stats['failed'] += 1
-            self.logger.error(f"❌ Upload error for {filename}: {str(e)}")
-            return None
-        finally:
-            # Clean up temporary file
-            if 'temp_path' in locals() and os.path.exists(temp_path):
-                os.remove(temp_path)
-
-    def follow_additional_links(self, response, menu_path: str):
-        """Follow additional links on the same domain"""
        current_domain = response.url.split('//')[1].split('/')[0]
-
-        # Only follow additional links for non-PNP domains
        if 'pnp.ac.id' not in current_domain:
-            header_selectors = ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']
            header_links = []
-
-            for selector in header_selectors:
-                header_links.extend(response.css(selector).getall())
-
-            # Process unique links
-            processed_links = set()
-            for link in header_links:
-                if not link or link in processed_links:
-                    continue
-
-                if self.should_follow_link(link):
-                    full_link = response.urljoin(link)
-                    if current_domain in full_link and full_link not in self.processed_urls:
-                        processed_links.add(link)
-                        self.processed_urls.add(full_link)
-
-                        yield scrapy.Request(
-                            url=full_link,
-                            callback=self.parse_content,
-                            meta={
-                                'page_title': 'Header Link',
-                                'menu_path': f"{menu_path} > Header"
-                            },
-                            errback=self.handle_error
-                        )
-
-    def closed(self, reason):
-        """Called when spider closes"""
-        self.logger.info(f"Spider closed: {reason}")
-        self.logger.info(f"Upload statistics - Success: {self.upload_stats['success']}, Failed: {self.upload_stats['failed']}")
-        self.logger.info(f"Total URLs processed: {len(self.processed_urls)}")


 if __name__ == '__main__':
-    # Configure logging
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
-    )
-
-    # Validate environment before starting
-    if not all([SUPABASE_URL, SUPABASE_KEY]):
-        print("❌ Missing required environment variables!")
-        exit(1)
-
-    try:
-        process = CrawlerProcess({
-            'USER_AGENT': 'PNPBot/1.0 (+https://www.pnp.ac.id)',
-            'DOWNLOAD_DELAY': 2,
-            'ROBOTSTXT_OBEY': True,
-            'LOG_LEVEL': 'INFO',
-            'CONCURRENT_REQUESTS': 1,
-            'DOWNLOAD_TIMEOUT': 60,
-            'RETRY_TIMES': 3,
-            'HTTPCACHE_ENABLED': False,
-            'DEPTH_LIMIT': 3,
-        })
-        process.crawl(PNPContentSpider)
-        process.start()
-    except Exception as e:
-        logging.error(f"Failed to run spider: {e}")
-        raise

 from datetime import datetime
 import re
 import os
 from supabase import create_client, Client
+import html


 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
+SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


 class PNPContentSpider(scrapy.Spider):
     name = 'pnp_content_spider'
+    start_urls = ['https://www.pnp.ac.id','https://penerimaan.pnp.ac.id']

     excluded_subdomains = [
         'akt.pnp.ac.id',
+        'an.pnp.ac.id',
         'bing.pnp.ac.id',
         'elektro.pnp.ac.id',
         'me.pnp.ac.id',
     ]

     custom_settings = {
+        'DOWNLOAD_DELAY': 1,
         'RETRY_TIMES': 3,
         'HTTPCACHE_ENABLED': False,
         'ROBOTSTXT_OBEY': True,
         'CONCURRENT_REQUESTS': 1,
+        'RETRY_ENABLED': True,
+        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
     }

+    def clean_text(self, text: str) -> str:
+        """Clean and normalize text content"""
+        if not text:
+            return ""

+        # Decode HTML entities
+        text = html.unescape(text)

+        # Remove extra whitespace and normalize
+        text = ' '.join(text.split())

+        # Fix common encoding issues
+        text = text.replace('“', '"').replace('â€', '"').replace('’', "'")
+        text = text.replace('â€"', '').replace('â€"', '')

+        return text.strip()

+    def format_paragraph(self, text: str) -> str:
+        text = self.clean_text(text)
+        sentences = re.split(r'(?<=[.!?]) +', text)
         paragraph = ''
         word_count = 0
         for sentence in sentences:
             words = sentence.split()
             word_count += len(words)
             paragraph += sentence + ' '
+            if 50 <= word_count <= 150:
+                break
         return paragraph.strip()

     def parse(self, response):
         self.logger.info(f"Processing main page: {response.url}")
         nav_items = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
         for item in nav_items:
+            main_title = item.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
+            if not main_title:
+                main_title = item.css('a.wp-block-navigation-item__content::text').get('').strip()
             main_link = item.css('a.wp-block-navigation-item__content::attr(href)').get()
+            if main_link and not main_link.startswith('#'):
                 main_link = response.urljoin(main_link)
+                if "jurusan" in main_link.lower():
+                    continue
+                yield scrapy.Request(main_link, callback=self.parse_content, meta={'page_title': main_title, 'menu_path': main_title})
             submenus = item.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item')
             for submenu in submenus:
+                submenu_title = submenu.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
+                if not submenu_title:
+                    submenu_title = submenu.css('a.wp-block-navigation-item__content::text').get('').strip()
                 submenu_link = submenu.css('a.wp-block-navigation-item__content::attr(href)').get()
+                if submenu_link and not submenu_link.startswith('#'):
                     submenu_link = response.urljoin(submenu_link)
+                    if "jurusan" in submenu_link.lower():
+                        continue
+                    menu_path = f"{main_title} > {submenu_title}" if main_title else submenu_title
+                    yield scrapy.Request(submenu_link, callback=self.parse_content, meta={'page_title': submenu_title, 'menu_path': menu_path})

+    def extract_leadership_info(self, response):
+        """Extract leadership information from the special leadership page"""
+        self.logger.info("Extracting leadership information from special page")

+        leaders_data = []

+        # Try multiple table selectors based on the HTML structure shown
+        tables = response.css('table, .wp-block-table table, .entry-content table, tbody')

+        if tables:
+            # Process each table
+            for table_idx, table in enumerate(tables):
+                self.logger.info(f"Processing table {table_idx + 1}")

+                rows = table.css('tr')
+                if not rows:
                    continue

+                leader_info = {}
+                position_title = ""

+                # Look for position title (like "DIREKTUR")
+                title_elements = table.css('strong, .position-title, th')
+                for title_elem in title_elements:
+                    title_text = self.clean_text(' '.join(title_elem.css('*::text').getall()))
+                    if any(pos in title_text.upper() for pos in ['DIREKTUR', 'WAKIL DIREKTUR', 'KETUA', 'SEKRETARIS']):
+                        position_title = title_text
+                        break

+                # Extract key-value pairs from table rows
+                for row in rows:
+                    cells = row.css('td, th')

+                    if len(cells) >= 3:
+                        # Format: Label | : | Value (3 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        separator = self.clean_text(' '.join(cells[1].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[2].css('*::text').getall()))

+                        if key and value and separator == ":":
+                            leader_info[key] = value

+                    elif len(cells) == 2:
+                        # Format: Label | Value (2 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[1].css('*::text').getall()))

+                        if key and value and key != value:
+                            # Skip if key contains colon (likely "Label:")
+                            clean_key = key.replace(':', '').strip()
+                            leader_info[clean_key] = value

+                # Add position title if found
+                if position_title:
+                    leader_info['Posisi'] = position_title

+                # If we found structured data, add it
+                if leader_info:
+                    leaders_data.append(leader_info)
+                    self.logger.info(f"Extracted leader data: {list(leader_info.keys())}")

+        # Fallback: Extract from general content structure
+        if not leaders_data:
+            self.logger.info("No table data found, trying general content extraction")

+            # Look for profile sections
+            profile_sections = response.css('.wp-block-group, .entry-content > div, .profile-section')

+            for section in profile_sections:
+                section_text = self.clean_text(' '.join(section.css('*::text').getall()))

+                # Check if this section contains leadership info
+                if any(keyword in section_text.lower() for keyword in ['direktur', 'wakil direktur', 'dr.', 's.t.', 'm.kom', 'nidn']):
+                    # Try to extract structured info from the text
+                    leader_info = {'description': section_text}

+                    # Try to extract specific details using regex
+                    name_match = re.search(r'(Dr\.|Ir\.|Prof\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(S\.T\.|M\.Kom|M\.T\.|S\.E\.|M\.M\.)*', section_text)
+                    if name_match:
+                        leader_info['Nama'] = name_match.group(0).strip()

+                    nidn_match = re.search(r'NIDN[:\s]*(\d+)', section_text)
+                    if nidn_match:
+                        leader_info['NIDN'] = nidn_match.group(1)

+                    leaders_data.append(leader_info)

+        return leaders_data

+    def format_leadership_content(self, leaders_data):
+        """Format leadership data into readable content"""
+        formatted_content = []

+        for idx, leader in enumerate(leaders_data, 1):
+            if isinstance(leader, dict):
+                if 'description' in leader and len(leader) == 1:
+                    # Simple description format
+                    content = f"## Pimpinan {idx}\n\n{leader['description']}"
+                else:
+                    # Structured data format
+                    position = leader.get("Posisi", f"Pimpinan {idx}")
+                    content = f"## {position}\n\n"

+                    # Format key information in a logical order
+                    ordered_keys = ['Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi']

+                    # Add ordered information first
+                    for key in ordered_keys:
+                        if key in leader:
+                            content += f"**{key}**: {leader[key]}\n\n"

+                    # Add remaining information
+                    for key, value in leader.items():
+                        if key not in ordered_keys and key not in ['Posisi', 'description']:
+                            content += f"**{key}**: {value}\n\n"

+                    # Add description if exists
+                    if 'description' in leader:
+                        content += f"\n{leader['description']}\n\n"

+                formatted_content.append(content.strip())

+        return formatted_content

+    def parse_content(self, response):
+        page_title = response.meta.get('page_title', 'Unknown Page')
+        menu_path = response.meta.get('menu_path', '')
+        if page_title == 'Unknown Page':
+            page_title = self.clean_text(response.css('h1.entry-title::text, h1.page-title::text').get(''))

+        self.logger.info(f"Extracting content from: {response.url} ({page_title})")

         paragraphs = []

+        # 🔹 Special case: halaman pimpinan PNP
+        if ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url:
+            self.logger.info("Detected leadership page - using special extraction")

+            leaders_data = self.extract_leadership_info(response)
+            self.logger.info(f"Found {len(leaders_data)} leadership entries")

+            if leaders_data:
+                formatted_leaders = self.format_leadership_content(leaders_data)
+                paragraphs = formatted_leaders

+                # Also extract any additional content from the page
+                additional_content = self.extract_general_content(response)
+                if additional_content:
+                    paragraphs.extend(["## Informasi Tambahan"] + additional_content)
+            else:
+                # Fallback to general content extraction
+                self.logger.warning("Leadership extraction failed, falling back to general extraction")
+                paragraphs = self.extract_general_content(response)
+        else:
+            # 🔹 Normal content extraction
+            paragraphs = self.extract_general_content(response)

+        # Create final content
+        content_text = self.create_final_content(page_title, response.url, paragraphs)

+        # Add table data if any (but skip for leadership pages to avoid duplication)
+        if not (("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url):
+            table_content = self.extract_table_data(response)
+            if table_content:
+                content_text += "\n\n## Data Tabel\n\n" + table_content

+        # Upload to Supabase
+        filename = self.upload_content(page_title, content_text)

+        yield {
+            'url': response.url,
+            'title': page_title,
+            'menu_path': menu_path,
+            'uploaded_as': filename,
+            'timestamp': datetime.now().isoformat(),
+            'content_length': len(content_text),
+            'leadership_page': ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url
+        }

+        # Continue with additional scraping if needed
+        self.process_additional_links(response, menu_path)

+    def extract_general_content(self, response):
+        """Extract general content from the page"""
+        paragraphs = []

         content_selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
+            'div.content', 'div.main-content', 'div#content', 'div.page-content'
         ]

         for selector in content_selectors:
             content_area = response.css(selector)
             if content_area:
+                elems = content_area.css('p, h1, h2, h3, h4, h5, h6, li, div.wp-block-group')
+                for elem in elems:
+                    text = self.clean_text(' '.join(elem.css('*::text').getall()))
+                    if text and len(text.split()) >= 5:
+                        # Add links if any
+                        links = elem.css('a::attr(href)').getall()
+                        for link in links:
+                            if link and not link.startswith('#'):
+                                text += f" (Link: {response.urljoin(link)})"
                         paragraphs.append(text)
                 if paragraphs:
                     break

         # Fallback: extract from body
         if not paragraphs:
             body_texts = response.css('body *::text').getall()
+            combined_text = self.clean_text(' '.join(body_texts))
+            if combined_text:
+                # Split into meaningful chunks
+                sentences = re.split(r'(?<=[.!?])\s+', combined_text)
+                current_para = ""
+                for sentence in sentences:
+                    if len((current_para + " " + sentence).split()) <= 50:
+                        current_para += " " + sentence
+                    else:
+                        if current_para.strip():
+                            paragraphs.append(current_para.strip())
+                        current_para = sentence
+                if current_para.strip():
+                    paragraphs.append(current_para.strip())

         # Format paragraphs
         formatted_paragraphs = []
         for para in paragraphs:
             if len(para.split()) >= 10:
+                formatted_paragraphs.append(self.format_paragraph(para))

+        return formatted_paragraphs

+    def extract_table_data(self, response):
         """Extract and format table data"""
         tables = response.css('table')
+        table_output = []

+        for table_idx, table in enumerate(tables):
             table_rows = []
             for row in table.css('tr'):
                 cells = row.css('th, td')
                 row_data = []
                 for cell in cells:
+                    cell_text = self.clean_text(' '.join(cell.css('*::text').getall()))
+                    if link := cell.css('a::attr(href)').get():
                         cell_text += f" (Link: {response.urljoin(link)})"
                     if cell_text:
                         row_data.append(cell_text)
                 if row_data:
                     table_rows.append(" | ".join(row_data))

             if table_rows:
+                table_output.append(f"### Tabel {table_idx + 1}\n\n" + "\n".join(table_rows))

+        return "\n\n".join(table_output)

+    def create_final_content(self, page_title, url, paragraphs):
+        """Create the final formatted content"""
         return f"""# {page_title}

+**Tanggal**: {datetime.now().strftime('%d %B %Y')}
+**URL**: {url}

+{chr(10).join(paragraphs)}"""

+    def upload_content(self, page_title, content_text):
+        """Upload content to Supabase"""
         safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
+        safe_title = re.sub(r'[-\s]+', '-', safe_title)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"{safe_title}_{timestamp}.txt"

         try:
+            supabase.storage.from_(SUPABASE_BUCKET).upload(
                 path=filename,
+                file=content_text.encode('utf-8'),
                 file_options={"content-type": "text/plain; charset=utf-8"}
             )
+            self.logger.info(f"Uploaded {filename} successfully.")
+            return filename
         except Exception as e:
+            self.logger.error(f"Upload error for {filename}: {str(e)}")
+            return f"failed_{filename}"

+    def process_additional_links(self, response, menu_path):
+        """Process additional links from the same domain"""
         current_domain = response.url.split('//')[1].split('/')[0]
         if 'pnp.ac.id' not in current_domain:
             header_links = []
+            for sel in ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']:
+                header_links.extend(response.css(sel).getall())
+            for link in set(link for link in header_links if link and not link.startswith(('#', 'javascript:'))):
+                full_link = response.urljoin(link)
+                if current_domain in full_link:
+                    yield scrapy.Request(
+                        url=full_link,
+                        callback=self.parse_content,
+                        meta={'page_title': 'Header Link', 'menu_path': f"{menu_path} > Header"}
+                    )


 if __name__ == '__main__':
+    process = CrawlerProcess({
+        'USER_AGENT': 'PNPBot/1.0',
+        'DOWNLOAD_DELAY': 2,
+        'ROBOTSTXT_OBEY': True,
+        'LOG_LEVEL': 'INFO',
+        'CONCURRENT_REQUESTS': 1,
+        'DOWNLOAD_TIMEOUT': 100,
+        'RETRY_TIMES': 3,
+        'HTTPCACHE_ENABLED': False,
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    })
+    process.crawl(PNPContentSpider)
+    process.start()
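
Besides the dedicated handling for "pimpinan" pages, the other functional change in this commit is that scraped text is now uploaded to Supabase Storage directly as UTF-8 bytes instead of through a temporary file. The snippet below is a minimal standalone sketch of that upload path only; it is not part of the commit, and it assumes the same environment variables the script reads (NEXT_PUBLIC_SUPABASE_URL, SUPABASE_SERVICE_KEY, NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET) are set and that the target bucket already exists.

```python
import os
from datetime import datetime

from supabase import create_client, Client

# Same environment variables the spider reads (assumed to be set).
url = os.environ["NEXT_PUBLIC_SUPABASE_URL"]
key = os.environ["SUPABASE_SERVICE_KEY"]
# "pnp-bot-storage" was the default bucket name in the previous version of the script.
bucket = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")

supabase: Client = create_client(url, key)

# Upload in-memory text as UTF-8 bytes, mirroring upload_content() in the new version.
content_text = "# Direktur\n\n**Nama**: (contoh)\n"
filename = f"contoh_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

supabase.storage.from_(bucket).upload(
    path=filename,
    file=content_text.encode("utf-8"),
    file_options={"content-type": "text/plain; charset=utf-8"},
)
```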