FauziIsyrinApridal committed on
Commit c41b08a
1 Parent(s): 30065a4

Major update to the jurusan scraper

Files changed (2)
  1. requirements.txt +1 -0
  2. scrapping/jurusan_scrap.py +118 -824
requirements.txt CHANGED
@@ -2,3 +2,4 @@ scrapy
  supabase
  python-dotenv
  requests
+ beautifulsoup4
scrapping/jurusan_scrap.py CHANGED
@@ -1,831 +1,125 @@
1
  import scrapy
2
  from scrapy.crawler import CrawlerProcess
3
- import re
4
- import os
5
- import unicodedata
6
- from urllib.parse import urlparse, urljoin
7
- from datetime import datetime
8
- from collections import defaultdict
9
- from supabase import create_client
10
  from dotenv import load_dotenv
11
-
12
- # Load environment variables
13
- load_dotenv()
14
-
15
- class PNPDepartmentSpider(scrapy.Spider):
16
- name = 'improved_pnp_department_spider'
17
-
18
- DEPARTMENTS = {
19
- 'akt.pnp.ac.id': 'Akuntansi',
20
- 'an.pnp.ac.id': 'Administrasi_Niaga',
21
- 'bing.pnp.ac.id': 'Bahasa_Inggris',
22
- 'elektro.pnp.ac.id': 'Teknik_Elektro',
23
- 'me.pnp.ac.id': 'Teknik_Mesin',
24
- 'sipil.pnp.ac.id': 'Teknik_Sipil',
25
- 'ti.pnp.ac.id': 'Teknologi_Informasi'
26
- }
27
-
28
- start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
29
- visited_urls = set()
30
-
31
- custom_settings = {
32
- 'DOWNLOAD_DELAY': 2.0,
33
- 'ROBOTSTXT_OBEY': True,
34
- 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
35
- 'LOG_LEVEL': 'INFO',
36
- 'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
37
- "DOWNLOAD_TIMEOUT": 100,
38
- 'RETRY_TIMES': 3,
39
- 'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
40
- 'HTTPCACHE_ENABLED': True
41
- }
42
-
43
- def __init__(self, *args, **kwargs):
44
- super().__init__(*args, **kwargs)
45
- self.supabase = create_client(
46
- os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
47
- os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
48
- )
49
- self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
50
- self.department_data = defaultdict(lambda: defaultdict(list))
51
- self.study_programs = defaultdict(list)
52
- self.department_info = defaultdict(dict)
53
-
54
- def start_requests(self):
55
- for url in self.start_urls:
56
- yield scrapy.Request(
57
- url=url,
58
- callback=self.parse_department_homepage,
59
- errback=self.handle_error,
60
- headers={'Accept': 'text/html,application/xhtml+xml'}
61
- )
62
-
63
- def parse_department_homepage(self, response):
64
- domain = urlparse(response.url).netloc
65
- department = self.DEPARTMENTS.get(domain, domain)
66
- self.visited_urls.add(response.url)
67
-
68
- self.logger.info(f"Processing department homepage: {domain} - {department}")
69
-
70
- # Extract homepage content first
71
- homepage_content = self.extract_content(response)
72
- page_title = response.css('h1::text, .site-title::text, title::text').get()
73
- if page_title:
74
- page_title = page_title.strip()
75
- else:
76
- page_title = "Homepage"
77
-
78
- if homepage_content:
79
- self.save_page_content(
80
- response.url,
81
- page_title,
82
- department,
83
- domain,
84
- 'Beranda',
85
- homepage_content
86
- )
87
-
88
- # Process navigation menu
89
- nav_elements = self.extract_navigation(response)
90
- for nav_item in nav_elements:
91
- if not nav_item['link'] or nav_item['link'].startswith('#'):
92
- continue
93
-
94
- full_url = response.urljoin(nav_item['link'])
95
- category = self.determine_category(nav_item['text'])
96
-
97
- if full_url not in self.visited_urls:
98
- yield scrapy.Request(
99
- url=full_url,
100
- callback=self.parse_content_page,
101
- meta={
102
- 'page_title': nav_item['text'],
103
- 'category': category,
104
- 'department': department,
105
- 'domain': domain,
106
- 'menu_path': nav_item['text']
107
- },
108
- errback=self.handle_error
109
- )
110
-
111
- # Find and process all study program links
112
- study_program_links = self.extract_study_program_links(response)
113
- for prog in study_program_links:
114
- full_url = response.urljoin(prog['link'])
115
- if full_url not in self.visited_urls:
116
- yield scrapy.Request(
117
- url=full_url,
118
- callback=self.parse_study_program,
119
- meta={
120
- 'page_title': prog['title'],
121
- 'department': department,
122
- 'domain': domain
123
- },
124
- errback=self.handle_error
125
- )
126
-
127
- # Find and process vision & mission specifically
128
- vision_mission_links = self.extract_vision_mission_links(response)
129
- for vm_link in vision_mission_links:
130
- full_url = response.urljoin(vm_link['link'])
131
- if full_url not in self.visited_urls:
132
- yield scrapy.Request(
133
- url=full_url,
134
- callback=self.parse_vision_mission,
135
- meta={
136
- 'page_title': vm_link['title'],
137
- 'department': department,
138
- 'domain': domain
139
- },
140
- errback=self.handle_error
141
- )
142
-
143
- def extract_navigation(self, response):
144
- """Extract navigation elements from page"""
145
- nav_items = []
146
-
147
- # Try multiple selectors that commonly contain navigation
148
- nav_selectors = [
149
- 'nav a', '.navbar a', '.navigation a', '.main-menu a', '.nav a',
150
- '#menu a', '.menu a', 'header a', '.navbar-collapse a',
151
- 'ul.nav a', '.dropdown-menu a', '.megamenu a',
152
- '#main-menu a', '.main-navigation a', '#primary-menu a',
153
- '.top-menu a', '.primary-menu a', '#nav a'
154
- ]
155
-
156
- for selector in nav_selectors:
157
- for item in response.css(selector):
158
- text = self.clean_text(' '.join(item.css('::text').getall()))
159
- link = item.css('::attr(href)').get()
160
-
161
- if text and link and len(text.strip()) > 1:
162
- if not self.is_social_media_link(link) and not self.is_unwanted_url(link):
163
- nav_items.append({
164
- 'text': text.strip(),
165
- 'link': response.urljoin(link)
166
- })
167
-
168
- return nav_items
169
-
170
- def extract_study_program_links(self, response):
171
- """Extract study program links from complex WordPress navigation menus"""
172
- program_links = []
173
-
174
- # XPath untuk menemukan menu 'Program Studi' dan semua submenu-nya
175
- base_xpath = """
176
- //li[contains(@class, 'wp-block-navigation-submenu')]
177
- [.//span[contains(translate(., 'PROGRAMSTUDI', 'programstudi'), 'program studi')]]
178
- //ul[@class='wp-block-navigation__submenu-container']
179
- /li[contains(@class, 'wp-block-navigation-submenu')]
180
- /a[contains(@class, 'wp-block-navigation-item__content')]
181
- """
182
-
183
- # Ambil semua link program studi utama
184
- for link in response.xpath(base_xpath):
185
- text = self.clean_text(''.join(link.xpath('.//span[@class="wp-block-navigation-item__label"]//text()').getall()))
186
- url = link.xpath('@href').get()
187
-
188
- if text and url:
189
- program_links.append({
190
- 'title': text.strip(),
191
- 'link': response.urljoin(url)
192
- })
193
-
194
- # Logika fallback untuk website yang menggunakan struktur berbeda
195
- if not program_links:
196
- program_links = self.fallback_extract_study_program_links(response)
197
-
198
- return program_links
199
-
200
- def fallback_extract_study_program_links(self, response):
201
- """Fallback method for extracting study program links"""
202
- program_links = []
203
- keywords = ['program studi', 'prodi', 'd3', 'd4', 's1', 'diploma', 'sarjana']
204
-
205
- for keyword in keywords:
206
- # Case-insensitive search for links containing the keyword
207
- xpath_expr = f"//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{keyword}')]"
208
- for link in response.xpath(xpath_expr):
209
- text = self.clean_text(''.join(link.xpath('.//text()').getall()))
210
- url = link.xpath('@href').get()
211
- if text and url:
212
- program_links.append({
213
- 'title': text.strip(),
214
- 'link': response.urljoin(url)
215
- })
216
-
217
- return program_links
218
-
219
- def extract_vision_mission_links(self, response):
220
- """Extract links specifically for vision & mission"""
221
- vm_links = []
222
-
223
- # Terms related to vision & mission
224
- vm_terms = ['visi', 'misi', 'vision', 'mission', 'visi-misi', 'visi & misi', 'visi dan misi']
225
-
226
- # Look for links containing these terms using XPath
227
- for term in vm_terms:
228
- # Case-insensitive search
229
- xpath_expr = f"//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{term}')]"
230
- for link in response.xpath(xpath_expr):
231
- text = self.clean_text(''.join(link.xpath('.//text()').getall()))
232
- url = link.xpath('@href').get()
233
-
234
- if text and url:
235
- vm_links.append({
236
- 'title': text.strip(),
237
- 'link': response.urljoin(url)
238
- })
239
-
240
- return vm_links
241
-
242
- def parse_content_page(self, response):
243
- """Process regular content pages"""
244
- meta = response.meta
245
- self.visited_urls.add(response.url)
246
-
247
- # Extract content from this page
248
- content = self.extract_content(response)
249
-
250
- if content:
251
- self.save_page_content(
252
- response.url,
253
- meta['page_title'],
254
- meta['department'],
255
- meta['domain'],
256
- meta['category'],
257
- content,
258
- meta.get('menu_path', '')
259
- )
260
-
261
- # Look for additional sub-links within this page
262
- content_links = response.css('article a, .content a, .entry-content a, .post-content a, main a')
263
- for link in content_links:
264
- link_text = self.clean_text(' '.join(link.css('::text').getall()))
265
- link_url = link.css('::attr(href)').get()
266
-
267
- if link_text and link_url and len(link_text) > 3:
268
- # Only follow internal links
269
- parsed_url = urlparse(response.urljoin(link_url))
270
- if parsed_url.netloc == meta['domain']:
271
- full_url = response.urljoin(link_url)
272
- if full_url not in self.visited_urls and not self.is_unwanted_url(full_url):
273
- yield scrapy.Request(
274
- url=full_url,
275
- callback=self.parse_content_page,
276
- meta={
277
- 'page_title': link_text,
278
- 'category': meta['category'], # Keep parent category
279
- 'department': meta['department'],
280
- 'domain': meta['domain'],
281
- 'menu_path': f"{meta.get('menu_path', '')} > {link_text}"
282
- },
283
- errback=self.handle_error
284
- )
285
-
286
- def parse_study_program(self, response):
287
- """Process study program pages specifically"""
288
- meta = response.meta
289
- self.visited_urls.add(response.url)
290
-
291
- department = meta['department']
292
- program_title = meta['page_title']
293
-
294
- # Extract program details
295
- program_details = self.extract_program_details(response)
296
-
297
-
298
- # Add to the study programs collection
299
- self.study_programs[department].append({
300
- 'title': program_title,
301
- 'url': response.url,
302
- 'details': program_details
303
- })
304
-
305
- # Also save as a regular page
306
- content = self.extract_content(response)
307
- if content:
308
- self.save_page_content(
309
- response.url,
310
- program_title,
311
- department,
312
- meta['domain'],
313
- 'Program_Studi',
314
- content
315
- )
316
-
317
- def extract_program_details(self, response):
318
- """Enhanced program details extraction with better degree detection"""
319
- details = {} # Initialize details as empty dict
320
-
321
- # Improved degree detection from multiple sources
322
- degree_sources = [
323
- response.css('title::text').get(),
324
- response.css('h1::text').get(),
325
- ' '.join(response.css('.breadcrumb ::text').getall())
326
- ]
327
-
328
- degree_pattern = re.compile(
329
- r'\b(D[1-4]|S[1-3]|Diploma|Sarjana|Magister|Profesi|Spesialis|Terapan)\b',
330
- re.IGNORECASE
331
- )
332
-
333
- for text in degree_sources:
334
- if text and (match := degree_pattern.search(text)):
335
- details['degree'] = match.group(1).upper()
336
- break
337
-
338
- # Extract accreditation status
339
- accreditation = response.xpath(
340
- '//span[contains(translate(., "ABCDE", "abcde"), "akreditasi")]'
341
- '/following-sibling::span/text()'
342
- ).get()
343
-
344
- if accreditation:
345
- details['accreditation'] = self.clean_text(accreditation)
346
-
347
- # Extract description from the first paragraph
348
- first_paragraph = response.css('p::text').get()
349
- if first_paragraph:
350
- details['description'] = self.clean_text(first_paragraph)
351
-
352
- return details
353
-
354
- def parse_vision_mission(self, response):
355
- """Special handler for vision & mission pages"""
356
- meta = response.meta
357
- self.visited_urls.add(response.url)
358
- department = meta['department']
359
-
360
- vision_text = ""
361
- mission_text = ""
362
-
363
- # Look for vision section
364
- vision_selectors = [
365
- 'h2:contains("Visi") + p', 'h3:contains("Visi") + p',
366
- 'h4:contains("Visi") + p', '.visi p', '#visi p',
367
- 'h2:contains("Vision") + p', 'h3:contains("Vision") + p',
368
- 'strong:contains("Visi") + p', 'b:contains("Visi") + p'
369
- ]
370
-
371
- for selector in vision_selectors:
372
- try:
373
- vision = response.css(selector).get()
374
- if vision:
375
- vision_text = self.clean_text(scrapy.Selector(text=vision).css('::text').get(''))
376
- if vision_text:
377
- break
378
- except:
379
- continue
380
-
381
- # If still not found, try looking for paragraphs after headings
382
- if not vision_text:
383
- for heading in response.css('h1, h2, h3, h4, h5, h6'):
384
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
385
- if heading_text and ('visi' in heading_text.lower() or 'vision' in heading_text.lower()):
386
- # Try to get the next paragraph
387
- next_p = heading.xpath('following-sibling::p[1]')
388
- if next_p:
389
- vision_text = self.clean_text(' '.join(next_p.css('::text').getall()))
390
- break
391
-
392
- # Look for mission section using similar approach
393
- mission_selectors = [
394
- 'h2:contains("Misi") + p', 'h3:contains("Misi") + p',
395
- 'h4:contains("Misi") + p', '.misi p', '#misi p',
396
- 'h2:contains("Mission") + p', 'h3:contains("Mission") + p',
397
- 'strong:contains("Misi") + p', 'b:contains("Misi") + p'
398
- ]
399
-
400
- for selector in mission_selectors:
401
- try:
402
- mission = response.css(selector).get()
403
- if mission:
404
- mission_text = self.clean_text(scrapy.Selector(text=mission).css('::text').get(''))
405
- if mission_text:
406
- break
407
- except:
408
- continue
409
-
410
- # If still not found, try looking for paragraphs after headings
411
- if not mission_text:
412
- for heading in response.css('h1, h2, h3, h4, h5, h6'):
413
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
414
- if heading_text and ('misi' in heading_text.lower() or 'mission' in heading_text.lower()):
415
- # Try to get the next paragraph
416
- next_p = heading.xpath('following-sibling::p[1]')
417
- if next_p:
418
- mission_text = self.clean_text(' '.join(next_p.css('::text').getall()))
419
- break
420
-
421
- # Try to find mission list items
422
- mission_list_items = []
423
- for list_selector in ['h2:contains("Misi") ~ ul li', 'h3:contains("Misi") ~ ul li',
424
- 'h4:contains("Misi") ~ ul li', '.misi ul li', '#misi ul li',
425
- 'h2:contains("Mission") ~ ul li', 'h3:contains("Mission") ~ ul li']:
426
- try:
427
- items = response.css(f'{list_selector}::text').getall()
428
- if items:
429
- mission_list_items = [self.clean_text(item) for item in items if self.clean_text(item)]
430
- if mission_list_items:
431
- break
432
- except:
433
- continue
434
-
435
- # Store vision and mission in department info
436
- if vision_text or mission_text or mission_list_items:
437
- if vision_text:
438
- self.department_info[department]['vision'] = vision_text
439
- if mission_text:
440
- self.department_info[department]['mission'] = mission_text
441
- if mission_list_items:
442
- self.department_info[department]['mission_items'] = mission_list_items
443
-
444
- # Save as separate file for vision-mission
445
- self.save_vision_mission(
446
- department,
447
- meta['domain'],
448
- vision_text,
449
- mission_text,
450
- mission_list_items,
451
- response.url
452
- )
453
-
454
- # Also save as a regular page
455
- content = self.extract_content(response)
456
- if content:
457
- self.save_page_content(
458
- response.url,
459
- meta['page_title'],
460
- department,
461
- meta['domain'],
462
- 'Profil',
463
- content
464
- )
465
-
466
- def extract_content(self, response):
467
- """Extract content from a page in a structured format"""
468
- content = {"paragraphs": [], "tables": [], "files": []}
469
-
470
- # First try to find the main content areas
471
- content_selectors = [
472
- 'div.entry-content', 'article.post', 'main.site-main',
473
- 'div.content', 'div.main-content', 'div#content', 'div.page-content',
474
- 'article', '.post-content', '.entry-content', '.content',
475
- '.page-content', 'main', '#content', '.main-content',
476
- '.article-content', '.single-content'
477
- ]
478
-
479
- main_content = None
480
- for selector in content_selectors:
481
- elements = response.css(selector)
482
- if elements:
483
- main_content = elements
484
- break
485
-
486
- # If no primary content found, use body
487
- if not main_content:
488
- main_content = response.css('body')
489
-
490
- # Extract headings and paragraphs
491
- for heading in main_content.css('h1, h2, h3, h4, h5, h6'):
492
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
493
- if heading_text and len(heading_text) > 3:
494
- heading_tag = heading.root.tag
495
- content["paragraphs"].append(f"[{heading_tag.upper()}] {heading_text}")
496
-
497
- # Extract paragraphs
498
- for p in main_content.css('p'):
499
- text = self.clean_text(' '.join(p.css('::text').getall()))
500
- if text and len(text) > 10: # Reduced minimum meaningful length
501
- # Add any links found in this paragraph
502
- links = []
503
- for a in p.css('a'):
504
- link_text = self.clean_text(' '.join(a.css('::text').getall()))
505
- link_url = a.css('::attr(href)').get()
506
- if link_text and link_url:
507
- links.append(f"{link_text} (Link: {response.urljoin(link_url)})")
508
-
509
- paragraph = text
510
- if links:
511
- paragraph += f" | Links: {'; '.join(links)}"
512
-
513
- content["paragraphs"].append(paragraph)
514
-
515
- # Extract list items
516
- for li in main_content.css('li'):
517
- text = self.clean_text(' '.join(li.css('::text').getall()))
518
- if text and len(text) > 10:
519
- content["paragraphs"].append(f"• {text}")
520
-
521
- # If no structured text elements found, try general text extraction
522
- if not content["paragraphs"]:
523
- # Get all text nodes within divs but not within scripts or styles
524
- for div in main_content.css('div'):
525
- text = self.clean_text(' '.join(div.xpath('./text()').getall()))
526
- if text and len(text) > 30:
527
- content["paragraphs"].append(text)
528
-
529
- # Extract tables
530
- for table in main_content.css('table'):
531
- rows = []
532
-
533
- # Get header if it exists
534
- headers = []
535
- for th in table.css('thead th, tr th'):
536
- header_text = self.clean_text(' '.join(th.css('::text').getall()))
537
- if header_text:
538
- headers.append(header_text)
539
-
540
- if headers:
541
- rows.append(" - ".join(headers))
542
-
543
- # Get table body rows
544
- for tr in table.css('tbody tr, tr'):
545
- if tr.css('th') and not tr.css('td'):
546
- continue # Skip header rows already processed
547
-
548
- cells = []
549
- for td in tr.css('td'):
550
- cell_text = self.clean_text(' '.join(td.css('::text').getall()))
551
- link = td.css('a::attr(href)').get()
552
- if link:
553
- cell_text += f" (Link: {response.urljoin(link)})"
554
- if cell_text:
555
- cells.append(cell_text)
556
- else:
557
- cells.append(" ") # Empty cell placeholder
558
-
559
- if cells:
560
- rows.append(" - ".join(cells))
561
-
562
- if len(rows) > 1: # Only add if we have meaningful table
563
- content["tables"].append("\n".join(rows))
564
-
565
- # Extract downloads and files
566
- for link in main_content.css('a[href]'):
567
- href = link.css('::attr(href)').get()
568
- if not href:
569
- continue
570
-
571
- link_text = self.clean_text(' '.join(link.css('::text').getall()))
572
- if not link_text:
573
- link_text = "Unduhan"
574
-
575
- # Match common document formats
576
- if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
577
- # Extract file extension for better categorization
578
- file_ext = href.split('.')[-1].lower()
579
- content["files"].append({
580
- "title": link_text,
581
- "url": urljoin(response.url, href),
582
- "type": file_ext
583
- })
584
-
585
- return content if any(value for value in content.values()) else None
586
-
587
- def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
588
- """Save a page's content as a formatted text file"""
589
- if not content or not title:
590
- return
591
-
592
- # Clean up title for filename
593
- safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
594
- safe_title = re.sub(r'[-\s]+', '-', safe_title)[:50] # Limit filename length
595
-
596
- # Prepare the content
597
- formatted_content = f"""# {title}
598
-
599
- URL: {url}
600
- Tanggal: {datetime.now().strftime('%d %B %Y')}
601
- Jurusan: {department}
602
- Kategori: {category}
603
- """
604
-
605
- if menu_path:
606
- formatted_content += f"Navigasi: {menu_path}\n"
607
-
608
- formatted_content += "\n## Konten\n\n"
609
- if content["paragraphs"]:
610
- formatted_content += "\n".join(content["paragraphs"])
611
-
612
- if content["tables"]:
613
- formatted_content += "\n\n## Tabel Data\n\n"
614
- for i, table in enumerate(content["tables"]):
615
- formatted_content += f"### Tabel {i+1}\n{table}\n\n"
616
-
617
- if content["files"]:
618
- formatted_content += "\n\n## Berkas\n\n"
619
- for file in content["files"]:
620
- formatted_content += f"- {file['title']} [{file['type']}]: {file['url']}\n"
621
-
622
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
623
- # Generate filename with department prefix
624
- filename = f"{department}_{safe_title}_{timestamp}.txt"
625
-
626
- # Upload file to Supabase
627
- try:
628
- upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
629
- path=filename,
630
- file=formatted_content.encode('utf-8'),
631
- file_options={"content-type": "text/plain", "x-upsert": "true"}
632
- )
633
-
634
- self.logger.info(f"Successfully uploaded {filename}")
635
-
636
- # Store in our collection for later summaries
637
- self.department_data[department][category].append({
638
- 'title': title,
639
- 'url': url,
640
- 'filename': filename
641
- })
642
-
643
- except Exception as e:
644
- self.logger.error(f"Upload failed for {filename}: {str(e)}")
645
-
646
- def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
647
- """Save vision & mission as a separate well-formatted file"""
648
- filename = f"{department}_Visi_Misi.txt"
649
-
650
- content = f"""# Visi dan Misi {department}
651
-
652
- URL: {url}
653
- Tanggal: {datetime.now().strftime('%d %B %Y')}
654
- Jurusan: {department}
655
-
656
- """
657
-
658
- if vision:
659
- content += f"## Visi\n\n{vision}\n\n"
660
-
661
- if mission:
662
- content += f"## Misi\n\n{mission}\n\n"
663
-
664
- if mission_items:
665
- if not mission: # Only add header if not already added
666
- content += "## Misi\n\n"
667
- for i, item in enumerate(mission_items, 1):
668
- content += f"{i}. {item}\n"
669
-
670
- try:
671
- # Remove existing file if it exists
672
- try:
673
- self.supabase.storage.from_(self.storage_bucket).remove(filename)
674
- except:
675
- pass
676
-
677
- upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
678
- path=filename,
679
- file=content.encode('utf-8'),
680
- file_options={"content-type": "text/plain", "x-upsert": "true"}
681
- )
682
-
683
- self.logger.info(f"Successfully uploaded {filename}")
684
- except Exception as e:
685
- self.logger.error(f"Upload failed for {filename}: {str(e)}")
686
-
687
- def clean_text(self, text):
688
- """Clean and normalize text"""
689
- if not text:
690
- return ""
691
-
692
- # Normalize unicode characters
693
- text = unicodedata.normalize('NFKC', text)
694
-
695
- # Replace multiple spaces with single space
696
- text = re.sub(r'\s+', ' ', text)
697
-
698
- # Remove special characters and non-printable characters
699
- text = re.sub(r'[^\x20-\x7E\s\u00A0-\u00FF\u0100-\u017F]', '', text)
700
-
701
- # Remove multiple periods
702
- text = re.sub(r'\.{2,}', ' ', text)
703
-
704
- return text.strip()
705
-
706
- def determine_category(self, menu_text):
707
- """Determine content category based on menu text"""
708
- menu_lower = menu_text.lower()
709
-
710
- # Define category mappings
711
- categories = {
712
- 'Beranda': ['beranda', 'home', 'utama', 'main', 'index'],
713
- 'Profil': ['profil', 'profile', 'tentang', 'about', 'visi', 'misi', 'sejarah', 'history', 'struktur', 'organisasi', 'pimpinan', 'sambutan'],
714
- 'Program_Studi': ['program', 'studi', 'prodi', 'd3', 'd4', 'diploma', 'sarjana', 'akademik', 'jurusan', 'kurikulum'],
715
- 'Dosen': ['dosen', 'staff', 'tenaga', 'pengajar', 'lecturer', 'faculty'],
716
- 'Penelitian': ['penelitian', 'research', 'jurnal', 'karya', 'ilmiah', 'publikasi', 'paper'],
717
- 'Mahasiswa': ['mahasiswa', 'student', 'alumni', 'lulusan', 'graduate', 'kegiatan', 'activity', 'kemahasiswaan'],
718
- 'Fasilitas': ['fasilitas', 'facility', 'lab', 'laboratorium', 'gedung', 'building', 'sarana', 'prasarana'],
719
- 'Informasi': ['informasi', 'info', 'pengumuman', 'announcement', 'agenda', 'berita', 'news', 'event'],
720
- 'Kerjasama': ['kerjasama', 'cooperation', 'mitra', 'partner', 'industri', 'industry', 'collaboration'],
721
- 'Dokumen': ['dokumen', 'document', 'unduhan', 'download', 'berkas', 'file']
722
- }
723
-
724
- # Check each category
725
- for category, terms in categories.items():
726
- if any(term in menu_lower for term in terms):
727
- return category
728
-
729
- # Default category if no match
730
- return 'Lainnya'
731
-
732
- def is_social_media_link(self, url):
733
- social_patterns = [
734
- 'facebook.com', 'twitter.com', 'instagram.com',
735
- 'youtube.com', 'linkedin.com', 'pinterest.com',
736
- 'tiktok.com', 'wa.me', 'whatsapp.com', 't.me'
737
- ]
738
- return any(pattern in url.lower() for pattern in social_patterns)
739
-
740
- def is_unwanted_url(self, url):
741
- """Check if URL should be skipped"""
742
- # Skip certain file types
743
- if re.search(r'\.(jpg|jpeg|png|gif|svg|ico|css|js)$', url.lower()):
744
- return True
745
-
746
- # Skip certain URL patterns
747
- unwanted_patterns = [
748
- 'login', 'logout', 'signin', 'signup', 'register', 'admin',
749
- 'wp-', '/wp/', 'wordpress', 'comment', 'feed', 'rss', 'atom',
750
- 'javascript:', 'mailto:', 'tel:', 'page/', '/tag/', '/author/',
751
- '/archive/', '/category/', '/search', 'kalender', '/ajax/', '/api/'
752
- ]
753
-
754
- return any(pattern in url.lower() for pattern in unwanted_patterns)
755
-
756
- def handle_error(self, failure):
757
- """Handle request errors"""
758
- url = failure.request.url
759
- self.visited_urls.add(url) # Mark as visited to prevent retries
760
- self.logger.error(f"Request failed: {url} - {str(failure.value)}")
761
-
762
- def closed(self, reason):
763
- """Finalize processing when spider is closed"""
764
- self.logger.info("Spider closed. Generating summary report...")
765
-
766
- # Log statistics
767
- departments_count = len(self.department_data)
768
- pages_count = sum(len(cat_data) for dept_data in self.department_data.values()
769
- for cat_data in dept_data.values())
770
-
771
- self.logger.info(f"Crawled {departments_count} departments and {pages_count} pages")
772
- self.logger.info(f"Found {len(self.study_programs)} departments with programs")
773
-
774
- for dept, programs in self.study_programs.items():
775
- self.logger.info(f"{dept}: {len(programs)} programs")
776
-
777
- # Generate and upload the summary file
778
- self.generate_summary_file()
779
-
780
- def generate_summary_file(self):
781
- """Generate comprehensive summary with program metadata"""
782
- content = """# Daftar Lengkap Jurusan dan Program Studi Politeknik Negeri Padang\n\n"""
783
- content += f"**Terakhir diperbarui**: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"
784
-
785
- # Create reverse mapping from department name to domain
786
- reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
787
-
788
- for department, programs in self.study_programs.items():
789
- # Get domain from reverse mapping
790
- domain = reverse_departments.get(department, '')
791
- website_url = f'https://{domain}' if domain else 'URL tidak ditemukan'
792
-
793
- content += f"## {department.replace('_', ' ')}\n"
794
- content += f"**Website**: {website_url}\n\n"
795
-
796
- if programs:
797
- for prog in programs:
798
- content += f"### {prog['title']}\n"
799
- content += f"- **Jenjang**: {prog['details'].get('degree', 'N/A')}\n"
800
- content += f"- **Akreditasi**: {prog['details'].get('accreditation', 'N/A')}\n"
801
- content += f"- **URL**: {prog['url']}\n"
802
-
803
- if 'description' in prog['details']:
804
- desc = prog['details']['description']
805
- content += f"\n**Deskripsi**:\n{desc}\n"
806
-
807
- content += "\n"
808
- else:
809
- content += "### Belum ada informasi program studi\n"
810
-
811
- content += "\n---\n\n"
812
-
813
- # Upload to Supabase
814
- filename = "Daftar_Jurusan_dan_Prodi_Politeknik_Negeri_Padang.txt"
815
- try:
816
- self.supabase.storage.from_(self.storage_bucket).remove(filename)
817
- self.supabase.storage.from_(self.storage_bucket).upload(
818
  path=filename,
819
- file=content.encode('utf-8'),
820
- file_options={"content-type": "text/plain", "x-upsert": "true"}
821
  )
822
- self.logger.info("Ringkasan jurusan berhasil diunggah")
823
- except Exception as e:
824
- self.logger.error(f"Gagal mengupload file ringkasan: {str(e)}")
825
-
826
 
827
- # Main execution
828
  if __name__ == "__main__":
829
- process = CrawlerProcess()
830
- process.crawl(PNPDepartmentSpider)
831
- process.start()
 
1
  import scrapy
2
  from scrapy.crawler import CrawlerProcess
3
+ from bs4 import BeautifulSoup
4
  from dotenv import load_dotenv
5
+ from supabase import create_client
6
+ from datetime import datetime
7
+ import os, re
8
+
9
+ # Load .env.production
10
+ load_dotenv(".env.production")
11
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
12
+ SUPABASE_KEY = os.getenv("SUPABASE_KEY")
13
+
14
+ supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
15
+
16
+ JURUSAN_URLS = {
17
+ 'akt.pnp.ac.id': 'Akuntansi',
18
+ 'an.pnp.ac.id': 'Administrasi_Niaga',
19
+ 'bing.pnp.ac.id': 'Bahasa_Inggris',
20
+ 'elektro.pnp.ac.id': 'Teknik_Elektro',
21
+ 'me.pnp.ac.id': 'Teknik_Mesin',
22
+ 'sipil.pnp.ac.id': 'Teknik_Sipil',
23
+ 'ti.pnp.ac.id': 'Teknologi_Informasi',
24
+ }
25
+
26
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
27
+ PRODI_LIST = []
28
+ TOTAL_PRODI = 0
29
+
30
+ class JurusanSpider(scrapy.Spider):
31
+ name = "jurusan"
32
+ start_urls = [f"https://{url}/" for url in JURUSAN_URLS.keys()]
33
+
34
+ def parse(self, response):
35
+ jurusan_domain = response.url.split("//")[1].split("/")[0]
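+ # e.g. "https://akt.pnp.ac.id/" -> "akt.pnp.ac.id"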
36
+ jurusan_nama = JURUSAN_URLS[jurusan_domain]
37
+ url = response.url
38
+
39
+ # Extract all visible text
40
+ soup = BeautifulSoup(response.text, "html.parser")
41
+ for tag in soup(["script", "style", "noscript"]):
42
+ tag.decompose()
43
+ visible_text = soup.get_text(separator="\n")
44
+
45
+ # Clean text
46
+ lines = [line.strip() for line in visible_text.splitlines()]
47
+ lines = [line for line in lines if line and not re.match(r'^\W+$', line)]
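+ # keep only non-empty lines that contain at least one word character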
48
+ text_cleaned = "\n".join(lines)
49
+
50
+ # Extract Program Studi menu
51
+ program_studi = []
52
+ menu_elements = soup.find_all("a", string=re.compile("program studi", re.I))
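+ # each "Program Studi" anchor is expected to be followed by a <ul> submenu listing the programs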
53
+ for menu in menu_elements:
54
+ ul = menu.find_next("ul")
55
+ if ul:
56
+ lis = ul.find_all("li")
57
+ for li in lis:
58
+ item = li.get_text(strip=True)
59
+ if item and item not in program_studi:
60
+ program_studi.append(item)
61
+
62
+ # Store the results globally for the recap file
63
+ PRODI_LIST.append((jurusan_nama, program_studi))
64
+ global TOTAL_PRODI
65
+ TOTAL_PRODI += len(program_studi)
66
+
67
+ # Build the output filename
68
+ filename = f"{jurusan_nama.upper()}_{timestamp}.txt"
69
+ with open(filename, "w", encoding="utf-8") as f:
70
+ f.write(f"JURUSAN: {jurusan_nama.replace('_', ' ').upper()}\n")
71
+ f.write(f"URL: {url}\n\n")
72
+
73
+ f.write("== PROGRAM STUDI ==\n")
74
+ for i, item in enumerate(program_studi, 1):
75
+ f.write(f"{i}. {item}\n")
76
+ f.write(f"\nTotal Program Studi: {len(program_studi)}\n\n")
77
+
78
+ f.write("== PROFIL ==\n")
79
+ f.write(text_cleaned.strip()[:8000])  # Cap the content so the file does not grow too large
80
+ f.write("\n\n")
81
+
82
+ print(f"📄 Saved {filename}")
83
+
84
+ # Upload to Supabase
85
+ with open(filename, "rb") as f_upload:
86
+ supabase.storage.from_("documents").upload(
87
+ file=f_upload,
88
  path=filename,
89
+ file_options={"content-type": "text/plain", "x-upsert": "true"},
90
+ # upsert is requested via the "x-upsert" header; upload() takes no upsert keyword
91
  )
92
+ print(f" Uploaded to Supabase: {filename}")
93
+
94
+ def run_spider():
95
+ process = CrawlerProcess(settings={
96
+ "LOG_LEVEL": "ERROR",
97
+ "USER_AGENT": "Mozilla/5.0",
98
+ })
99
+ process.crawl(JurusanSpider)
100
+ process.start()
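+ # process.start() blocks until the crawl finishes, so the recap below is written only after all spiders are done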
101
+
102
+ # Save a recap of study programs across all departments
103
+ rekap_file = f"REKAP_PROGRAM_STUDI_{timestamp}.txt"
104
+ with open(rekap_file, "w", encoding="utf-8") as f:
105
+ total = 0
106
+ for jurusan, daftar in PRODI_LIST:
107
+ f.write(f"{jurusan.replace('_', ' ')}:\n")
108
+ for p in daftar:
109
+ f.write(f"- {p}\n")
110
+ f.write(f"Jumlah: {len(daftar)}\n\n")
111
+ total += len(daftar)
112
+ f.write(f"TOTAL PROGRAM STUDI: {total}\n")
113
+
114
+ # Upload the recap
115
+ with open(rekap_file, "rb") as f_rekap:
116
+ supabase.storage.from_("documents").upload(
117
+ file=f_rekap,
118
+ path=rekap_file,
119
+ file_options={"content-type": "text/plain", "x-upsert": "true"},
120
+ # upsert via the "x-upsert" header, as above
121
+ )
122
+ print(f"✅ Uploaded REKAP: {rekap_file}")
123
 
 
124
  if __name__ == "__main__":
125
+ run_spider()