import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client
from datetime import datetime
import os, re, tempfile
import logging

load_dotenv()

SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")


def is_valid_prodi(nama):
    """Validate if a string represents a valid study program name"""
    if not nama or len(nama.strip()) < 3:
        return False
    pattern = r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b'
    return bool(re.match(pattern, nama.strip(), re.I))


class JurusanSpider(scrapy.Spider):
    name = "jurusan"
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'USER_AGENT': 'PNPBot/1.0',
        'ROBOTSTXT_OBEY': True,
        'LOG_LEVEL': 'INFO',
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_TIMEOUT': 100,
        'RETRY_TIMES': 3,
        'DEPTH_LIMIT': 3,  # Prevent infinite crawling
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
    }

    domain_to_name = {
        'akt.pnp.ac.id': 'Akuntansi',
        'an.pnp.ac.id': 'Administrasi_Niaga',
        'bing.pnp.ac.id': 'Bahasa_Inggris',
        'elektro.pnp.ac.id': 'Teknik_Elektro',
        'me.pnp.ac.id': 'Teknik_Mesin',
        'sipil.pnp.ac.id': 'Teknik_Sipil',
        'ti.pnp.ac.id': 'Teknologi_Informasi',
    }

    start_urls = [f"https://{d}/" for d in domain_to_name.keys()]

    def __init__(self, *args, **kwargs):
        super(JurusanSpider, self).__init__(*args, **kwargs)

        # Validate environment variables
        if not all([SUPABASE_URL, SUPABASE_KEY]):
            raise ValueError("Missing required environment variables: SUPABASE_URL, SUPABASE_KEY")

        try:
            self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
        except Exception as e:
            self.logger.error(f"Failed to initialize Supabase client: {e}")
            raise

        self.bucket = SUPABASE_BUCKET
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        self.per_jurusan_pages = {}
        self.rekap_prodi = {}
        self.processed_urls = set()  # Track processed URLs

    def parse(self, response):
        """Parse main department pages"""
        if response.status != 200:
            self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
            return

        domain = response.url.split("//")[1].split("/")[0]
        jurusan = self.domain_to_name.get(domain, domain)

        try:
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            self.logger.error(f"Failed to parse HTML from {response.url}: {e}")
            return

        program_studi = []

        # Extract study programs
        for a_tag in soup.find_all("a"):
            try:
                item = a_tag.get_text(strip=True)
                href = a_tag.get("href")
                if item and is_valid_prodi(item) and item not in program_studi:
                    program_studi.append(item)
                    if href:
                        prodi_url = response.urljoin(href)
                        if prodi_url not in self.processed_urls:
                            self.processed_urls.add(prodi_url)
                            self.logger.info(f"[🧩] Found prodi: {item} ({prodi_url}) in {jurusan}")
                            yield scrapy.Request(
                                prodi_url,
                                callback=self.parse_detail,
                                meta={"jurusan": jurusan, "url": prodi_url},
                                dont_filter=False
                            )
            except Exception as e:
                self.logger.warning(f"Error processing link in {response.url}: {e}")
                continue

        # Store initial results
        self.rekap_prodi[jurusan] = program_studi

        # Follow internal links with better filtering
        for a in soup.find_all("a", href=True):
            try:
                href = a["href"]
                full_url = None
                if href.startswith("http") and domain in href:
                    full_url = href
                elif href.startswith("/"):
                    full_url = response.urljoin(href)

                if full_url and full_url not in self.processed_urls:
                    # Skip certain file types and external links
                    if any(ext in full_url.lower() for ext in ['.pdf', '.doc', '.xls', '.ppt', '.jpg', '.png', '.gif']):
                        continue
                    self.processed_urls.add(full_url)
                    yield scrapy.Request(
                        full_url,
                        callback=self.parse_detail,
                        meta={"jurusan": jurusan, "url": full_url}
                    )
            except Exception as e:
                self.logger.warning(f"Error processing internal link: {e}")
                continue

    def clean_html(self, soup):
        """Clean HTML content by removing unwanted elements"""
        # Remove unwanted elements
        for selector in [
            'header', 'footer', 'nav', 'aside', 'menu',
            '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
            '.breadcrumbs', '.pagination', '.navigation',
            'script', 'style', 'noscript', 'iframe',
            '.social-links', '.share-buttons', '.newsletter',
            '.ad-container', '.ads', '.advert'
        ]:
            for tag in soup.select(selector):
                tag.decompose()

        # Remove empty containers
        for element in soup.find_all(True):
            if not element.get_text(strip=True) and not element.find_all(True):
                element.decompose()

    def parse_detail(self, response):
        """Parse detailed pages"""
        if response.status != 200:
            return

        jurusan = response.meta["jurusan"]
        url = response.meta["url"]

        try:
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            self.logger.error(f"Failed to parse HTML from {url}: {e}")
            return

        self.clean_html(soup)

        title_tag = soup.find("title") or soup.find("h1")
        page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"

        # Handle specific TI pages
        if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
            content_text = self.parse_ti_dosen_page(soup, url)
        elif url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
            content_text = self.parse_ti_leadership_page(soup, url)
        else:
            content_text = self.parse_general_page(soup, url, jurusan, page_title)

        if content_text:
            self.per_jurusan_pages.setdefault(jurusan, []).append({
                "url": url,
                "title": page_title,
                "content": content_text
            })

    def parse_ti_dosen_page(self, soup, url):
        """Parse TI dosen page specifically"""
        dosen_list = []

        # Find names in gallery captions
        for nama_tag in soup.find_all("dd", class_="wp-caption-text"):
            nama = nama_tag.get_text(strip=True)
            if nama and nama not in dosen_list:
                dosen_list.append(nama)

        # Create narrative text
        naratif = ["## Daftar Dosen dan Staf Pengajar"]
        for nama in dosen_list:
            naratif.append(f"- {nama}")

        return f"""# Dosen dan Staf Pengajar Teknologi Informasi
URL: {url}
Jurusan: Teknologi Informasi
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}

{chr(10).join(naratif)}"""

    def parse_ti_leadership_page(self, soup, url):
        """Parse TI leadership page specifically"""
        leadership_data = {
            "Pimpinan Jurusan": [],
            "Koordinator Program Studi": [],
            "Kepala Labor": []
        }

        # Extract all member items
        member_items = soup.find_all(class_="member-item")
        for member in member_items:
            try:
                name_tag = member.find(class_="item-title")
                name = name_tag.get_text(strip=True) if name_tag else "N/A"

                position_tag = member.find(class_="small-text")
                position = position_tag.get_text(strip=True) if position_tag else "N/A"

                # Categorize based on position
                if any(role in position for role in ["Ketua Jurusan", "Sekretaris Jurusan"]):
                    leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
                elif any(role in position for role in ["Koordinator Program Studi", "Koordinator PSDKU"]):
                    leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
                elif "Kepala Labor" in position:
                    leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})
            except Exception as e:
                self.logger.warning(f"Error parsing member item: {e}")
                continue

        # Generate narrative
        naratif = []
        for section, members in leadership_data.items():
            if members:
                naratif.append(f"\n## {section}")
                for member in members:
                    naratif.append(f"- {member['jabatan']}: {member['nama']}")

        return f"""# Pimpinan Jurusan Teknologi Informasi
URL: {url}
Jurusan: Teknologi Informasi
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}

{chr(10).join(naratif)}"""

    def parse_general_page(self, soup, url, jurusan, page_title):
        """Parse general pages"""
        body_text = []
        for element in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
            txt = element.get_text(strip=True)
            if txt and len(txt) > 10:  # Filter out very short text
                body_text.append(txt)

        content_text = f"""# {page_title}
URL: {url}
Jurusan: {jurusan.replace('_', ' ')}
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}

{chr(10).join(body_text)}"""

        # Extract faculty information from tables
        dosen_entries = self.extract_faculty_info(soup)
        if dosen_entries:
            content_text += f"\n\n## Daftar Dosen\n\n{chr(10).join(dosen_entries)}"

        # Add general tables
        tables_content = self.extract_tables(soup)
        if tables_content:
            content_text += f"\n\n## Tabel Data\n\n{tables_content}"

        return content_text

    def extract_faculty_info(self, soup):
        """Extract faculty information from tables and text"""
        dosen_entries = []

        # Extract from tables
        for table in soup.find_all("table"):
            try:
                headers = [th.get_text(strip=True).lower() for th in table.find_all("th")]
                if any(keyword in " ".join(headers) for keyword in ["dosen", "jabatan", "nip", "nama"]):
                    for row in table.find_all("tr")[1:]:
                        cols = row.find_all(["td", "th"])
                        if len(cols) >= 1:
                            nama_dosen = cols[0].get_text(strip=True)
                            jabatan = cols[1].get_text(strip=True) if len(cols) > 1 else "-"
                            if nama_dosen and len(nama_dosen) > 3:
                                dosen_entries.append(f"Nama: {nama_dosen} | Jabatan: {jabatan}")
            except Exception as e:
                self.logger.warning(f"Error extracting faculty from table: {e}")
                continue

        return list(set(dosen_entries))  # Remove duplicates

    def extract_tables(self, soup):
        """Extract table data"""
        tables_content = []
        for i, table in enumerate(soup.find_all("table")):
            try:
                table_data = [f"### Tabel {i+1}"]
                for row in table.find_all("tr"):
                    cols = row.find_all(["td", "th"])
                    if cols:
                        row_data = [col.get_text(strip=True) for col in cols]
                        table_data.append(" | ".join(row_data))
                if len(table_data) > 1:  # Only add if table has content
                    tables_content.extend(table_data)
                    tables_content.append("")  # Add spacing
            except Exception as e:
                self.logger.warning(f"Error extracting table {i}: {e}")
                continue

        return "\n".join(tables_content)

    def upload_to_supabase(self, filename, content, content_type="text/plain"):
        """Upload content to Supabase storage"""
        try:
            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
                f.write(content)
                temp_path = f.name

            result = self.supabase.storage.from_(self.bucket).upload(
                path=filename,
                file=temp_path,
                file_options={"content-type": content_type}
            )
            self.logger.info(f"✅ Uploaded: {filename}")
            return True
        except Exception as e:
            self.logger.error(f"❌ Upload failed for {filename}: {e}")
            return False
        finally:
            if 'temp_path' in locals() and os.path.exists(temp_path):
                os.remove(temp_path)

    def closed(self, reason):
        """Called when spider closes"""
        self.logger.info(f"Spider closed: {reason}")

        # Upload files per department
        for jurusan, pages in self.per_jurusan_pages.items():
            if not pages:
                continue
            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
            content = ""
            for page in pages:
                content += f"{page['content']}\n\n---\n\n"
            self.upload_to_supabase(filename, content)

        # Create and upload summary
        self.create_and_upload_summary()

    def create_and_upload_summary(self):
        """Create and upload program study summary"""
        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
        content_lines = [
            f"# REKAP PROGRAM STUDI PNP",
            f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}",
            ""
        ]

        total_prodi = 0
        jumlah_jurusan = 0

        for jurusan, daftar in self.rekap_prodi.items():
            valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
            if not valid_prodi:
                continue

            jurusan_display = jurusan.replace("_", " ")
            content_lines.append(f"## {jurusan_display}:")
            for prodi in sorted(set(valid_prodi)):
                content_lines.append(f"- {prodi}")

            jumlah_prodi = len(set(valid_prodi))
            content_lines.append(f"Jumlah program studi: {jumlah_prodi}")
            content_lines.append("")

            total_prodi += jumlah_prodi
            jumlah_jurusan += 1

        content_lines.extend([
            f"**Total Jurusan di PNP: {jumlah_jurusan}**",
            f"**Total Program Studi di PNP: {total_prodi}**"
        ])

        content = "\n".join(content_lines)
        self.upload_to_supabase(rekap_filename, content)


if __name__ == "__main__":
    # Add logging configuration
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
    )

    try:
        process = CrawlerProcess()
        process.crawl(JurusanSpider)
        process.start()
    except Exception as e:
        logging.error(f"Failed to run spider: {e}")
        raise
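
# Usage note: instead of executing the __main__ block above, the spider can also be
# launched with the Scrapy CLI, assuming this file is saved as, e.g., jurusan_spider.py
# (the filename here is illustrative, not prescribed by the script):
#
#   scrapy runspider jurusan_spider.py -L INFO
#
# `scrapy runspider` runs a standalone spider file without requiring a full Scrapy
# project; the custom_settings defined above (download delay, robots.txt compliance,
# retries, depth limit) still apply.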