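"""
Scrapy spider that crawls the PNP department (jurusan) sites (*.pnp.ac.id),
extracts study-program (prodi), lecturer, and leadership information, and
uploads the collected text files to Supabase Storage.

Requires NEXT_PUBLIC_SUPABASE_URL and SUPABASE_SERVICE_KEY in the environment
(or a .env file); NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET is optional and defaults
to "pnp-bot-storage".
"""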
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client
from datetime import datetime
import os, re, tempfile
import logging

load_dotenv()

SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")


def is_valid_prodi(nama):
    """Validate if a string represents a valid study program name."""
    if not nama or len(nama.strip()) < 3:
        return False
    pattern = r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b'
    return bool(re.match(pattern, nama.strip(), re.I))
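
# Illustrative behaviour (examples are hypothetical, not taken from the crawled
# sites): is_valid_prodi("D3 Teknik Sipil") and
# is_valid_prodi("Sarjana Terapan Akuntansi") return True, while
# is_valid_prodi("Beranda") returns False.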

class JurusanSpider(scrapy.Spider):
    name = "jurusan"

    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'USER_AGENT': 'PNPBot/1.0',
        'ROBOTSTXT_OBEY': True,
        'LOG_LEVEL': 'INFO',
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_TIMEOUT': 100,
        'RETRY_TIMES': 3,
        'DEPTH_LIMIT': 3,  # Prevent infinite crawling
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
    }

    domain_to_name = {
        'akt.pnp.ac.id': 'Akuntansi',
        'an.pnp.ac.id': 'Administrasi_Niaga',
        'bing.pnp.ac.id': 'Bahasa_Inggris',
        'elektro.pnp.ac.id': 'Teknik_Elektro',
        'me.pnp.ac.id': 'Teknik_Mesin',
        'sipil.pnp.ac.id': 'Teknik_Sipil',
        'ti.pnp.ac.id': 'Teknologi_Informasi',
    }

    start_urls = [f"https://{d}/" for d in domain_to_name]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Validate environment variables
        if not all([SUPABASE_URL, SUPABASE_KEY]):
            raise ValueError(
                "Missing required environment variables: "
                "NEXT_PUBLIC_SUPABASE_URL, SUPABASE_SERVICE_KEY"
            )
        try:
            self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
        except Exception as e:
            self.logger.error(f"Failed to initialize Supabase client: {e}")
            raise
        self.bucket = SUPABASE_BUCKET
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        self.per_jurusan_pages = {}
        self.rekap_prodi = {}
        self.processed_urls = set()  # Track processed URLs
    def parse(self, response):
        """Parse main department pages"""
        if response.status != 200:
            self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
            return

        domain = response.url.split("//")[1].split("/")[0]
        jurusan = self.domain_to_name.get(domain, domain)

        try:
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            self.logger.error(f"Failed to parse HTML from {response.url}: {e}")
            return

        program_studi = []

        # Extract study programs
        for a_tag in soup.find_all("a"):
            try:
                item = a_tag.get_text(strip=True)
                href = a_tag.get("href")
                if item and is_valid_prodi(item) and item not in program_studi:
                    program_studi.append(item)
                    if href:
                        prodi_url = response.urljoin(href)
                        if prodi_url not in self.processed_urls:
                            self.processed_urls.add(prodi_url)
                            self.logger.info(f"[🧩] Found prodi: {item} ({prodi_url}) in {jurusan}")
                            yield scrapy.Request(
                                prodi_url,
                                callback=self.parse_detail,
                                meta={"jurusan": jurusan, "url": prodi_url},
                                dont_filter=False
                            )
            except Exception as e:
                self.logger.warning(f"Error processing link in {response.url}: {e}")
                continue

        # Store initial results
        self.rekap_prodi[jurusan] = program_studi

        # Follow internal links with better filtering
        for a in soup.find_all("a", href=True):
            try:
                href = a["href"]
                full_url = None
                if href.startswith("http") and domain in href:
                    full_url = href
                elif href.startswith("/"):
                    full_url = response.urljoin(href)
                if full_url and full_url not in self.processed_urls:
                    # Skip certain file types and external links
                    if any(ext in full_url.lower() for ext in ['.pdf', '.doc', '.xls', '.ppt', '.jpg', '.png', '.gif']):
                        continue
                    self.processed_urls.add(full_url)
                    yield scrapy.Request(
                        full_url,
                        callback=self.parse_detail,
                        meta={"jurusan": jurusan, "url": full_url}
                    )
            except Exception as e:
                self.logger.warning(f"Error processing internal link: {e}")
                continue
    def clean_html(self, soup):
        """Clean HTML content by removing unwanted elements"""
        # Remove unwanted elements
        for selector in [
            'header', 'footer', 'nav', 'aside', 'menu',
            '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
            '.breadcrumbs', '.pagination', '.navigation',
            'script', 'style', 'noscript', 'iframe',
            '.social-links', '.share-buttons', '.newsletter',
            '.ad-container', '.ads', '.advert'
        ]:
            for tag in soup.select(selector):
                tag.decompose()

        # Remove empty containers
        for element in soup.find_all(True):
            if not element.get_text(strip=True) and not element.find_all(True):
                element.decompose()
    def parse_detail(self, response):
        """Parse detailed pages"""
        if response.status != 200:
            return

        jurusan = response.meta["jurusan"]
        url = response.meta["url"]

        try:
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            self.logger.error(f"Failed to parse HTML from {url}: {e}")
            return

        self.clean_html(soup)

        title_tag = soup.find("title") or soup.find("h1")
        page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"

        # Handle specific TI pages
        if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
            content_text = self.parse_ti_dosen_page(soup, url)
        elif url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
            content_text = self.parse_ti_leadership_page(soup, url)
        else:
            content_text = self.parse_general_page(soup, url, jurusan, page_title)

        if content_text:
            self.per_jurusan_pages.setdefault(jurusan, []).append({
                "url": url,
                "title": page_title,
                "content": content_text
            })
    def parse_ti_dosen_page(self, soup, url):
        """Parse TI dosen page specifically"""
        dosen_list = []

        # Find names in gallery captions
        for nama_tag in soup.find_all("dd", class_="wp-caption-text"):
            nama = nama_tag.get_text(strip=True)
            if nama and nama not in dosen_list:
                dosen_list.append(nama)

        # Create narrative text
        naratif = ["## Daftar Dosen dan Staf Pengajar"]
        for nama in dosen_list:
            naratif.append(f"- {nama}")

        return f"""# Dosen dan Staf Pengajar Teknologi Informasi
URL: {url}
Jurusan: Teknologi Informasi
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
{chr(10).join(naratif)}"""
    def parse_ti_leadership_page(self, soup, url):
        """Parse TI leadership page specifically"""
        leadership_data = {
            "Pimpinan Jurusan": [],
            "Koordinator Program Studi": [],
            "Kepala Labor": []
        }

        # Extract all member items
        member_items = soup.find_all(class_="member-item")
        for member in member_items:
            try:
                name_tag = member.find(class_="item-title")
                name = name_tag.get_text(strip=True) if name_tag else "N/A"
                position_tag = member.find(class_="small-text")
                position = position_tag.get_text(strip=True) if position_tag else "N/A"

                # Categorize based on position
                if any(role in position for role in ["Ketua Jurusan", "Sekretaris Jurusan"]):
                    leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
                elif any(role in position for role in ["Koordinator Program Studi", "Koordinator PSDKU"]):
                    leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
                elif "Kepala Labor" in position:
                    leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})
            except Exception as e:
                self.logger.warning(f"Error parsing member item: {e}")
                continue

        # Generate narrative
        naratif = []
        for section, members in leadership_data.items():
            if members:
                naratif.append(f"\n## {section}")
                for member in members:
                    naratif.append(f"- {member['jabatan']}: {member['nama']}")

        return f"""# Pimpinan Jurusan Teknologi Informasi
URL: {url}
Jurusan: Teknologi Informasi
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
{chr(10).join(naratif)}"""
    def parse_general_page(self, soup, url, jurusan, page_title):
        """Parse general pages"""
        body_text = []
        for element in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
            txt = element.get_text(strip=True)
            if txt and len(txt) > 10:  # Filter out very short text
                body_text.append(txt)

        content_text = f"""# {page_title}
URL: {url}
Jurusan: {jurusan.replace('_', ' ')}
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
{chr(10).join(body_text)}"""

        # Extract faculty information from tables
        dosen_entries = self.extract_faculty_info(soup)
        if dosen_entries:
            content_text += f"\n\n## Daftar Dosen\n\n{chr(10).join(dosen_entries)}"

        # Add general tables
        tables_content = self.extract_tables(soup)
        if tables_content:
            content_text += f"\n\n## Tabel Data\n\n{tables_content}"

        return content_text
    def extract_faculty_info(self, soup):
        """Extract faculty information from tables and text"""
        dosen_entries = []

        # Extract from tables
        for table in soup.find_all("table"):
            try:
                headers = [th.get_text(strip=True).lower() for th in table.find_all("th")]
                if any(keyword in " ".join(headers) for keyword in ["dosen", "jabatan", "nip", "nama"]):
                    for row in table.find_all("tr")[1:]:
                        cols = row.find_all(["td", "th"])
                        if len(cols) >= 1:
                            nama_dosen = cols[0].get_text(strip=True)
                            jabatan = cols[1].get_text(strip=True) if len(cols) > 1 else "-"
                            if nama_dosen and len(nama_dosen) > 3:
                                dosen_entries.append(f"Nama: {nama_dosen} | Jabatan: {jabatan}")
            except Exception as e:
                self.logger.warning(f"Error extracting faculty from table: {e}")
                continue

        return list(set(dosen_entries))  # Remove duplicates
    def extract_tables(self, soup):
        """Extract table data"""
        tables_content = []
        for i, table in enumerate(soup.find_all("table")):
            try:
                table_data = [f"### Tabel {i+1}"]
                for row in table.find_all("tr"):
                    cols = row.find_all(["td", "th"])
                    if cols:
                        row_data = [col.get_text(strip=True) for col in cols]
                        table_data.append(" | ".join(row_data))
                if len(table_data) > 1:  # Only add if table has content
                    tables_content.extend(table_data)
                    tables_content.append("")  # Add spacing
            except Exception as e:
                self.logger.warning(f"Error extracting table {i}: {e}")
                continue

        return "\n".join(tables_content)
    def upload_to_supabase(self, filename, content, content_type="text/plain"):
        """Upload content to Supabase storage"""
        try:
            # Write to a temp file first, then upload after the file is closed
            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
                f.write(content)
                temp_path = f.name

            result = self.supabase.storage.from_(self.bucket).upload(
                path=filename,
                file=temp_path,
                file_options={"content-type": content_type}
            )
            self.logger.info(f"✅ Uploaded: {filename}")
            return True
        except Exception as e:
            self.logger.error(f"❌ Upload failed for {filename}: {e}")
            return False
        finally:
            if 'temp_path' in locals() and os.path.exists(temp_path):
                os.remove(temp_path)
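
    # Note: upload() here receives a local file path; depending on the installed
    # supabase-py version, passing the file's bytes (e.g. content.encode("utf-8"))
    # may be required instead. Adjust if the upload rejects a path argument.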
    def closed(self, reason):
        """Called when spider closes"""
        self.logger.info(f"Spider closed: {reason}")

        # Upload files per department
        for jurusan, pages in self.per_jurusan_pages.items():
            if not pages:
                continue
            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
            content = ""
            for page in pages:
                content += f"{page['content']}\n\n---\n\n"
            self.upload_to_supabase(filename, content)

        # Create and upload summary
        self.create_and_upload_summary()
    def create_and_upload_summary(self):
        """Create and upload program study summary"""
        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
        content_lines = [
            "# REKAP PROGRAM STUDI PNP",
            f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}",
            ""
        ]

        total_prodi = 0
        jumlah_jurusan = 0
        for jurusan, daftar in self.rekap_prodi.items():
            valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
            if not valid_prodi:
                continue

            jurusan_display = jurusan.replace("_", " ")
            content_lines.append(f"## {jurusan_display}:")
            for prodi in sorted(set(valid_prodi)):
                content_lines.append(f"- {prodi}")

            jumlah_prodi = len(set(valid_prodi))
            content_lines.append(f"Jumlah program studi: {jumlah_prodi}")
            content_lines.append("")

            total_prodi += jumlah_prodi
            jumlah_jurusan += 1

        content_lines.extend([
            f"**Total Jurusan di PNP: {jumlah_jurusan}**",
            f"**Total Program Studi di PNP: {total_prodi}**"
        ])

        content = "\n".join(content_lines)
        self.upload_to_supabase(rekap_filename, content)
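
    # The uploaded summary roughly follows this layout (placeholder values):
    #   # REKAP PROGRAM STUDI PNP
    #   Diperbarui pada: <tanggal>
    #
    #   ## <nama jurusan>:
    #   - <nama prodi>
    #   Jumlah program studi: <n>
    #
    #   **Total Jurusan di PNP: <n>**
    #   **Total Program Studi di PNP: <n>**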

if __name__ == "__main__":
    # Add logging configuration
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
    )

    try:
        process = CrawlerProcess()
        process.crawl(JurusanSpider)
        process.start()
    except Exception as e:
        logging.error(f"Failed to run spider: {e}")
        raise