# pnp-chatbot-admin-v1/scrapping/jurusan_scrap.py
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client
from datetime import datetime
import os
import re
import tempfile
import logging
from urllib.parse import urlparse
load_dotenv()
SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")
def is_valid_prodi(nama):
"""Validate if a string represents a valid study program name"""
if not nama or len(nama.strip()) < 3:
return False
pattern = r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b'
return bool(re.match(pattern, nama.strip(), re.I))
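# Illustrative checks for is_valid_prodi (the program names below are made-up
# examples chosen to exercise the regex, not values taken from the PNP sites):
#   is_valid_prodi("D3 Teknik Listrik")          -> True   (matches "D[-\s]?[2-4]")
#   is_valid_prodi("Sarjana Terapan Akuntansi")  -> True
#   is_valid_prodi("D3")                         -> False  (fails the minimum-length check)
#   is_valid_prodi("Berita Kampus")              -> False  (no program-level prefix)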
class JurusanSpider(scrapy.Spider):
name = "jurusan"
custom_settings = {
'DOWNLOAD_DELAY': 2,
'USER_AGENT': 'PNPBot/1.0',
'ROBOTSTXT_OBEY': True,
'LOG_LEVEL': 'INFO',
'CONCURRENT_REQUESTS': 1,
'DOWNLOAD_TIMEOUT': 100,
'RETRY_TIMES': 3,
'DEPTH_LIMIT': 3, # Prevent infinite crawling
'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
}
domain_to_name = {
'akt.pnp.ac.id': 'Akuntansi',
'an.pnp.ac.id': 'Administrasi_Niaga',
'bing.pnp.ac.id': 'Bahasa_Inggris',
'elektro.pnp.ac.id': 'Teknik_Elektro',
'me.pnp.ac.id': 'Teknik_Mesin',
'sipil.pnp.ac.id': 'Teknik_Sipil',
'ti.pnp.ac.id': 'Teknologi_Informasi',
}
start_urls = [f"https://{d}/" for d in domain_to_name.keys()]
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
# Validate environment variables
if not all([SUPABASE_URL, SUPABASE_KEY]):
            raise ValueError("Missing required environment variables: NEXT_PUBLIC_SUPABASE_URL and/or SUPABASE_SERVICE_KEY")
try:
self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
except Exception as e:
self.logger.error(f"Failed to initialize Supabase client: {e}")
raise
self.bucket = SUPABASE_BUCKET
self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
self.per_jurusan_pages = {}
self.rekap_prodi = {}
self.processed_urls = set() # Track processed URLs
def parse(self, response):
"""Parse main department pages"""
if response.status != 200:
self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
return
        domain = urlparse(response.url).netloc
jurusan = self.domain_to_name.get(domain, domain)
try:
soup = BeautifulSoup(response.text, "html.parser")
except Exception as e:
self.logger.error(f"Failed to parse HTML from {response.url}: {e}")
return
program_studi = []
# Extract study programs
for a_tag in soup.find_all("a"):
try:
item = a_tag.get_text(strip=True)
href = a_tag.get("href")
if item and is_valid_prodi(item) and item not in program_studi:
program_studi.append(item)
if href:
prodi_url = response.urljoin(href)
if prodi_url not in self.processed_urls:
self.processed_urls.add(prodi_url)
self.logger.info(f"[🧩] Found prodi: {item} ({prodi_url}) in {jurusan}")
yield scrapy.Request(
prodi_url,
callback=self.parse_detail,
meta={"jurusan": jurusan, "url": prodi_url},
dont_filter=False
)
except Exception as e:
self.logger.warning(f"Error processing link in {response.url}: {e}")
continue
# Store initial results
self.rekap_prodi[jurusan] = program_studi
# Follow internal links with better filtering
for a in soup.find_all("a", href=True):
try:
href = a["href"]
full_url = None
                if href.startswith("http") and urlparse(href).netloc == domain:
                    full_url = href
                elif href.startswith("/"):
                    full_url = response.urljoin(href)
if full_url and full_url not in self.processed_urls:
# Skip certain file types and external links
if any(ext in full_url.lower() for ext in ['.pdf', '.doc', '.xls', '.ppt', '.jpg', '.png', '.gif']):
continue
self.processed_urls.add(full_url)
yield scrapy.Request(
full_url,
callback=self.parse_detail,
meta={"jurusan": jurusan, "url": full_url}
)
except Exception as e:
self.logger.warning(f"Error processing internal link: {e}")
continue
def clean_html(self, soup):
"""Clean HTML content by removing unwanted elements"""
# Remove unwanted elements
for selector in [
'header', 'footer', 'nav', 'aside', 'menu',
'.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
'.breadcrumbs', '.pagination', '.navigation',
'script', 'style', 'noscript', 'iframe',
'.social-links', '.share-buttons', '.newsletter',
'.ad-container', '.ads', '.advert'
]:
for tag in soup.select(selector):
tag.decompose()
# Remove empty containers
for element in soup.find_all(True):
if not element.get_text(strip=True) and not element.find_all(True):
element.decompose()
def parse_detail(self, response):
"""Parse detailed pages"""
if response.status != 200:
return
jurusan = response.meta["jurusan"]
url = response.meta["url"]
try:
soup = BeautifulSoup(response.text, "html.parser")
except Exception as e:
self.logger.error(f"Failed to parse HTML from {url}: {e}")
return
self.clean_html(soup)
title_tag = soup.find("title") or soup.find("h1")
page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"
# Handle specific TI pages
if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
content_text = self.parse_ti_dosen_page(soup, url)
elif url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
content_text = self.parse_ti_leadership_page(soup, url)
else:
content_text = self.parse_general_page(soup, url, jurusan, page_title)
if content_text:
self.per_jurusan_pages.setdefault(jurusan, []).append({
"url": url,
"title": page_title,
"content": content_text
})
def parse_ti_dosen_page(self, soup, url):
"""Parse TI dosen page specifically"""
dosen_list = []
# Find names in gallery captions
for nama_tag in soup.find_all("dd", class_="wp-caption-text"):
nama = nama_tag.get_text(strip=True)
if nama and nama not in dosen_list:
dosen_list.append(nama)
# Create narrative text
naratif = ["## Daftar Dosen dan Staf Pengajar"]
for nama in dosen_list:
naratif.append(f"- {nama}")
return f"""# Dosen dan Staf Pengajar Teknologi Informasi
URL: {url}
Jurusan: Teknologi Informasi
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
{chr(10).join(naratif)}"""
def parse_ti_leadership_page(self, soup, url):
"""Parse TI leadership page specifically"""
leadership_data = {
"Pimpinan Jurusan": [],
"Koordinator Program Studi": [],
"Kepala Labor": []
}
# Extract all member items
member_items = soup.find_all(class_="member-item")
for member in member_items:
try:
name_tag = member.find(class_="item-title")
name = name_tag.get_text(strip=True) if name_tag else "N/A"
position_tag = member.find(class_="small-text")
position = position_tag.get_text(strip=True) if position_tag else "N/A"
# Categorize based on position
if any(role in position for role in ["Ketua Jurusan", "Sekretaris Jurusan"]):
leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
elif any(role in position for role in ["Koordinator Program Studi", "Koordinator PSDKU"]):
leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
elif "Kepala Labor" in position:
leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})
except Exception as e:
self.logger.warning(f"Error parsing member item: {e}")
continue
# Generate narrative
naratif = []
for section, members in leadership_data.items():
if members:
naratif.append(f"\n## {section}")
for member in members:
naratif.append(f"- {member['jabatan']}: {member['nama']}")
return f"""# Pimpinan Jurusan Teknologi Informasi
URL: {url}
Jurusan: Teknologi Informasi
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
{chr(10).join(naratif)}"""
def parse_general_page(self, soup, url, jurusan, page_title):
"""Parse general pages"""
body_text = []
for element in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
txt = element.get_text(strip=True)
if txt and len(txt) > 10: # Filter out very short text
body_text.append(txt)
content_text = f"""# {page_title}
URL: {url}
Jurusan: {jurusan.replace('_', ' ')}
Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
{chr(10).join(body_text)}"""
# Extract faculty information from tables
dosen_entries = self.extract_faculty_info(soup)
if dosen_entries:
content_text += f"\n\n## Daftar Dosen\n\n{chr(10).join(dosen_entries)}"
# Add general tables
tables_content = self.extract_tables(soup)
if tables_content:
content_text += f"\n\n## Tabel Data\n\n{tables_content}"
return content_text
def extract_faculty_info(self, soup):
"""Extract faculty information from tables and text"""
dosen_entries = []
# Extract from tables
for table in soup.find_all("table"):
try:
headers = [th.get_text(strip=True).lower() for th in table.find_all("th")]
if any(keyword in " ".join(headers) for keyword in ["dosen", "jabatan", "nip", "nama"]):
for row in table.find_all("tr")[1:]:
cols = row.find_all(["td", "th"])
if len(cols) >= 1:
nama_dosen = cols[0].get_text(strip=True)
jabatan = cols[1].get_text(strip=True) if len(cols) > 1 else "-"
if nama_dosen and len(nama_dosen) > 3:
dosen_entries.append(f"Nama: {nama_dosen} | Jabatan: {jabatan}")
except Exception as e:
self.logger.warning(f"Error extracting faculty from table: {e}")
continue
        return list(dict.fromkeys(dosen_entries))  # Remove duplicates, preserve order
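    # Example of an entry produced by extract_faculty_info (hypothetical values):
    #   "Nama: Ir. Contoh Dosen, M.T. | Jabatan: Lektor Kepala"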
def extract_tables(self, soup):
"""Extract table data"""
tables_content = []
for i, table in enumerate(soup.find_all("table")):
try:
table_data = [f"### Tabel {i+1}"]
for row in table.find_all("tr"):
cols = row.find_all(["td", "th"])
if cols:
row_data = [col.get_text(strip=True) for col in cols]
table_data.append(" | ".join(row_data))
if len(table_data) > 1: # Only add if table has content
tables_content.extend(table_data)
tables_content.append("") # Add spacing
except Exception as e:
self.logger.warning(f"Error extracting table {i}: {e}")
continue
return "\n".join(tables_content)
def upload_to_supabase(self, filename, content, content_type="text/plain"):
"""Upload content to Supabase storage"""
try:
with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
f.write(content)
temp_path = f.name
result = self.supabase.storage.from_(self.bucket).upload(
path=filename,
file=temp_path,
file_options={"content-type": content_type}
)
self.logger.info(f"✅ Uploaded: {filename}")
return True
except Exception as e:
self.logger.error(f"❌ Upload failed for {filename}: {e}")
return False
finally:
if 'temp_path' in locals() and os.path.exists(temp_path):
os.remove(temp_path)
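    # A minimal alternative sketch for the upload above, assuming the installed
    # supabase-py / storage3 version accepts raw bytes for the `file` argument;
    # it would avoid the temporary file entirely:
    #
    #   self.supabase.storage.from_(self.bucket).upload(
    #       path=filename,
    #       file=content.encode("utf-8"),
    #       file_options={"content-type": content_type},
    #   )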
def closed(self, reason):
"""Called when spider closes"""
self.logger.info(f"Spider closed: {reason}")
# Upload files per department
for jurusan, pages in self.per_jurusan_pages.items():
if not pages:
continue
filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
content = ""
for page in pages:
content += f"{page['content']}\n\n---\n\n"
self.upload_to_supabase(filename, content)
# Create and upload summary
self.create_and_upload_summary()
def create_and_upload_summary(self):
"""Create and upload program study summary"""
rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
content_lines = [
f"# REKAP PROGRAM STUDI PNP",
f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}",
""
]
total_prodi = 0
jumlah_jurusan = 0
for jurusan, daftar in self.rekap_prodi.items():
valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
if not valid_prodi:
continue
jurusan_display = jurusan.replace("_", " ")
content_lines.append(f"## {jurusan_display}:")
for prodi in sorted(set(valid_prodi)):
content_lines.append(f"- {prodi}")
jumlah_prodi = len(set(valid_prodi))
content_lines.append(f"Jumlah program studi: {jumlah_prodi}")
content_lines.append("")
total_prodi += jumlah_prodi
jumlah_jurusan += 1
content_lines.extend([
f"**Total Jurusan di PNP: {jumlah_jurusan}**",
f"**Total Program Studi di PNP: {total_prodi}**"
])
content = "\n".join(content_lines)
self.upload_to_supabase(rekap_filename, content)
if __name__ == "__main__":
# Add logging configuration
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
)
try:
process = CrawlerProcess()
process.crawl(JurusanSpider)
process.start()
except Exception as e:
logging.error(f"Failed to run spider: {e}")
raise
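# Note: an equivalent way to run this spider, assuming Scrapy is installed in
# the active environment, is the CLI runner:
#   scrapy runspider jurusan_scrap.py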