FauziIsyrinApridal committed
Commit · f1150bb
Parent(s): a300f9d

fix pnp pimpinan

Browse files: scrapping/pnp_scrap.py (+286 -416)

scrapping/pnp_scrap.py
CHANGED
@@ -3,27 +3,23 @@ from scrapy.crawler import CrawlerProcess
 from datetime import datetime
 import re
 import os
-import tempfile
-import logging
-from typing import Optional, List, Dict, Any
 from supabase import create_client, Client
-
-# Load environment variables
-load_dotenv()

-# Environment variables with validation
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
-SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")

 class PNPContentSpider(scrapy.Spider):
     name = 'pnp_content_spider'
-    start_urls = ['https://www.pnp.ac.id',

     excluded_subdomains = [
         'akt.pnp.ac.id',
-        'an.pnp.ac.id',
         'bing.pnp.ac.id',
         'elektro.pnp.ac.id',
         'me.pnp.ac.id',
@@ -32,324 +28,274 @@ class PNPContentSpider(scrapy.Spider):
     ]

     custom_settings = {
-        'DOWNLOAD_DELAY':
         'RETRY_TIMES': 3,
         'HTTPCACHE_ENABLED': False,
         'ROBOTSTXT_OBEY': True,
         'CONCURRENT_REQUESTS': 1,
-        'RETRY_ENABLED': True,
-        'USER_AGENT': '
-        'DOWNLOAD_TIMEOUT': 60,
-        'DEPTH_LIMIT': 3,
-        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
     }

-    def
-        if not all([SUPABASE_URL, SUPABASE_KEY]):
-            raise ValueError("Missing required environment variables: SUPABASE_URL and SUPABASE_KEY")
-        try:
-            self.supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
-        except Exception as e:
-            self.logger.error(f"Failed to initialize Supabase client: {e}")
-            raise
-        self.upload_stats = {'success': 0, 'failed': 0}
-
-    def should_follow_link(self, url: str) -> bool:
-        """Check if URL should be followed based on exclusion rules"""
-        if not url or url.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
-            return False
-
-            if subdomain in url:
-                return False
-
-        if any(url.lower().endswith(ext) for ext in excluded_extensions):
-            return False
-
-        return
-
-    def format_paragraph(self, text: str
-            return ""
-
-        # Clean and normalize text
-        text = re.sub(r'\s+', ' ', text.strip())
-        sentences = re.split(r'(?<=[.!?])\s+', text)
-
         paragraph = ''
         word_count = 0
         for sentence in sentences:
             words = sentence.split()
-            if word_count + len(words) > max_words and word_count >= 50:
-                break
             word_count += len(words)
             paragraph += sentence + ' '
         return paragraph.strip()

     def parse(self, response):
-        """Parse main navigation and follow links"""
-        if response.status != 200:
-            self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
-            return
-
         self.logger.info(f"Processing main page: {response.url}")
-
-        # Parse navigation items
         nav_items = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
         for item in nav_items:
             main_link = item.css('a.wp-block-navigation-item__content::attr(href)').get()
-            if main_link and self.should_follow_link(main_link):
                 main_link = response.urljoin(main_link)
-                if
-                    main_link,
-                    callback=self.parse_content,
-                    meta={'page_title': main_title, 'menu_path': main_title},
-                    errback=self.handle_error
-                )
-
-            # Submenu items
             submenus = item.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item')
             for submenu in submenus:
-                submenu_title =
                 submenu_link = submenu.css('a.wp-block-navigation-item__content::attr(href)').get()
-                if submenu_link and self.should_follow_link(submenu_link):
                     submenu_link = response.urljoin(submenu_link)
-                    if
-                        submenu_link,
-                        callback=self.parse_content,
-                        meta={'page_title': submenu_title, 'menu_path': menu_path},
-                        errback=self.handle_error
-                    )
-
-    def extract_menu_title(self, item) -> str:
-        """Extract menu title from navigation item"""
-        title = item.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
-        if not title:
-            title = item.css('a.wp-block-navigation-item__content::text').get('').strip()
-        return title or "Unknown"
-
-    def handle_error(self, failure):
-        """Handle request errors"""
-        self.logger.error(f"Request failed: {failure.request.url} - {failure.value}")
-
-    def parse_content(self, response):
-        """Parse content from pages"""
-        if response.status != 200:
-            return
-
-        page_title = response.meta.get('page_title', 'Unknown Page')
-        menu_path = response.meta.get('menu_path', '')
-
-        # Extract page title if not provided
-        if page_title == 'Unknown Page':
-            title_selectors = ['h1.entry-title::text', 'h1.page-title::text', 'title::text', 'h1::text']
-            for selector in title_selectors:
-                title = response.css(selector).get()
-                if title:
-                    page_title = title.strip()
-                    break
-
-        self.logger.info(f"Extracting content from: {response.url} ({page_title})")
-
-        # Special case handling
-        if self.is_leadership_page(response.url):
-            content_text = self.parse_leadership_page(response, page_title)
-        else:
-            content_text = self.parse_general_content(response, page_title, menu_path)
-
-        yield result
-
-        # Follow additional links on same domain
-        yield from self.follow_additional_links(response, menu_path)
-
-    def is_leadership_page(self, url: str) -> bool:
-        """Check if this is the leadership page"""
-        return url.strip("/") == "https://www.pnp.ac.id/pnp-profil/pimpinan-pnp"
-
-    def parse_leadership_page(self, response, page_title: str) -> str:
-        """Parse the leadership page with special handling"""
-        self.logger.info("Detected special page: Pimpinan PNP")
-
-        tables = response.css('table')
-
-            rows = table.css('tr')
-            leader_info = {}
-            position = ""
-
-            for i, row in enumerate(rows):
-                cells = row.css('td')
-
-                if
-                    # Extract position from first row
-                    position_text = row.xpath('string(.)').get('').strip()
-                    if any(title in position_text.upper() for title in ['DIREKTUR', 'WAKIL DIREKTUR']):
-                        position = position_text
-                        # Clean up position text
-                        position = re.sub(r'<[^>]+>', '', position)  # Remove any HTML tags
-                        position = re.sub(r'\s+', ' ', position).strip()  # Normalize whitespace
                     continue

-                    'nama': 'Nama',
-                    'nama lengkap': 'Nama',
-                    'nidn': 'NIDN',
-                    'nip': 'NIP',
-                    'jabatan akademik': 'Jabatan Akademik',
-                    'jurusan': 'Jurusan',
-                    'program studi': 'Program Studi'
-                }
-
-            leader_info['Jabatan'] = position
-            narrative = self.create_leader_narrative(leader_info)
-            if narrative:
-                paragraphs.append(narrative)
-
-        # Also extract any descriptive paragraphs about leaders
-        content_paragraphs = response.css('div.entry-content p')
-        for para in content_paragraphs:
-            para_text = para.xpath('string(.)').get('').strip()
-            # Look for biographical information (text in italics often contains bio info)
-            if para.css('em') and len(para_text.split()) > 20:
-                # Extract just the italic text which usually contains the biography
-                italic_text = ' '.join(para.css('em *::text').getall()).strip()
-                if italic_text and len(italic_text.split()) > 10:
-                    paragraphs.append(f"Informasi tambahan: {italic_text}")
-
-        return self.format_final_content(page_title, response.url, paragraphs)
-
-    def create_leader_narrative(self, leader_info: Dict[str, str]) -> str:
-        """Create narrative text from leader information - improved version"""
-        # Get information with flexible field matching
-        jabatan = (leader_info.get("Jabatan") or
-                   leader_info.get("jabatan") or
-                   "Pejabat di PNP")
-
-        nama = (leader_info.get("Nama") or
-                leader_info.get("Nama Lengkap") or
-                leader_info.get("nama") or
-                "Tidak diketahui")
-
-        jabatan_akademik = (leader_info.get("Jabatan Akademik") or
-                            leader_info.get("jabatan akademik") or "")
-
-        jurusan = (leader_info.get("Jurusan") or
-                   leader_info.get("jurusan") or "")
-
-        prodi = (leader_info.get("Program Studi") or
-                 leader_info.get("program studi") or "")
-
-        nidn = (leader_info.get("NIDN") or
-                leader_info.get("nidn") or "")
-
-        nip = (leader_info.get("NIP") or
-               leader_info.get("nip") or "")
-
-        # Build narrative with better formatting
-        narrative_parts = []
-
-        # Clean up position title
-        if jabatan:
-            # Remove extra formatting and normalize
-            jabatan = re.sub(r'\s+', ' ', jabatan).strip()
-            jabatan = jabatan.replace('WAKIL DIREKTUR BIDANG', 'Wakil Direktur Bidang')
-            jabatan = jabatan.replace('DIREKTUR', 'Direktur')

-            narrative_parts.append(f"NIDN: {nidn}.")
-
-        if nip and nip.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"NIP: {nip}.")

-    def parse_general_content(self, response, page_title: str, menu_path: str) -> str:
-        """Parse general page content"""
         paragraphs = []

         content_selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
-            'div.content', 'div.main-content', 'div#content',
-            'div.page-content', 'div.post-content'
         ]
         for selector in content_selectors:
             content_area = response.css(selector)
             if content_area:
-                for elem in
-                    text = self.
-                    if text and len(text.split()) >= 5:
                         paragraphs.append(text)
                 if paragraphs:
                     break
@@ -357,185 +303,109 @@ class PNPContentSpider(scrapy.Spider):
         # Fallback: extract from body
         if not paragraphs:
             body_texts = response.css('body *::text').getall()

         # Format paragraphs
         formatted_paragraphs = []
         for para in paragraphs:
-            para = re.sub(r'\s+', ' ', para.strip())
             if len(para.split()) >= 10:
-                if formatted and formatted not in formatted_paragraphs:
-                    formatted_paragraphs.append(formatted)
-
-        content = self.format_final_content(page_title, response.url, formatted_paragraphs)
-
-        # Add table data
-        table_content = self.extract_table_data(response)
-        if table_content:
-            content += f"\n\n# Tabel Data\n\n{table_content}"
-
-        return
-
-    def
-        """Extract text from element including links"""
-        text = ' '.join(elem.css('*::text').getall()).strip()
-
-        # Add link information
-        links = elem.css('a::attr(href)').getall()
-        for link in links:
-            if link and not link.startswith('#'):
-                full_link = response.urljoin(link)
-                text += f" (Link: {full_link})"
-
-        return text
-
-    def extract_table_data(self, response) -> str:
         """Extract and format table data"""
-        table_output = []
         tables = response.css('table')

-        for
             table_rows = []
             for row in table.css('tr'):
                 cells = row.css('th, td')
                 row_data = []
                 for cell in cells:
-                    cell_text = ' '.join(cell.css('*::text').getall())
-                    link
-                    if link:
                         cell_text += f" (Link: {response.urljoin(link)})"
                     if cell_text:
                         row_data.append(cell_text)
                 if row_data:
                     table_rows.append(" | ".join(row_data))

             if table_rows:
-                table_output.append(f"
-
-        return "\n".join(table_output)

-    def
-        """
         return f"""# {page_title}

-Tanggal
-URL

-{chr(10).join(paragraphs)

-    def upload_content(self,
-        """Upload content to Supabase
-        # Create safe filename
         safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
-        safe_title = re.sub(r'[-\s]+', '-', safe_title)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"{safe_title}_{timestamp}.txt"

         try:
-                temp_file.write(content_text)
-                temp_path = temp_file.name
-
-            # Upload to Supabase
-            result = self.supabase.storage.from_(self.bucket).upload(
                 path=filename,
-                file=
                 file_options={"content-type": "text/plain; charset=utf-8"}
             )
-
-            self.logger.info(f"✅ Uploaded {filename} successfully.")
-
-            return {
-                'url': url,
-                'title': page_title,
-                'menu_path': menu_path,
-                'uploaded_as': filename,
-                'timestamp': datetime.now().isoformat(),
-                'content_length': len(content_text)
-            }
-
         except Exception as e:
-
-            if 'temp_path' in locals() and os.path.exists(temp_path):
-                os.remove(temp_path)
-
-    def follow_additional_links(self, response, menu_path: str):
-        """Follow additional links on the same domain"""
         current_domain = response.url.split('//')[1].split('/')[0]
-
-        # Only follow additional links for non-PNP domains
         if 'pnp.ac.id' not in current_domain:
-            header_selectors = ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']
             header_links = []
-
-            if self.should_follow_link(link):
-                full_link = response.urljoin(link)
-                if current_domain in full_link and full_link not in self.processed_urls:
-                    processed_links.add(link)
-                    self.processed_urls.add(full_link)
-
-                    yield scrapy.Request(
-                        url=full_link,
-                        callback=self.parse_content,
-                        meta={
-                            'page_title': 'Header Link',
-                            'menu_path': f"{menu_path} > Header"
-                        },
-                        errback=self.handle_error
-                    )
-
-    def closed(self, reason):
-        """Called when spider closes"""
-        self.logger.info(f"Spider closed: {reason}")
-        self.logger.info(f"Upload statistics - Success: {self.upload_stats['success']}, Failed: {self.upload_stats['failed']}")
-        self.logger.info(f"Total URLs processed: {len(self.processed_urls)}")


 if __name__ == '__main__':
-
-        'USER_AGENT': 'PNPBot/1.0 (+https://www.pnp.ac.id)',
-        'DOWNLOAD_DELAY': 2,
-        'ROBOTSTXT_OBEY': True,
-        'LOG_LEVEL': 'INFO',
-        'CONCURRENT_REQUESTS': 1,
-        'DOWNLOAD_TIMEOUT': 60,
-        'RETRY_TIMES': 3,
-        'HTTPCACHE_ENABLED': False,
-        'DEPTH_LIMIT': 3,
-    })
-    process.crawl(PNPContentSpider)
-    process.start()
-    except Exception as e:
-        logging.error(f"Failed to run spider: {e}")
-        raise

scrapping/pnp_scrap.py after the change (added lines marked with +):

 from datetime import datetime
 import re
 import os
 from supabase import create_client, Client
+import html

 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
+SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+

 class PNPContentSpider(scrapy.Spider):
     name = 'pnp_content_spider'
+    start_urls = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']

     excluded_subdomains = [
         'akt.pnp.ac.id',
+        'an.pnp.ac.id',
         'bing.pnp.ac.id',
         'elektro.pnp.ac.id',
         'me.pnp.ac.id',
     ]

     custom_settings = {
+        'DOWNLOAD_DELAY': 1,
         'RETRY_TIMES': 3,
         'HTTPCACHE_ENABLED': False,
         'ROBOTSTXT_OBEY': True,
         'CONCURRENT_REQUESTS': 1,
+        'RETRY_ENABLED': True,
+        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
     }

+    def clean_text(self, text: str) -> str:
+        """Clean and normalize text content"""
+        if not text:
+            return ""
+
+        # Decode HTML entities
+        text = html.unescape(text)
+
+        # Remove extra whitespace and normalize
+        text = ' '.join(text.split())
+
+        # Fix common encoding issues
+        text = text.replace('“', '"').replace('â€', '"').replace('’', "'")
+        text = text.replace('â€"', '—').replace('â€"', '–')
+
+        return text.strip()

+    def format_paragraph(self, text: str) -> str:
+        text = self.clean_text(text)
+        sentences = re.split(r'(?<=[.!?]) +', text)
         paragraph = ''
         word_count = 0
         for sentence in sentences:
             words = sentence.split()
             word_count += len(words)
             paragraph += sentence + ' '
+            if 50 <= word_count <= 150:
+                break
         return paragraph.strip()

     def parse(self, response):
         self.logger.info(f"Processing main page: {response.url}")
         nav_items = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
         for item in nav_items:
+            main_title = item.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
+            if not main_title:
+                main_title = item.css('a.wp-block-navigation-item__content::text').get('').strip()
             main_link = item.css('a.wp-block-navigation-item__content::attr(href)').get()
+            if main_link and not main_link.startswith('#'):
                 main_link = response.urljoin(main_link)
+                if "jurusan" in main_link.lower():
+                    continue
+                yield scrapy.Request(main_link, callback=self.parse_content, meta={'page_title': main_title, 'menu_path': main_title})
             submenus = item.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item')
             for submenu in submenus:
+                submenu_title = submenu.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
+                if not submenu_title:
+                    submenu_title = submenu.css('a.wp-block-navigation-item__content::text').get('').strip()
                 submenu_link = submenu.css('a.wp-block-navigation-item__content::attr(href)').get()
+                if submenu_link and not submenu_link.startswith('#'):
                     submenu_link = response.urljoin(submenu_link)
+                    if "jurusan" in submenu_link.lower():
+                        continue
+                    menu_path = f"{main_title} > {submenu_title}" if main_title else submenu_title
+                    yield scrapy.Request(submenu_link, callback=self.parse_content, meta={'page_title': submenu_title, 'menu_path': menu_path})

+    def extract_leadership_info(self, response):
+        """Extract leadership information from the special leadership page"""
+        self.logger.info("Extracting leadership information from special page")
+
+        leaders_data = []
+
+        # Try multiple table selectors based on the HTML structure shown
+        tables = response.css('table, .wp-block-table table, .entry-content table, tbody')
+
+        if tables:
+            # Process each table
+            for table_idx, table in enumerate(tables):
+                self.logger.info(f"Processing table {table_idx + 1}")
+
+                rows = table.css('tr')
+                if not rows:
                     continue
+
+                leader_info = {}
+                position_title = ""
+
+                # Look for position title (like "DIREKTUR")
+                title_elements = table.css('strong, .position-title, th')
+                for title_elem in title_elements:
+                    title_text = self.clean_text(' '.join(title_elem.css('*::text').getall()))
+                    if any(pos in title_text.upper() for pos in ['DIREKTUR', 'WAKIL DIREKTUR', 'KETUA', 'SEKRETARIS']):
+                        position_title = title_text
+                        break
+
+                # Extract key-value pairs from table rows
+                for row in rows:
+                    cells = row.css('td, th')
+
+                    if len(cells) >= 3:
+                        # Format: Label | : | Value (3 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        separator = self.clean_text(' '.join(cells[1].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[2].css('*::text').getall()))
+
+                        if key and value and separator == ":":
+                            leader_info[key] = value
+
+                    elif len(cells) == 2:
+                        # Format: Label | Value (2 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[1].css('*::text').getall()))
+
+                        if key and value and key != value:
+                            # Skip if key contains colon (likely "Label:")
+                            clean_key = key.replace(':', '').strip()
+                            leader_info[clean_key] = value
+
+                # Add position title if found
+                if position_title:
+                    leader_info['Posisi'] = position_title
+
+                # If we found structured data, add it
+                if leader_info:
+                    leaders_data.append(leader_info)
+                    self.logger.info(f"Extracted leader data: {list(leader_info.keys())}")
+
+        # Fallback: Extract from general content structure
+        if not leaders_data:
+            self.logger.info("No table data found, trying general content extraction")
+
+            # Look for profile sections
+            profile_sections = response.css('.wp-block-group, .entry-content > div, .profile-section')
+
+            for section in profile_sections:
+                section_text = self.clean_text(' '.join(section.css('*::text').getall()))
+
+                # Check if this section contains leadership info
+                if any(keyword in section_text.lower() for keyword in ['direktur', 'wakil direktur', 'dr.', 's.t.', 'm.kom', 'nidn']):
+                    # Try to extract structured info from the text
+                    leader_info = {'description': section_text}
+
+                    # Try to extract specific details using regex
+                    name_match = re.search(r'(Dr\.|Ir\.|Prof\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(S\.T\.|M\.Kom|M\.T\.|S\.E\.|M\.M\.)*', section_text)
+                    if name_match:
+                        leader_info['Nama'] = name_match.group(0).strip()
+
+                    nidn_match = re.search(r'NIDN[:\s]*(\d+)', section_text)
+                    if nidn_match:
+                        leader_info['NIDN'] = nidn_match.group(1)
+
+                    leaders_data.append(leader_info)
+
+        return leaders_data
+
+    def format_leadership_content(self, leaders_data):
+        """Format leadership data into readable content"""
+        formatted_content = []
+
+        for idx, leader in enumerate(leaders_data, 1):
+            if isinstance(leader, dict):
+                if 'description' in leader and len(leader) == 1:
+                    # Simple description format
+                    content = f"## Pimpinan {idx}\n\n{leader['description']}"
+                else:
+                    # Structured data format
+                    position = leader.get("Posisi", f"Pimpinan {idx}")
+                    content = f"## {position}\n\n"
+
+                    # Format key information in a logical order
+                    ordered_keys = ['Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi']
+
+                    # Add ordered information first
+                    for key in ordered_keys:
+                        if key in leader:
+                            content += f"**{key}**: {leader[key]}\n\n"
+
+                    # Add remaining information
+                    for key, value in leader.items():
+                        if key not in ordered_keys and key not in ['Posisi', 'description']:
+                            content += f"**{key}**: {value}\n\n"
+
+                    # Add description if exists
+                    if 'description' in leader:
+                        content += f"\n{leader['description']}\n\n"
+
+                formatted_content.append(content.strip())

+        return formatted_content

+    def parse_content(self, response):
+        page_title = response.meta.get('page_title', 'Unknown Page')
+        menu_path = response.meta.get('menu_path', '')
+        if page_title == 'Unknown Page':
+            page_title = self.clean_text(response.css('h1.entry-title::text, h1.page-title::text').get(''))
+
+        self.logger.info(f"Extracting content from: {response.url} ({page_title})")

         paragraphs = []

+        # 🔹 Special case: PNP leadership page (pimpinan)
+        if ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url:
+            self.logger.info("Detected leadership page - using special extraction")
+
+            leaders_data = self.extract_leadership_info(response)
+            self.logger.info(f"Found {len(leaders_data)} leadership entries")
+
+            if leaders_data:
+                formatted_leaders = self.format_leadership_content(leaders_data)
+                paragraphs = formatted_leaders
+
+                # Also extract any additional content from the page
+                additional_content = self.extract_general_content(response)
+                if additional_content:
+                    paragraphs.extend(["## Informasi Tambahan"] + additional_content)
+            else:
+                # Fallback to general content extraction
+                self.logger.warning("Leadership extraction failed, falling back to general extraction")
+                paragraphs = self.extract_general_content(response)
+        else:
+            # 🔹 Normal content extraction
+            paragraphs = self.extract_general_content(response)
+
+        # Create final content
+        content_text = self.create_final_content(page_title, response.url, paragraphs)
+
+        # Add table data if any (but skip for leadership pages to avoid duplication)
+        if not (("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url):
+            table_content = self.extract_table_data(response)
+            if table_content:
+                content_text += "\n\n## Data Tabel\n\n" + table_content
+
+        # Upload to Supabase
+        filename = self.upload_content(page_title, content_text)
+
+        yield {
+            'url': response.url,
+            'title': page_title,
+            'menu_path': menu_path,
+            'uploaded_as': filename,
+            'timestamp': datetime.now().isoformat(),
+            'content_length': len(content_text),
+            'leadership_page': ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url
+        }
+
+        # Continue with additional scraping if needed (yield from, so the
+        # requests produced by the generator are actually scheduled)
+        yield from self.process_additional_links(response, menu_path)

+    def extract_general_content(self, response):
+        """Extract general content from the page"""
+        paragraphs = []
+
         content_selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
+            'div.content', 'div.main-content', 'div#content', 'div.page-content'
         ]
+
         for selector in content_selectors:
             content_area = response.css(selector)
             if content_area:
+                elems = content_area.css('p, h1, h2, h3, h4, h5, h6, li, div.wp-block-group')
+                for elem in elems:
+                    text = self.clean_text(' '.join(elem.css('*::text').getall()))
+                    if text and len(text.split()) >= 5:
+                        # Add links if any
+                        links = elem.css('a::attr(href)').getall()
+                        for link in links:
+                            if link and not link.startswith('#'):
+                                text += f" (Link: {response.urljoin(link)})"
                         paragraphs.append(text)
                 if paragraphs:
                     break

         # Fallback: extract from body
         if not paragraphs:
             body_texts = response.css('body *::text').getall()
+            combined_text = self.clean_text(' '.join(body_texts))
+            if combined_text:
+                # Split into meaningful chunks
+                sentences = re.split(r'(?<=[.!?])\s+', combined_text)
+                current_para = ""
+                for sentence in sentences:
+                    if len((current_para + " " + sentence).split()) <= 50:
+                        current_para += " " + sentence
+                    else:
+                        if current_para.strip():
+                            paragraphs.append(current_para.strip())
+                        current_para = sentence
+                if current_para.strip():
+                    paragraphs.append(current_para.strip())

         # Format paragraphs
         formatted_paragraphs = []
         for para in paragraphs:
             if len(para.split()) >= 10:
+                formatted_paragraphs.append(self.format_paragraph(para))

+        return formatted_paragraphs

+    def extract_table_data(self, response):
         """Extract and format table data"""
         tables = response.css('table')
+        table_output = []

+        for table_idx, table in enumerate(tables):
             table_rows = []
             for row in table.css('tr'):
                 cells = row.css('th, td')
                 row_data = []
                 for cell in cells:
+                    cell_text = self.clean_text(' '.join(cell.css('*::text').getall()))
+                    if link := cell.css('a::attr(href)').get():
                         cell_text += f" (Link: {response.urljoin(link)})"
                     if cell_text:
                         row_data.append(cell_text)
                 if row_data:
                     table_rows.append(" | ".join(row_data))

             if table_rows:
+                table_output.append(f"### Tabel {table_idx + 1}\n\n" + "\n".join(table_rows))
+
+        return "\n\n".join(table_output)

+    def create_final_content(self, page_title, url, paragraphs):
+        """Create the final formatted content"""
         return f"""# {page_title}

+**Tanggal**: {datetime.now().strftime('%d %B %Y')}
+**URL**: {url}

+{chr(10).join(paragraphs)}"""

+    def upload_content(self, page_title, content_text):
+        """Upload content to Supabase"""
         safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
+        safe_title = re.sub(r'[-\s]+', '-', safe_title)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"{safe_title}_{timestamp}.txt"

         try:
+            supabase.storage.from_(SUPABASE_BUCKET).upload(
                 path=filename,
+                file=content_text.encode('utf-8'),
                 file_options={"content-type": "text/plain; charset=utf-8"}
             )
+            self.logger.info(f"Uploaded {filename} successfully.")
+            return filename
         except Exception as e:
+            self.logger.error(f"Upload error for {filename}: {str(e)}")
+            return f"failed_{filename}"

+    def process_additional_links(self, response, menu_path):
+        """Process additional links from the same domain"""
         current_domain = response.url.split('//')[1].split('/')[0]
         if 'pnp.ac.id' not in current_domain:
             header_links = []
+            for sel in ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']:
+                header_links.extend(response.css(sel).getall())
+            for link in set(link for link in header_links if link and not link.startswith(('#', 'javascript:'))):
+                full_link = response.urljoin(link)
+                if current_domain in full_link:
+                    yield scrapy.Request(
+                        url=full_link,
+                        callback=self.parse_content,
+                        meta={'page_title': 'Header Link', 'menu_path': f"{menu_path} > Header"}
+                    )

 if __name__ == '__main__':
+    process = CrawlerProcess({
+        'USER_AGENT': 'PNPBot/1.0',
+        'DOWNLOAD_DELAY': 2,
+        'ROBOTSTXT_OBEY': True,
+        'LOG_LEVEL': 'INFO',
+        'CONCURRENT_REQUESTS': 1,
+        'DOWNLOAD_TIMEOUT': 100,
+        'RETRY_TIMES': 3,
+        'HTTPCACHE_ENABLED': False,
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    })
+    process.crawl(PNPContentSpider)
+    process.start()
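
For local testing, a minimal setup sketch for the updated script. The variable names are the ones the file reads at module level; the values are placeholders, and the run command is an assumption about local usage rather than something stated in the commit:

# Hypothetical local setup (placeholder values, not real credentials).
# The module calls create_client() at import time, so these variables
# must be set before pnp_scrap.py is imported or executed.
import os

os.environ.setdefault("NEXT_PUBLIC_SUPABASE_URL", "https://example.supabase.co")
os.environ.setdefault("SUPABASE_SERVICE_KEY", "service-role-key-placeholder")
os.environ.setdefault("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "bucket-name-placeholder")

# With the variables in place, `python scrapping/pnp_scrap.py` starts the
# CrawlerProcess defined in the __main__ block above.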