FauziIsyrinApridal committed on
Commit f1150bb · 1 Parent(s): a300f9d

fix pnp pimpinan

Files changed (1)
  1. scrapping/pnp_scrap.py +286 -416
scrapping/pnp_scrap.py CHANGED
@@ -3,27 +3,23 @@ from scrapy.crawler import CrawlerProcess
 from datetime import datetime
 import re
 import os
-import tempfile
-import logging
-from typing import Optional, List, Dict, Any
 from supabase import create_client, Client
-from dotenv import load_dotenv

-# Load environment variables
-load_dotenv()

-# Environment variables with validation
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
-SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")

 class PNPContentSpider(scrapy.Spider):
     name = 'pnp_content_spider'
-    start_urls = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']

     excluded_subdomains = [
         'akt.pnp.ac.id',
-        'an.pnp.ac.id',
         'bing.pnp.ac.id',
         'elektro.pnp.ac.id',
         'me.pnp.ac.id',
@@ -32,324 +28,274 @@ class PNPContentSpider(scrapy.Spider):
     ]

     custom_settings = {
-        'DOWNLOAD_DELAY': 2,
         'RETRY_TIMES': 3,
         'HTTPCACHE_ENABLED': False,
         'ROBOTSTXT_OBEY': True,
         'CONCURRENT_REQUESTS': 1,
-        'RETRY_ENABLED': True,
-        'USER_AGENT': 'PNPBot/1.0 (+https://www.pnp.ac.id)',
-        'DOWNLOAD_TIMEOUT': 60,
-        'DEPTH_LIMIT': 3,
-        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
     }

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # Validate environment variables
-        if not all([SUPABASE_URL, SUPABASE_KEY]):
-            raise ValueError("Missing required environment variables: SUPABASE_URL and SUPABASE_KEY")
-
-        try:
-            self.supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
-        except Exception as e:
-            self.logger.error(f"Failed to initialize Supabase client: {e}")
-            raise

-        self.bucket = SUPABASE_BUCKET
-        self.processed_urls = set()
-        self.upload_stats = {'success': 0, 'failed': 0}
-
-    def should_follow_link(self, url: str) -> bool:
-        """Check if URL should be followed based on exclusion rules"""
-        if not url or url.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
-            return False

-        # Check for excluded subdomains
-        for subdomain in self.excluded_subdomains:
-            if subdomain in url:
-                return False

-        # Skip certain file types
-        excluded_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
-                               '.jpg', '.jpeg', '.png', '.gif', '.svg', '.zip', '.rar']
-        if any(url.lower().endswith(ext) for ext in excluded_extensions):
-            return False

-        return True

-    def format_paragraph(self, text: str, max_words: int = 150) -> str:
-        """Format text into well-structured paragraphs"""
-        if not text:
-            return ""
-
-        # Clean and normalize text
-        text = re.sub(r'\s+', ' ', text.strip())
-        sentences = re.split(r'(?<=[.!?])\s+', text)
-
         paragraph = ''
         word_count = 0
-
         for sentence in sentences:
             words = sentence.split()
-            if word_count + len(words) > max_words and word_count >= 50:
-                break
             word_count += len(words)
             paragraph += sentence + ' '
-
         return paragraph.strip()

     def parse(self, response):
-        """Parse main navigation and follow links"""
-        if response.status != 200:
-            self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
-            return
-
         self.logger.info(f"Processing main page: {response.url}")
-
-        # Parse navigation items
         nav_items = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
-
         for item in nav_items:
-            # Main menu item
-            main_title = self.extract_menu_title(item)
             main_link = item.css('a.wp-block-navigation-item__content::attr(href)').get()
-
-            if main_link and self.should_follow_link(main_link):
                 main_link = response.urljoin(main_link)
-                if main_link not in self.processed_urls:
-                    self.processed_urls.add(main_link)
-                    yield scrapy.Request(
-                        main_link,
-                        callback=self.parse_content,
-                        meta={'page_title': main_title, 'menu_path': main_title},
-                        errback=self.handle_error
-                    )
-
-            # Submenu items
             submenus = item.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item')
             for submenu in submenus:
-                submenu_title = self.extract_menu_title(submenu)
                 submenu_link = submenu.css('a.wp-block-navigation-item__content::attr(href)').get()
-
-                if submenu_link and self.should_follow_link(submenu_link):
                     submenu_link = response.urljoin(submenu_link)
-                    if submenu_link not in self.processed_urls:
-                        self.processed_urls.add(submenu_link)
-                        menu_path = f"{main_title} > {submenu_title}" if main_title else submenu_title
-                        yield scrapy.Request(
-                            submenu_link,
-                            callback=self.parse_content,
-                            meta={'page_title': submenu_title, 'menu_path': menu_path},
-                            errback=self.handle_error
-                        )
-
-    def extract_menu_title(self, item) -> str:
-        """Extract menu title from navigation item"""
-        title = item.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
-        if not title:
-            title = item.css('a.wp-block-navigation-item__content::text').get('').strip()
-        return title or "Unknown"
-
-    def handle_error(self, failure):
-        """Handle request errors"""
-        self.logger.error(f"Request failed: {failure.request.url} - {failure.value}")
-
-    def parse_content(self, response):
-        """Parse content from pages"""
-        if response.status != 200:
-            return
-
-        page_title = response.meta.get('page_title', 'Unknown Page')
-        menu_path = response.meta.get('menu_path', '')
-
-        # Extract page title if not provided
-        if page_title == 'Unknown Page':
-            title_selectors = ['h1.entry-title::text', 'h1.page-title::text', 'title::text', 'h1::text']
-            for selector in title_selectors:
-                title = response.css(selector).get()
-                if title:
-                    page_title = title.strip()
-                    break
-
-        self.logger.info(f"Extracting content from: {response.url} ({page_title})")
-
-        # Special case handling
-        if self.is_leadership_page(response.url):
-            content_text = self.parse_leadership_page(response, page_title)
-        else:
-            content_text = self.parse_general_content(response, page_title, menu_path)

-        if content_text:
-            result = self.upload_content(content_text, page_title, response.url, menu_path)
-            if result:
-                yield result
-
-        # Follow additional links on same domain
-        yield from self.follow_additional_links(response, menu_path)
-
-    def is_leadership_page(self, url: str) -> bool:
-        """Check if this is the leadership page"""
-        return url.strip("/") == "https://www.pnp.ac.id/pnp-profil/pimpinan-pnp"
-
-    def parse_leadership_page(self, response, page_title: str) -> str:
-        """Parse the leadership page with special handling"""
-        self.logger.info("Detected special page: Pimpinan PNP")

-        paragraphs = []

-        # Find all tables that contain leadership data
-        tables = response.css('table')

-        for table in tables:
-            # Skip nested tables (social media icons, etc.)
-            if table.css('img'):
-                continue
-
-            rows = table.css('tr')
-            leader_info = {}
-            position = ""
-
-            for i, row in enumerate(rows):
-                cells = row.css('td')

-                # First row often contains the position title
-                if len(cells) == 1 or (len(cells) == 3 and cells[0].get('colspan')):
-                    # Extract position from first row
-                    position_text = row.xpath('string(.)').get('').strip()
-                    if any(title in position_text.upper() for title in ['DIREKTUR', 'WAKIL DIREKTUR']):
-                        position = position_text
-                        # Clean up position text
-                        position = re.sub(r'<[^>]+>', '', position)  # Remove any HTML tags
-                        position = re.sub(r'\s+', ' ', position).strip()  # Normalize whitespace
                    continue

-                # Data rows should have exactly 3 cells (key : value)
-                if len(cells) == 3:
-                    key = cells[0].xpath('string(.)').get('').strip()
-                    colon = cells[1].xpath('string(.)').get('').strip()  # Should be ":"
-                    value = cells[2].xpath('string(.)').get('').strip()

-                    # Clean the value from any links but keep the text
-                    value_elem = cells[2]
-                    links = value_elem.css('a')
-                    if links:
-                        # If there's a link, extract the text content
-                        value = links[0].xpath('string(.)').get('').strip()

-                    # Validate and normalize key-value pairs
-                    if key and value and colon == ':' and key.lower() not in ['no', 'keterangan', 'foto']:
-                        # Normalize field names
-                        key_mapping = {
-                            'nama': 'Nama',
-                            'nama lengkap': 'Nama',
-                            'nidn': 'NIDN',
-                            'nip': 'NIP',
-                            'jabatan akademik': 'Jabatan Akademik',
-                            'jurusan': 'Jurusan',
-                            'program studi': 'Program Studi'
-                        }

-                        normalized_key = key_mapping.get(key.lower(), key)
-                        leader_info[normalized_key] = value

-            # If we found leadership data, create narrative
-            if leader_info and position:
-                leader_info['Jabatan'] = position
-                narrative = self.create_leader_narrative(leader_info)
-                if narrative:
-                    paragraphs.append(narrative)
-
-        # Also extract any descriptive paragraphs about leaders
-        content_paragraphs = response.css('div.entry-content p')
-        for para in content_paragraphs:
-            para_text = para.xpath('string(.)').get('').strip()
-            # Look for biographical information (text in italics often contains bio info)
-            if para.css('em') and len(para_text.split()) > 20:
-                # Extract just the italic text which usually contains the biography
-                italic_text = ' '.join(para.css('em *::text').getall()).strip()
-                if italic_text and len(italic_text.split()) > 10:
-                    paragraphs.append(f"Informasi tambahan: {italic_text}")
-
-        return self.format_final_content(page_title, response.url, paragraphs)
-
-    def create_leader_narrative(self, leader_info: Dict[str, str]) -> str:
-        """Create narrative text from leader information - improved version"""
-        # Get information with flexible field matching
-        jabatan = (leader_info.get("Jabatan") or
-                   leader_info.get("jabatan") or
-                   "Pejabat di PNP")
-
-        nama = (leader_info.get("Nama") or
-                leader_info.get("Nama Lengkap") or
-                leader_info.get("nama") or
-                "Tidak diketahui")
-
-        jabatan_akademik = (leader_info.get("Jabatan Akademik") or
-                            leader_info.get("jabatan akademik") or "")
-
-        jurusan = (leader_info.get("Jurusan") or
-                   leader_info.get("jurusan") or "")
-
-        prodi = (leader_info.get("Program Studi") or
-                 leader_info.get("program studi") or "")
-
-        nidn = (leader_info.get("NIDN") or
-                leader_info.get("nidn") or "")
-
-        nip = (leader_info.get("NIP") or
-               leader_info.get("nip") or "")
-
-        # Build narrative with better formatting
-        narrative_parts = []
-
-        # Clean up position title
-        if jabatan:
-            # Remove extra formatting and normalize
-            jabatan = re.sub(r'\s+', ' ', jabatan).strip()
-            jabatan = jabatan.replace('WAKIL DIREKTUR BIDANG', 'Wakil Direktur Bidang')
-            jabatan = jabatan.replace('DIREKTUR', 'Direktur')

-        narrative_parts.append(f"{jabatan} Politeknik Negeri Padang adalah {nama}.")
-
-        if jabatan_akademik and jabatan_akademik.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"Beliau memiliki jabatan akademik {jabatan_akademik}.")
-
-        if jurusan and jurusan.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"Berasal dari Jurusan {jurusan}.")
-
-        if prodi and prodi.lower() not in ['tidak ada', '-']:
-            # Clean up program study name
-            prodi = re.sub(r'^(D-?[34]|Diploma)\s*', '', prodi).strip()
-            narrative_parts.append(f"Program studi {prodi}.")

-        if nidn and nidn.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"NIDN: {nidn}.")
-
-        if nip and nip.lower() not in ['tidak ada', '-']:
-            narrative_parts.append(f"NIP: {nip}.")

-        return " ".join(narrative_parts)

-    def parse_general_content(self, response, page_title: str, menu_path: str) -> str:
-        """Parse general page content"""
         paragraphs = []

-        # Try different content selectors
         content_selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
-            'div.content', 'div.main-content', 'div#content',
-            'div.page-content', 'div.post-content'
         ]
-
         for selector in content_selectors:
             content_area = response.css(selector)
             if content_area:
-                elements = content_area.css('p, h1, h2, h3, h4, h5, h6, li, div')
-                for elem in elements:
-                    text = self.extract_element_text(elem, response)
-                    if text and len(text.split()) >= 5:  # Filter short texts
                         paragraphs.append(text)
                 if paragraphs:
                     break
@@ -357,185 +303,109 @@ class PNPContentSpider(scrapy.Spider):
         # Fallback: extract from body
         if not paragraphs:
             body_texts = response.css('body *::text').getall()
-            paragraphs = [t.strip() for t in body_texts if t.strip() and len(t.strip().split()) >= 5]

         # Format paragraphs
         formatted_paragraphs = []
         for para in paragraphs:
-            para = re.sub(r'\s+', ' ', para.strip())
             if len(para.split()) >= 10:
-                formatted = self.format_paragraph(para)
-                if formatted and formatted not in formatted_paragraphs:
-                    formatted_paragraphs.append(formatted)
-
-        content = self.format_final_content(page_title, response.url, formatted_paragraphs)
-
-        # Add table data
-        table_content = self.extract_table_data(response)
-        if table_content:
-            content += f"\n\n# Tabel Data\n\n{table_content}"

-        return content

-    def extract_element_text(self, elem, response) -> str:
-        """Extract text from element including links"""
-        text = ' '.join(elem.css('*::text').getall()).strip()
-
-        # Add link information
-        links = elem.css('a::attr(href)').getall()
-        for link in links:
-            if link and not link.startswith('#'):
-                full_link = response.urljoin(link)
-                text += f" (Link: {full_link})"
-
-        return text
-
-    def extract_table_data(self, response) -> str:
        """Extract and format table data"""
-        table_output = []
        tables = response.css('table')

-        for i, table in enumerate(tables, 1):
            table_rows = []
            for row in table.css('tr'):
                cells = row.css('th, td')
                row_data = []
-
                for cell in cells:
-                    cell_text = ' '.join(cell.css('*::text').getall()).strip()
-                    link = cell.css('a::attr(href)').get()
-                    if link:
                        cell_text += f" (Link: {response.urljoin(link)})"
                    if cell_text:
                        row_data.append(cell_text)
-
                if row_data:
                    table_rows.append(" | ".join(row_data))

            if table_rows:
-                table_output.append(f"## Tabel {i}\n")
-                table_output.extend(table_rows)
-                table_output.append("")  # Add spacing
-
-        return "\n".join(table_output)

-    def format_final_content(self, page_title: str, url: str, paragraphs: List[str]) -> str:
-        """Format final content text"""
        return f"""# {page_title}

-Tanggal: {datetime.now().strftime('%d %B %Y')}
-URL: {url}

-{chr(10).join(paragraphs) if paragraphs else 'Tidak ada konten yang dapat diekstrak.'}"""

-    def upload_content(self, content_text: str, page_title: str, url: str, menu_path: str) -> Optional[Dict[str, Any]]:
-        """Upload content to Supabase storage"""
-        # Create safe filename
        safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
-        safe_title = re.sub(r'[-\s]+', '-', safe_title)[:50]  # Limit length
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{safe_title}_{timestamp}.txt"

        try:
-            with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False, suffix='.txt') as temp_file:
-                temp_file.write(content_text)
-                temp_path = temp_file.name
-
-            # Upload to Supabase
-            result = self.supabase.storage.from_(self.bucket).upload(
                path=filename,
-                file=temp_path,
                file_options={"content-type": "text/plain; charset=utf-8"}
            )
-
-            self.upload_stats['success'] += 1
-            self.logger.info(f"✅ Uploaded {filename} successfully.")
-
-            return {
-                'url': url,
-                'title': page_title,
-                'menu_path': menu_path,
-                'uploaded_as': filename,
-                'timestamp': datetime.now().isoformat(),
-                'content_length': len(content_text)
-            }
-
        except Exception as e:
-            self.upload_stats['failed'] += 1
-            self.logger.error(f"❌ Upload error for {filename}: {str(e)}")
-            return None
-        finally:
-            # Clean up temporary file
-            if 'temp_path' in locals() and os.path.exists(temp_path):
-                os.remove(temp_path)
-
-    def follow_additional_links(self, response, menu_path: str):
-        """Follow additional links on the same domain"""
        current_domain = response.url.split('//')[1].split('/')[0]
-
-        # Only follow additional links for non-PNP domains
        if 'pnp.ac.id' not in current_domain:
-            header_selectors = ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']
            header_links = []
-
-            for selector in header_selectors:
-                header_links.extend(response.css(selector).getall())
-
-            # Process unique links
-            processed_links = set()
-            for link in header_links:
-                if not link or link in processed_links:
-                    continue
-
-                if self.should_follow_link(link):
-                    full_link = response.urljoin(link)
-                    if current_domain in full_link and full_link not in self.processed_urls:
-                        processed_links.add(link)
-                        self.processed_urls.add(full_link)
-
-                        yield scrapy.Request(
-                            url=full_link,
-                            callback=self.parse_content,
-                            meta={
-                                'page_title': 'Header Link',
-                                'menu_path': f"{menu_path} > Header"
-                            },
-                            errback=self.handle_error
-                        )
-
-    def closed(self, reason):
-        """Called when spider closes"""
-        self.logger.info(f"Spider closed: {reason}")
-        self.logger.info(f"Upload statistics - Success: {self.upload_stats['success']}, Failed: {self.upload_stats['failed']}")
-        self.logger.info(f"Total URLs processed: {len(self.processed_urls)}")


 if __name__ == '__main__':
-    # Configure logging
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
-    )
-
-    # Validate environment before starting
-    if not all([SUPABASE_URL, SUPABASE_KEY]):
-        print("❌ Missing required environment variables!")
-        exit(1)
-
-    try:
-        process = CrawlerProcess({
-            'USER_AGENT': 'PNPBot/1.0 (+https://www.pnp.ac.id)',
-            'DOWNLOAD_DELAY': 2,
-            'ROBOTSTXT_OBEY': True,
-            'LOG_LEVEL': 'INFO',
-            'CONCURRENT_REQUESTS': 1,
-            'DOWNLOAD_TIMEOUT': 60,
-            'RETRY_TIMES': 3,
-            'HTTPCACHE_ENABLED': False,
-            'DEPTH_LIMIT': 3,
-        })
-        process.crawl(PNPContentSpider)
-        process.start()
-    except Exception as e:
-        logging.error(f"Failed to run spider: {e}")
-        raise

 from datetime import datetime
 import re
 import os
 from supabase import create_client, Client
+import html


 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
+SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


 class PNPContentSpider(scrapy.Spider):
     name = 'pnp_content_spider'
+    start_urls = ['https://www.pnp.ac.id','https://penerimaan.pnp.ac.id']

     excluded_subdomains = [
         'akt.pnp.ac.id',
+        'an.pnp.ac.id',
         'bing.pnp.ac.id',
         'elektro.pnp.ac.id',
         'me.pnp.ac.id',
     ]

     custom_settings = {
+        'DOWNLOAD_DELAY': 1,
         'RETRY_TIMES': 3,
         'HTTPCACHE_ENABLED': False,
         'ROBOTSTXT_OBEY': True,
         'CONCURRENT_REQUESTS': 1,
+        'RETRY_ENABLED': True,
+        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
     }

+    def clean_text(self, text: str) -> str:
+        """Clean and normalize text content"""
+        if not text:
+            return ""

+        # Decode HTML entities
+        text = html.unescape(text)

+        # Remove extra whitespace and normalize
+        text = ' '.join(text.split())

+        # Fix common encoding issues
+        text = text.replace('“', '"').replace('â€', '"').replace('’', "'")
+        text = text.replace('â€"', '').replace('â€"', '')

+        return text.strip()

+    def format_paragraph(self, text: str) -> str:
+        text = self.clean_text(text)
+        sentences = re.split(r'(?<=[.!?]) +', text)
         paragraph = ''
         word_count = 0
         for sentence in sentences:
             words = sentence.split()
             word_count += len(words)
             paragraph += sentence + ' '
+            if 50 <= word_count <= 150:
+                break
         return paragraph.strip()

     def parse(self, response):
         self.logger.info(f"Processing main page: {response.url}")
         nav_items = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
         for item in nav_items:
+            main_title = item.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
+            if not main_title:
+                main_title = item.css('a.wp-block-navigation-item__content::text').get('').strip()
             main_link = item.css('a.wp-block-navigation-item__content::attr(href)').get()
+            if main_link and not main_link.startswith('#'):
                 main_link = response.urljoin(main_link)
+                if "jurusan" in main_link.lower():
+                    continue
+                yield scrapy.Request(main_link, callback=self.parse_content, meta={'page_title': main_title, 'menu_path': main_title})
             submenus = item.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item')
             for submenu in submenus:
+                submenu_title = submenu.css('a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text').get()
+                if not submenu_title:
+                    submenu_title = submenu.css('a.wp-block-navigation-item__content::text').get('').strip()
                 submenu_link = submenu.css('a.wp-block-navigation-item__content::attr(href)').get()
+                if submenu_link and not submenu_link.startswith('#'):
                     submenu_link = response.urljoin(submenu_link)
+                    if "jurusan" in submenu_link.lower():
+                        continue
+                    menu_path = f"{main_title} > {submenu_title}" if main_title else submenu_title
+                    yield scrapy.Request(submenu_link, callback=self.parse_content, meta={'page_title': submenu_title, 'menu_path': menu_path})

+    def extract_leadership_info(self, response):
+        """Extract leadership information from the special leadership page"""
+        self.logger.info("Extracting leadership information from special page")

+        leaders_data = []

+        # Try multiple table selectors based on the HTML structure shown
+        tables = response.css('table, .wp-block-table table, .entry-content table, tbody')

+        if tables:
+            # Process each table
+            for table_idx, table in enumerate(tables):
+                self.logger.info(f"Processing table {table_idx + 1}")

+                rows = table.css('tr')
+                if not rows:
                    continue

+                leader_info = {}
+                position_title = ""

+                # Look for position title (like "DIREKTUR")
+                title_elements = table.css('strong, .position-title, th')
+                for title_elem in title_elements:
+                    title_text = self.clean_text(' '.join(title_elem.css('*::text').getall()))
+                    if any(pos in title_text.upper() for pos in ['DIREKTUR', 'WAKIL DIREKTUR', 'KETUA', 'SEKRETARIS']):
+                        position_title = title_text
+                        break

+                # Extract key-value pairs from table rows
+                for row in rows:
+                    cells = row.css('td, th')

+                    if len(cells) >= 3:
+                        # Format: Label | : | Value (3 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        separator = self.clean_text(' '.join(cells[1].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[2].css('*::text').getall()))

+                        if key and value and separator == ":":
+                            leader_info[key] = value

+                    elif len(cells) == 2:
+                        # Format: Label | Value (2 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[1].css('*::text').getall()))

+                        if key and value and key != value:
+                            # Skip if key contains colon (likely "Label:")
+                            clean_key = key.replace(':', '').strip()
+                            leader_info[clean_key] = value

+                # Add position title if found
+                if position_title:
+                    leader_info['Posisi'] = position_title

+                # If we found structured data, add it
+                if leader_info:
+                    leaders_data.append(leader_info)
+                    self.logger.info(f"Extracted leader data: {list(leader_info.keys())}")

+        # Fallback: Extract from general content structure
+        if not leaders_data:
+            self.logger.info("No table data found, trying general content extraction")

+            # Look for profile sections
+            profile_sections = response.css('.wp-block-group, .entry-content > div, .profile-section')

+            for section in profile_sections:
+                section_text = self.clean_text(' '.join(section.css('*::text').getall()))

+                # Check if this section contains leadership info
+                if any(keyword in section_text.lower() for keyword in ['direktur', 'wakil direktur', 'dr.', 's.t.', 'm.kom', 'nidn']):
+                    # Try to extract structured info from the text
+                    leader_info = {'description': section_text}

+                    # Try to extract specific details using regex
+                    name_match = re.search(r'(Dr\.|Ir\.|Prof\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(S\.T\.|M\.Kom|M\.T\.|S\.E\.|M\.M\.)*', section_text)
+                    if name_match:
+                        leader_info['Nama'] = name_match.group(0).strip()

+                    nidn_match = re.search(r'NIDN[:\s]*(\d+)', section_text)
+                    if nidn_match:
+                        leader_info['NIDN'] = nidn_match.group(1)

+                    leaders_data.append(leader_info)

+        return leaders_data

+    def format_leadership_content(self, leaders_data):
+        """Format leadership data into readable content"""
+        formatted_content = []

+        for idx, leader in enumerate(leaders_data, 1):
+            if isinstance(leader, dict):
+                if 'description' in leader and len(leader) == 1:
+                    # Simple description format
+                    content = f"## Pimpinan {idx}\n\n{leader['description']}"
+                else:
+                    # Structured data format
+                    position = leader.get("Posisi", f"Pimpinan {idx}")
+                    content = f"## {position}\n\n"

+                    # Format key information in a logical order
+                    ordered_keys = ['Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi']

+                    # Add ordered information first
+                    for key in ordered_keys:
+                        if key in leader:
+                            content += f"**{key}**: {leader[key]}\n\n"

+                    # Add remaining information
+                    for key, value in leader.items():
+                        if key not in ordered_keys and key not in ['Posisi', 'description']:
+                            content += f"**{key}**: {value}\n\n"

+                    # Add description if exists
+                    if 'description' in leader:
+                        content += f"\n{leader['description']}\n\n"

+                formatted_content.append(content.strip())

+        return formatted_content

+    def parse_content(self, response):
+        page_title = response.meta.get('page_title', 'Unknown Page')
+        menu_path = response.meta.get('menu_path', '')
+        if page_title == 'Unknown Page':
+            page_title = self.clean_text(response.css('h1.entry-title::text, h1.page-title::text').get(''))

+        self.logger.info(f"Extracting content from: {response.url} ({page_title})")

         paragraphs = []

+        # 🔹 Special case: halaman pimpinan PNP
+        if ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url:
+            self.logger.info("Detected leadership page - using special extraction")

+            leaders_data = self.extract_leadership_info(response)
+            self.logger.info(f"Found {len(leaders_data)} leadership entries")

+            if leaders_data:
+                formatted_leaders = self.format_leadership_content(leaders_data)
+                paragraphs = formatted_leaders

+                # Also extract any additional content from the page
+                additional_content = self.extract_general_content(response)
+                if additional_content:
+                    paragraphs.extend(["## Informasi Tambahan"] + additional_content)
+            else:
+                # Fallback to general content extraction
+                self.logger.warning("Leadership extraction failed, falling back to general extraction")
+                paragraphs = self.extract_general_content(response)
+        else:
+            # 🔹 Normal content extraction
+            paragraphs = self.extract_general_content(response)

+        # Create final content
+        content_text = self.create_final_content(page_title, response.url, paragraphs)

+        # Add table data if any (but skip for leadership pages to avoid duplication)
+        if not (("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url):
+            table_content = self.extract_table_data(response)
+            if table_content:
+                content_text += "\n\n## Data Tabel\n\n" + table_content

+        # Upload to Supabase
+        filename = self.upload_content(page_title, content_text)

+        yield {
+            'url': response.url,
+            'title': page_title,
+            'menu_path': menu_path,
+            'uploaded_as': filename,
+            'timestamp': datetime.now().isoformat(),
+            'content_length': len(content_text),
+            'leadership_page': ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower()) and "pnp.ac.id" in response.url
+        }

+        # Continue with additional scraping if needed
+        self.process_additional_links(response, menu_path)

+    def extract_general_content(self, response):
+        """Extract general content from the page"""
+        paragraphs = []

         content_selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
+            'div.content', 'div.main-content', 'div#content', 'div.page-content'
         ]

         for selector in content_selectors:
             content_area = response.css(selector)
             if content_area:
+                elems = content_area.css('p, h1, h2, h3, h4, h5, h6, li, div.wp-block-group')
+                for elem in elems:
+                    text = self.clean_text(' '.join(elem.css('*::text').getall()))
+                    if text and len(text.split()) >= 5:
+                        # Add links if any
+                        links = elem.css('a::attr(href)').getall()
+                        for link in links:
+                            if link and not link.startswith('#'):
+                                text += f" (Link: {response.urljoin(link)})"
                         paragraphs.append(text)
                 if paragraphs:
                     break

         # Fallback: extract from body
         if not paragraphs:
             body_texts = response.css('body *::text').getall()
+            combined_text = self.clean_text(' '.join(body_texts))
+            if combined_text:
+                # Split into meaningful chunks
+                sentences = re.split(r'(?<=[.!?])\s+', combined_text)
+                current_para = ""
+                for sentence in sentences:
+                    if len((current_para + " " + sentence).split()) <= 50:
+                        current_para += " " + sentence
+                    else:
+                        if current_para.strip():
+                            paragraphs.append(current_para.strip())
+                        current_para = sentence
+                if current_para.strip():
+                    paragraphs.append(current_para.strip())

         # Format paragraphs
         formatted_paragraphs = []
         for para in paragraphs:
             if len(para.split()) >= 10:
+                formatted_paragraphs.append(self.format_paragraph(para))

+        return formatted_paragraphs

+    def extract_table_data(self, response):
         """Extract and format table data"""
         tables = response.css('table')
+        table_output = []

+        for table_idx, table in enumerate(tables):
             table_rows = []
             for row in table.css('tr'):
                 cells = row.css('th, td')
                 row_data = []
                 for cell in cells:
+                    cell_text = self.clean_text(' '.join(cell.css('*::text').getall()))
+                    if link := cell.css('a::attr(href)').get():
                         cell_text += f" (Link: {response.urljoin(link)})"
                     if cell_text:
                         row_data.append(cell_text)
                 if row_data:
                     table_rows.append(" | ".join(row_data))

             if table_rows:
+                table_output.append(f"### Tabel {table_idx + 1}\n\n" + "\n".join(table_rows))

+        return "\n\n".join(table_output)

+    def create_final_content(self, page_title, url, paragraphs):
+        """Create the final formatted content"""
         return f"""# {page_title}

+**Tanggal**: {datetime.now().strftime('%d %B %Y')}
+**URL**: {url}

+{chr(10).join(paragraphs)}"""

+    def upload_content(self, page_title, content_text):
+        """Upload content to Supabase"""
         safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
+        safe_title = re.sub(r'[-\s]+', '-', safe_title)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"{safe_title}_{timestamp}.txt"

         try:
+            supabase.storage.from_(SUPABASE_BUCKET).upload(
                 path=filename,
+                file=content_text.encode('utf-8'),
                 file_options={"content-type": "text/plain; charset=utf-8"}
             )
+            self.logger.info(f"Uploaded {filename} successfully.")
+            return filename
         except Exception as e:
+            self.logger.error(f"Upload error for {filename}: {str(e)}")
+            return f"failed_{filename}"

+    def process_additional_links(self, response, menu_path):
+        """Process additional links from the same domain"""
         current_domain = response.url.split('//')[1].split('/')[0]
         if 'pnp.ac.id' not in current_domain:
             header_links = []
+            for sel in ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']:
+                header_links.extend(response.css(sel).getall())
+            for link in set(link for link in header_links if link and not link.startswith(('#', 'javascript:'))):
+                full_link = response.urljoin(link)
+                if current_domain in full_link:
+                    yield scrapy.Request(
+                        url=full_link,
+                        callback=self.parse_content,
+                        meta={'page_title': 'Header Link', 'menu_path': f"{menu_path} > Header"}
+                    )


 if __name__ == '__main__':
+    process = CrawlerProcess({
+        'USER_AGENT': 'PNPBot/1.0',
+        'DOWNLOAD_DELAY': 2,
+        'ROBOTSTXT_OBEY': True,
+        'LOG_LEVEL': 'INFO',
+        'CONCURRENT_REQUESTS': 1,
+        'DOWNLOAD_TIMEOUT': 100,
+        'RETRY_TIMES': 3,
+        'HTTPCACHE_ENABLED': False,
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    })
+    process.crawl(PNPContentSpider)
+    process.start()
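
Besides the dedicated handling for "pimpinan" pages, the other functional change in this commit is that scraped text is now uploaded to Supabase Storage directly as UTF-8 bytes instead of through a temporary file. The snippet below is a minimal standalone sketch of that upload path only; it is not part of the commit, and it assumes the same environment variables the script reads (NEXT_PUBLIC_SUPABASE_URL, SUPABASE_SERVICE_KEY, NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET) are set and that the target bucket already exists.

```python
import os
from datetime import datetime

from supabase import create_client, Client

# Same environment variables the spider reads (assumed to be set).
url = os.environ["NEXT_PUBLIC_SUPABASE_URL"]
key = os.environ["SUPABASE_SERVICE_KEY"]
# "pnp-bot-storage" was the default bucket name in the previous version of the script.
bucket = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")

supabase: Client = create_client(url, key)

# Upload in-memory text as UTF-8 bytes, mirroring upload_content() in the new version.
content_text = "# Direktur\n\n**Nama**: (contoh)\n"
filename = f"contoh_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

supabase.storage.from_(bucket).upload(
    path=filename,
    file=content_text.encode("utf-8"),
    file_options={"content-type": "text/plain; charset=utf-8"},
)
```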