FauziIsyrinApridal committed on
Commit c41b08a
1 Parent(s): 30065a4

Major update to the jurusan scraper

Files changed (2)
  1. requirements.txt +1 -0
  2. scrapping/jurusan_scrap.py +118 -824
requirements.txt CHANGED
@@ -2,3 +2,4 @@ scrapy
  supabase
  python-dotenv
  requests
+ beautifulsoup4
scrapping/jurusan_scrap.py CHANGED
@@ -1,831 +1,125 @@
1
  import scrapy
2
  from scrapy.crawler import CrawlerProcess
3
- import re
4
- import os
5
- import unicodedata
6
- from urllib.parse import urlparse, urljoin
7
- from datetime import datetime
8
- from collections import defaultdict
9
- from supabase import create_client
10
  from dotenv import load_dotenv
11
-
12
- # Load environment variables
13
- load_dotenv()
14
-
15
- class PNPDepartmentSpider(scrapy.Spider):
16
- name = 'improved_pnp_department_spider'
17
-
18
- DEPARTMENTS = {
19
- 'akt.pnp.ac.id': 'Akuntansi',
20
- 'an.pnp.ac.id': 'Administrasi_Niaga',
21
- 'bing.pnp.ac.id': 'Bahasa_Inggris',
22
- 'elektro.pnp.ac.id': 'Teknik_Elektro',
23
- 'me.pnp.ac.id': 'Teknik_Mesin',
24
- 'sipil.pnp.ac.id': 'Teknik_Sipil',
25
- 'ti.pnp.ac.id': 'Teknologi_Informasi'
26
- }
27
-
28
- start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
29
- visited_urls = set()
30
-
31
- custom_settings = {
32
- 'DOWNLOAD_DELAY': 2.0,
33
- 'ROBOTSTXT_OBEY': True,
34
- 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
35
- 'LOG_LEVEL': 'INFO',
36
- 'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
37
- "DOWNLOAD_TIMEOUT": 100,
38
- 'RETRY_TIMES': 3,
39
- 'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
40
- 'HTTPCACHE_ENABLED': True
41
- }
42
-
43
- def __init__(self, *args, **kwargs):
44
- super().__init__(*args, **kwargs)
45
- self.supabase = create_client(
46
- os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
47
- os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
48
- )
49
- self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
50
- self.department_data = defaultdict(lambda: defaultdict(list))
51
- self.study_programs = defaultdict(list)
52
- self.department_info = defaultdict(dict)
53
-
54
- def start_requests(self):
55
- for url in self.start_urls:
56
- yield scrapy.Request(
57
- url=url,
58
- callback=self.parse_department_homepage,
59
- errback=self.handle_error,
60
- headers={'Accept': 'text/html,application/xhtml+xml'}
61
- )
62
-
63
- def parse_department_homepage(self, response):
64
- domain = urlparse(response.url).netloc
65
- department = self.DEPARTMENTS.get(domain, domain)
66
- self.visited_urls.add(response.url)
67
-
68
- self.logger.info(f"Processing department homepage: {domain} - {department}")
69
-
70
- # Extract homepage content first
71
- homepage_content = self.extract_content(response)
72
- page_title = response.css('h1::text, .site-title::text, title::text').get()
73
- if page_title:
74
- page_title = page_title.strip()
75
- else:
76
- page_title = "Homepage"
77
-
78
- if homepage_content:
79
- self.save_page_content(
80
- response.url,
81
- page_title,
82
- department,
83
- domain,
84
- 'Beranda',
85
- homepage_content
86
- )
87
-
88
- # Process navigation menu
89
- nav_elements = self.extract_navigation(response)
90
- for nav_item in nav_elements:
91
- if not nav_item['link'] or nav_item['link'].startswith('#'):
92
- continue
93
-
94
- full_url = response.urljoin(nav_item['link'])
95
- category = self.determine_category(nav_item['text'])
96
-
97
- if full_url not in self.visited_urls:
98
- yield scrapy.Request(
99
- url=full_url,
100
- callback=self.parse_content_page,
101
- meta={
102
- 'page_title': nav_item['text'],
103
- 'category': category,
104
- 'department': department,
105
- 'domain': domain,
106
- 'menu_path': nav_item['text']
107
- },
108
- errback=self.handle_error
109
- )
110
-
111
- # Find and process all study program links
112
- study_program_links = self.extract_study_program_links(response)
113
- for prog in study_program_links:
114
- full_url = response.urljoin(prog['link'])
115
- if full_url not in self.visited_urls:
116
- yield scrapy.Request(
117
- url=full_url,
118
- callback=self.parse_study_program,
119
- meta={
120
- 'page_title': prog['title'],
121
- 'department': department,
122
- 'domain': domain
123
- },
124
- errback=self.handle_error
125
- )
126
-
127
- # Find and process vision & mission specifically
128
- vision_mission_links = self.extract_vision_mission_links(response)
129
- for vm_link in vision_mission_links:
130
- full_url = response.urljoin(vm_link['link'])
131
- if full_url not in self.visited_urls:
132
- yield scrapy.Request(
133
- url=full_url,
134
- callback=self.parse_vision_mission,
135
- meta={
136
- 'page_title': vm_link['title'],
137
- 'department': department,
138
- 'domain': domain
139
- },
140
- errback=self.handle_error
141
- )
142
-
143
- def extract_navigation(self, response):
144
- """Extract navigation elements from page"""
145
- nav_items = []
146
-
147
- # Try multiple selectors that commonly contain navigation
148
- nav_selectors = [
149
- 'nav a', '.navbar a', '.navigation a', '.main-menu a', '.nav a',
150
- '#menu a', '.menu a', 'header a', '.navbar-collapse a',
151
- 'ul.nav a', '.dropdown-menu a', '.megamenu a',
152
- '#main-menu a', '.main-navigation a', '#primary-menu a',
153
- '.top-menu a', '.primary-menu a', '#nav a'
154
- ]
155
-
156
- for selector in nav_selectors:
157
- for item in response.css(selector):
158
- text = self.clean_text(' '.join(item.css('::text').getall()))
159
- link = item.css('::attr(href)').get()
160
-
161
- if text and link and len(text.strip()) > 1:
162
- if not self.is_social_media_link(link) and not self.is_unwanted_url(link):
163
- nav_items.append({
164
- 'text': text.strip(),
165
- 'link': response.urljoin(link)
166
- })
167
-
168
- return nav_items
169
-
170
- def extract_study_program_links(self, response):
171
- """Extract study program links from complex WordPress navigation menus"""
172
- program_links = []
173
-
174
- # XPath untuk menemukan menu 'Program Studi' dan semua submenu-nya
175
- base_xpath = """
176
- //li[contains(@class, 'wp-block-navigation-submenu')]
177
- [.//span[contains(translate(., 'PROGRAMSTUDI', 'programstudi'), 'program studi')]]
178
- //ul[@class='wp-block-navigation__submenu-container']
179
- /li[contains(@class, 'wp-block-navigation-submenu')]
180
- /a[contains(@class, 'wp-block-navigation-item__content')]
181
- """
182
-
183
- # Ambil semua link program studi utama
184
- for link in response.xpath(base_xpath):
185
- text = self.clean_text(''.join(link.xpath('.//span[@class="wp-block-navigation-item__label"]//text()').getall()))
186
- url = link.xpath('@href').get()
187
-
188
- if text and url:
189
- program_links.append({
190
- 'title': text.strip(),
191
- 'link': response.urljoin(url)
192
- })
193
-
194
- # Logika fallback untuk website yang menggunakan struktur berbeda
195
- if not program_links:
196
- program_links = self.fallback_extract_study_program_links(response)
197
-
198
- return program_links
199
-
200
- def fallback_extract_study_program_links(self, response):
201
- """Fallback method for extracting study program links"""
202
- program_links = []
203
- keywords = ['program studi', 'prodi', 'd3', 'd4', 's1', 'diploma', 'sarjana']
204
-
205
- for keyword in keywords:
206
- # Case-insensitive search for links containing the keyword
207
- xpath_expr = f"//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{keyword}')]"
208
- for link in response.xpath(xpath_expr):
209
- text = self.clean_text(''.join(link.xpath('.//text()').getall()))
210
- url = link.xpath('@href').get()
211
- if text and url:
212
- program_links.append({
213
- 'title': text.strip(),
214
- 'link': response.urljoin(url)
215
- })
216
-
217
- return program_links
218
-
219
- def extract_vision_mission_links(self, response):
220
- """Extract links specifically for vision & mission"""
221
- vm_links = []
222
-
223
- # Terms related to vision & mission
224
- vm_terms = ['visi', 'misi', 'vision', 'mission', 'visi-misi', 'visi & misi', 'visi dan misi']
225
-
226
- # Look for links containing these terms using XPath
227
- for term in vm_terms:
228
- # Case-insensitive search
229
- xpath_expr = f"//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{term}')]"
230
- for link in response.xpath(xpath_expr):
231
- text = self.clean_text(''.join(link.xpath('.//text()').getall()))
232
- url = link.xpath('@href').get()
233
-
234
- if text and url:
235
- vm_links.append({
236
- 'title': text.strip(),
237
- 'link': response.urljoin(url)
238
- })
239
-
240
- return vm_links
241
-
242
- def parse_content_page(self, response):
243
- """Process regular content pages"""
244
- meta = response.meta
245
- self.visited_urls.add(response.url)
246
-
247
- # Extract content from this page
248
- content = self.extract_content(response)
249
-
250
- if content:
251
- self.save_page_content(
252
- response.url,
253
- meta['page_title'],
254
- meta['department'],
255
- meta['domain'],
256
- meta['category'],
257
- content,
258
- meta.get('menu_path', '')
259
- )
260
-
261
- # Look for additional sub-links within this page
262
- content_links = response.css('article a, .content a, .entry-content a, .post-content a, main a')
263
- for link in content_links:
264
- link_text = self.clean_text(' '.join(link.css('::text').getall()))
265
- link_url = link.css('::attr(href)').get()
266
-
267
- if link_text and link_url and len(link_text) > 3:
268
- # Only follow internal links
269
- parsed_url = urlparse(response.urljoin(link_url))
270
- if parsed_url.netloc == meta['domain']:
271
- full_url = response.urljoin(link_url)
272
- if full_url not in self.visited_urls and not self.is_unwanted_url(full_url):
273
- yield scrapy.Request(
274
- url=full_url,
275
- callback=self.parse_content_page,
276
- meta={
277
- 'page_title': link_text,
278
- 'category': meta['category'], # Keep parent category
279
- 'department': meta['department'],
280
- 'domain': meta['domain'],
281
- 'menu_path': f"{meta.get('menu_path', '')} > {link_text}"
282
- },
283
- errback=self.handle_error
284
- )
285
-
286
- def parse_study_program(self, response):
287
- """Process study program pages specifically"""
288
- meta = response.meta
289
- self.visited_urls.add(response.url)
290
-
291
- department = meta['department']
292
- program_title = meta['page_title']
293
-
294
- # Extract program details
295
- program_details = self.extract_program_details(response)
296
-
297
-
298
- # Add to the study programs collection
299
- self.study_programs[department].append({
300
- 'title': program_title,
301
- 'url': response.url,
302
- 'details': program_details
303
- })
304
-
305
- # Also save as a regular page
306
- content = self.extract_content(response)
307
- if content:
308
- self.save_page_content(
309
- response.url,
310
- program_title,
311
- department,
312
- meta['domain'],
313
- 'Program_Studi',
314
- content
315
- )
316
-
317
- def extract_program_details(self, response):
318
- """Enhanced program details extraction with better degree detection"""
319
- details = {} # Initialize details as empty dict
320
-
321
- # Improved degree detection from multiple sources
322
- degree_sources = [
323
- response.css('title::text').get(),
324
- response.css('h1::text').get(),
325
- ' '.join(response.css('.breadcrumb ::text').getall())
326
- ]
327
-
328
- degree_pattern = re.compile(
329
- r'\b(D[1-4]|S[1-3]|Diploma|Sarjana|Magister|Profesi|Spesialis|Terapan)\b',
330
- re.IGNORECASE
331
- )
332
-
333
- for text in degree_sources:
334
- if text and (match := degree_pattern.search(text)):
335
- details['degree'] = match.group(1).upper()
336
- break
337
-
338
- # Extract accreditation status
339
- accreditation = response.xpath(
340
- '//span[contains(translate(., "ABCDE", "abcde"), "akreditasi")]'
341
- '/following-sibling::span/text()'
342
- ).get()
343
-
344
- if accreditation:
345
- details['accreditation'] = self.clean_text(accreditation)
346
-
347
- # Extract description from the first paragraph
348
- first_paragraph = response.css('p::text').get()
349
- if first_paragraph:
350
- details['description'] = self.clean_text(first_paragraph)
351
-
352
- return details
353
-
354
- def parse_vision_mission(self, response):
355
- """Special handler for vision & mission pages"""
356
- meta = response.meta
357
- self.visited_urls.add(response.url)
358
- department = meta['department']
359
-
360
- vision_text = ""
361
- mission_text = ""
362
-
363
- # Look for vision section
364
- vision_selectors = [
365
- 'h2:contains("Visi") + p', 'h3:contains("Visi") + p',
366
- 'h4:contains("Visi") + p', '.visi p', '#visi p',
367
- 'h2:contains("Vision") + p', 'h3:contains("Vision") + p',
368
- 'strong:contains("Visi") + p', 'b:contains("Visi") + p'
369
- ]
370
-
371
- for selector in vision_selectors:
372
- try:
373
- vision = response.css(selector).get()
374
- if vision:
375
- vision_text = self.clean_text(scrapy.Selector(text=vision).css('::text').get(''))
376
- if vision_text:
377
- break
378
- except:
379
- continue
380
-
381
- # If still not found, try looking for paragraphs after headings
382
- if not vision_text:
383
- for heading in response.css('h1, h2, h3, h4, h5, h6'):
384
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
385
- if heading_text and ('visi' in heading_text.lower() or 'vision' in heading_text.lower()):
386
- # Try to get the next paragraph
387
- next_p = heading.xpath('following-sibling::p[1]')
388
- if next_p:
389
- vision_text = self.clean_text(' '.join(next_p.css('::text').getall()))
390
- break
391
-
392
- # Look for mission section using similar approach
393
- mission_selectors = [
394
- 'h2:contains("Misi") + p', 'h3:contains("Misi") + p',
395
- 'h4:contains("Misi") + p', '.misi p', '#misi p',
396
- 'h2:contains("Mission") + p', 'h3:contains("Mission") + p',
397
- 'strong:contains("Misi") + p', 'b:contains("Misi") + p'
398
- ]
399
-
400
- for selector in mission_selectors:
401
- try:
402
- mission = response.css(selector).get()
403
- if mission:
404
- mission_text = self.clean_text(scrapy.Selector(text=mission).css('::text').get(''))
405
- if mission_text:
406
- break
407
- except:
408
- continue
409
-
410
- # If still not found, try looking for paragraphs after headings
411
- if not mission_text:
412
- for heading in response.css('h1, h2, h3, h4, h5, h6'):
413
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
414
- if heading_text and ('misi' in heading_text.lower() or 'mission' in heading_text.lower()):
415
- # Try to get the next paragraph
416
- next_p = heading.xpath('following-sibling::p[1]')
417
- if next_p:
418
- mission_text = self.clean_text(' '.join(next_p.css('::text').getall()))
419
- break
420
-
421
- # Try to find mission list items
422
- mission_list_items = []
423
- for list_selector in ['h2:contains("Misi") ~ ul li', 'h3:contains("Misi") ~ ul li',
424
- 'h4:contains("Misi") ~ ul li', '.misi ul li', '#misi ul li',
425
- 'h2:contains("Mission") ~ ul li', 'h3:contains("Mission") ~ ul li']:
426
- try:
427
- items = response.css(f'{list_selector}::text').getall()
428
- if items:
429
- mission_list_items = [self.clean_text(item) for item in items if self.clean_text(item)]
430
- if mission_list_items:
431
- break
432
- except:
433
- continue
434
-
435
- # Store vision and mission in department info
436
- if vision_text or mission_text or mission_list_items:
437
- if vision_text:
438
- self.department_info[department]['vision'] = vision_text
439
- if mission_text:
440
- self.department_info[department]['mission'] = mission_text
441
- if mission_list_items:
442
- self.department_info[department]['mission_items'] = mission_list_items
443
-
444
- # Save as separate file for vision-mission
445
- self.save_vision_mission(
446
- department,
447
- meta['domain'],
448
- vision_text,
449
- mission_text,
450
- mission_list_items,
451
- response.url
452
- )
453
-
454
- # Also save as a regular page
455
- content = self.extract_content(response)
456
- if content:
457
- self.save_page_content(
458
- response.url,
459
- meta['page_title'],
460
- department,
461
- meta['domain'],
462
- 'Profil',
463
- content
464
- )
465
-
466
- def extract_content(self, response):
467
- """Extract content from a page in a structured format"""
468
- content = {"paragraphs": [], "tables": [], "files": []}
469
-
470
- # First try to find the main content areas
471
- content_selectors = [
472
- 'div.entry-content', 'article.post', 'main.site-main',
473
- 'div.content', 'div.main-content', 'div#content', 'div.page-content',
474
- 'article', '.post-content', '.entry-content', '.content',
475
- '.page-content', 'main', '#content', '.main-content',
476
- '.article-content', '.single-content'
477
- ]
478
-
479
- main_content = None
480
- for selector in content_selectors:
481
- elements = response.css(selector)
482
- if elements:
483
- main_content = elements
484
- break
485
-
486
- # If no primary content found, use body
487
- if not main_content:
488
- main_content = response.css('body')
489
-
490
- # Extract headings and paragraphs
491
- for heading in main_content.css('h1, h2, h3, h4, h5, h6'):
492
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
493
- if heading_text and len(heading_text) > 3:
494
- heading_tag = heading.root.tag
495
- content["paragraphs"].append(f"[{heading_tag.upper()}] {heading_text}")
496
-
497
- # Extract paragraphs
498
- for p in main_content.css('p'):
499
- text = self.clean_text(' '.join(p.css('::text').getall()))
500
- if text and len(text) > 10: # Reduced minimum meaningful length
501
- # Add any links found in this paragraph
502
- links = []
503
- for a in p.css('a'):
504
- link_text = self.clean_text(' '.join(a.css('::text').getall()))
505
- link_url = a.css('::attr(href)').get()
506
- if link_text and link_url:
507
- links.append(f"{link_text} (Link: {response.urljoin(link_url)})")
508
-
509
- paragraph = text
510
- if links:
511
- paragraph += f" | Links: {'; '.join(links)}"
512
-
513
- content["paragraphs"].append(paragraph)
514
-
515
- # Extract list items
516
- for li in main_content.css('li'):
517
- text = self.clean_text(' '.join(li.css('::text').getall()))
518
- if text and len(text) > 10:
519
- content["paragraphs"].append(f"• {text}")
520
-
521
- # If no structured text elements found, try general text extraction
522
- if not content["paragraphs"]:
523
- # Get all text nodes within divs but not within scripts or styles
524
- for div in main_content.css('div'):
525
- text = self.clean_text(' '.join(div.xpath('./text()').getall()))
526
- if text and len(text) > 30:
527
- content["paragraphs"].append(text)
528
-
529
- # Extract tables
530
- for table in main_content.css('table'):
531
- rows = []
532
-
533
- # Get header if it exists
534
- headers = []
535
- for th in table.css('thead th, tr th'):
536
- header_text = self.clean_text(' '.join(th.css('::text').getall()))
537
- if header_text:
538
- headers.append(header_text)
539
-
540
- if headers:
541
- rows.append(" - ".join(headers))
542
-
543
- # Get table body rows
544
- for tr in table.css('tbody tr, tr'):
545
- if tr.css('th') and not tr.css('td'):
546
- continue # Skip header rows already processed
547
-
548
- cells = []
549
- for td in tr.css('td'):
550
- cell_text = self.clean_text(' '.join(td.css('::text').getall()))
551
- link = td.css('a::attr(href)').get()
552
- if link:
553
- cell_text += f" (Link: {response.urljoin(link)})"
554
- if cell_text:
555
- cells.append(cell_text)
556
- else:
557
- cells.append(" ") # Empty cell placeholder
558
-
559
- if cells:
560
- rows.append(" - ".join(cells))
561
-
562
- if len(rows) > 1: # Only add if we have meaningful table
563
- content["tables"].append("\n".join(rows))
564
-
565
- # Extract downloads and files
566
- for link in main_content.css('a[href]'):
567
- href = link.css('::attr(href)').get()
568
- if not href:
569
- continue
570
-
571
- link_text = self.clean_text(' '.join(link.css('::text').getall()))
572
- if not link_text:
573
- link_text = "Unduhan"
574
-
575
- # Match common document formats
576
- if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
577
- # Extract file extension for better categorization
578
- file_ext = href.split('.')[-1].lower()
579
- content["files"].append({
580
- "title": link_text,
581
- "url": urljoin(response.url, href),
582
- "type": file_ext
583
- })
584
-
585
- return content if any(value for value in content.values()) else None
586
-
587
- def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
588
- """Save a page's content as a formatted text file"""
589
- if not content or not title:
590
- return
591
-
592
- # Clean up title for filename
593
- safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
594
- safe_title = re.sub(r'[-\s]+', '-', safe_title)[:50] # Limit filename length
595
-
596
- # Prepare the content
597
- formatted_content = f"""# {title}
598
-
599
- URL: {url}
600
- Tanggal: {datetime.now().strftime('%d %B %Y')}
601
- Jurusan: {department}
602
- Kategori: {category}
603
- """
604
-
605
- if menu_path:
606
- formatted_content += f"Navigasi: {menu_path}\n"
607
-
608
- formatted_content += "\n## Konten\n\n"
609
- if content["paragraphs"]:
610
- formatted_content += "\n".join(content["paragraphs"])
611
-
612
- if content["tables"]:
613
- formatted_content += "\n\n## Tabel Data\n\n"
614
- for i, table in enumerate(content["tables"]):
615
- formatted_content += f"### Tabel {i+1}\n{table}\n\n"
616
-
617
- if content["files"]:
618
- formatted_content += "\n\n## Berkas\n\n"
619
- for file in content["files"]:
620
- formatted_content += f"- {file['title']} [{file['type']}]: {file['url']}\n"
621
-
622
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
623
- # Generate filename with department prefix
624
- filename = f"{department}_{safe_title}_{timestamp}.txt"
625
-
626
- # Upload file to Supabase
627
- try:
628
- upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
629
- path=filename,
630
- file=formatted_content.encode('utf-8'),
631
- file_options={"content-type": "text/plain", "x-upsert": "true"}
632
- )
633
-
634
- self.logger.info(f"Successfully uploaded {filename}")
635
-
636
- # Store in our collection for later summaries
637
- self.department_data[department][category].append({
638
- 'title': title,
639
- 'url': url,
640
- 'filename': filename
641
- })
642
-
643
- except Exception as e:
644
- self.logger.error(f"Upload failed for {filename}: {str(e)}")
645
-
646
- def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
647
- """Save vision & mission as a separate well-formatted file"""
648
- filename = f"{department}_Visi_Misi.txt"
649
-
650
- content = f"""# Visi dan Misi {department}
651
-
652
- URL: {url}
653
- Tanggal: {datetime.now().strftime('%d %B %Y')}
654
- Jurusan: {department}
655
-
656
- """
657
-
658
- if vision:
659
- content += f"## Visi\n\n{vision}\n\n"
660
-
661
- if mission:
662
- content += f"## Misi\n\n{mission}\n\n"
663
-
664
- if mission_items:
665
- if not mission: # Only add header if not already added
666
- content += "## Misi\n\n"
667
- for i, item in enumerate(mission_items, 1):
668
- content += f"{i}. {item}\n"
669
-
670
- try:
671
- # Remove existing file if it exists
672
- try:
673
- self.supabase.storage.from_(self.storage_bucket).remove(filename)
674
- except:
675
- pass
676
-
677
- upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
678
- path=filename,
679
- file=content.encode('utf-8'),
680
- file_options={"content-type": "text/plain", "x-upsert": "true"}
681
- )
682
-
683
- self.logger.info(f"Successfully uploaded {filename}")
684
- except Exception as e:
685
- self.logger.error(f"Upload failed for {filename}: {str(e)}")
686
-
687
- def clean_text(self, text):
688
- """Clean and normalize text"""
689
- if not text:
690
- return ""
691
-
692
- # Normalize unicode characters
693
- text = unicodedata.normalize('NFKC', text)
694
-
695
- # Replace multiple spaces with single space
696
- text = re.sub(r'\s+', ' ', text)
697
-
698
- # Remove special characters and non-printable characters
699
- text = re.sub(r'[^\x20-\x7E\s\u00A0-\u00FF\u0100-\u017F]', '', text)
700
-
701
- # Remove multiple periods
702
- text = re.sub(r'\.{2,}', ' ', text)
703
-
704
- return text.strip()
705
-
706
- def determine_category(self, menu_text):
707
- """Determine content category based on menu text"""
708
- menu_lower = menu_text.lower()
709
-
710
- # Define category mappings
711
- categories = {
712
- 'Beranda': ['beranda', 'home', 'utama', 'main', 'index'],
713
- 'Profil': ['profil', 'profile', 'tentang', 'about', 'visi', 'misi', 'sejarah', 'history', 'struktur', 'organisasi', 'pimpinan', 'sambutan'],
714
- 'Program_Studi': ['program', 'studi', 'prodi', 'd3', 'd4', 'diploma', 'sarjana', 'akademik', 'jurusan', 'kurikulum'],
715
- 'Dosen': ['dosen', 'staff', 'tenaga', 'pengajar', 'lecturer', 'faculty'],
716
- 'Penelitian': ['penelitian', 'research', 'jurnal', 'karya', 'ilmiah', 'publikasi', 'paper'],
717
- 'Mahasiswa': ['mahasiswa', 'student', 'alumni', 'lulusan', 'graduate', 'kegiatan', 'activity', 'kemahasiswaan'],
718
- 'Fasilitas': ['fasilitas', 'facility', 'lab', 'laboratorium', 'gedung', 'building', 'sarana', 'prasarana'],
719
- 'Informasi': ['informasi', 'info', 'pengumuman', 'announcement', 'agenda', 'berita', 'news', 'event'],
720
- 'Kerjasama': ['kerjasama', 'cooperation', 'mitra', 'partner', 'industri', 'industry', 'collaboration'],
721
- 'Dokumen': ['dokumen', 'document', 'unduhan', 'download', 'berkas', 'file']
722
- }
723
-
724
- # Check each category
725
- for category, terms in categories.items():
726
- if any(term in menu_lower for term in terms):
727
- return category
728
-
729
- # Default category if no match
730
- return 'Lainnya'
731
-
732
- def is_social_media_link(self, url):
733
- social_patterns = [
734
- 'facebook.com', 'twitter.com', 'instagram.com',
735
- 'youtube.com', 'linkedin.com', 'pinterest.com',
736
- 'tiktok.com', 'wa.me', 'whatsapp.com', 't.me'
737
- ]
738
- return any(pattern in url.lower() for pattern in social_patterns)
739
-
740
- def is_unwanted_url(self, url):
741
- """Check if URL should be skipped"""
742
- # Skip certain file types
743
- if re.search(r'\.(jpg|jpeg|png|gif|svg|ico|css|js)$', url.lower()):
744
- return True
745
-
746
- # Skip certain URL patterns
747
- unwanted_patterns = [
748
- 'login', 'logout', 'signin', 'signup', 'register', 'admin',
749
- 'wp-', '/wp/', 'wordpress', 'comment', 'feed', 'rss', 'atom',
750
- 'javascript:', 'mailto:', 'tel:', 'page/', '/tag/', '/author/',
751
- '/archive/', '/category/', '/search', 'kalender', '/ajax/', '/api/'
752
- ]
753
-
754
- return any(pattern in url.lower() for pattern in unwanted_patterns)
755
-
756
- def handle_error(self, failure):
757
- """Handle request errors"""
758
- url = failure.request.url
759
- self.visited_urls.add(url) # Mark as visited to prevent retries
760
- self.logger.error(f"Request failed: {url} - {str(failure.value)}")
761
-
762
- def closed(self, reason):
763
- """Finalize processing when spider is closed"""
764
- self.logger.info("Spider closed. Generating summary report...")
765
-
766
- # Log statistics
767
- departments_count = len(self.department_data)
768
- pages_count = sum(len(cat_data) for dept_data in self.department_data.values()
769
- for cat_data in dept_data.values())
770
-
771
- self.logger.info(f"Crawled {departments_count} departments and {pages_count} pages")
772
- self.logger.info(f"Found {len(self.study_programs)} departments with programs")
773
-
774
- for dept, programs in self.study_programs.items():
775
- self.logger.info(f"{dept}: {len(programs)} programs")
776
-
777
- # Generate and upload the summary file
778
- self.generate_summary_file()
779
-
780
- def generate_summary_file(self):
781
- """Generate comprehensive summary with program metadata"""
782
- content = """# Daftar Lengkap Jurusan dan Program Studi Politeknik Negeri Padang\n\n"""
783
- content += f"**Terakhir diperbarui**: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"
784
-
785
- # Create reverse mapping from department name to domain
786
- reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
787
-
788
- for department, programs in self.study_programs.items():
789
- # Get domain from reverse mapping
790
- domain = reverse_departments.get(department, '')
791
- website_url = f'https://{domain}' if domain else 'URL tidak ditemukan'
792
-
793
- content += f"## {department.replace('_', ' ')}\n"
794
- content += f"**Website**: {website_url}\n\n"
795
-
796
- if programs:
797
- for prog in programs:
798
- content += f"### {prog['title']}\n"
799
- content += f"- **Jenjang**: {prog['details'].get('degree', 'N/A')}\n"
800
- content += f"- **Akreditasi**: {prog['details'].get('accreditation', 'N/A')}\n"
801
- content += f"- **URL**: {prog['url']}\n"
802
-
803
- if 'description' in prog['details']:
804
- desc = prog['details']['description']
805
- content += f"\n**Deskripsi**:\n{desc}\n"
806
-
807
- content += "\n"
808
- else:
809
- content += "### Belum ada informasi program studi\n"
810
-
811
- content += "\n---\n\n"
812
-
813
- # Upload to Supabase
814
- filename = "Daftar_Jurusan_dan_Prodi_Politeknik_Negeri_Padang.txt"
815
- try:
816
- self.supabase.storage.from_(self.storage_bucket).remove(filename)
817
- self.supabase.storage.from_(self.storage_bucket).upload(
818
  path=filename,
819
- file=content.encode('utf-8'),
820
- file_options={"content-type": "text/plain", "x-upsert": "true"}
821
  )
822
- self.logger.info("Ringkasan jurusan berhasil diunggah")
823
- except Exception as e:
824
- self.logger.error(f"Gagal mengupload file ringkasan: {str(e)}")
825
-
826
 
827
- # Main execution
828
  if __name__ == "__main__":
829
- process = CrawlerProcess()
830
- process.crawl(PNPDepartmentSpider)
831
- process.start()
 
1
  import scrapy
2
  from scrapy.crawler import CrawlerProcess
3
+ from bs4 import BeautifulSoup
4
  from dotenv import load_dotenv
5
+ from supabase import create_client
6
+ from datetime import datetime
7
+ import os, re
8
+
9
+ # Load .env.production
10
+ load_dotenv(".env.production")
11
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
12
+ SUPABASE_KEY = os.getenv("SUPABASE_KEY")
13
+
14
+ supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
15
+
16
+ JURUSAN_URLS = {
17
+ 'akt.pnp.ac.id': 'Akuntansi',
18
+ 'an.pnp.ac.id': 'Administrasi_Niaga',
19
+ 'bing.pnp.ac.id': 'Bahasa_Inggris',
20
+ 'elektro.pnp.ac.id': 'Teknik_Elektro',
21
+ 'me.pnp.ac.id': 'Teknik_Mesin',
22
+ 'sipil.pnp.ac.id': 'Teknik_Sipil',
23
+ 'ti.pnp.ac.id': 'Teknologi_Informasi',
24
+ }
25
+
26
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
27
+ PRODI_LIST = []
28
+ TOTAL_PRODI = 0
29
+
30
+ class JurusanSpider(scrapy.Spider):
31
+ name = "jurusan"
32
+ start_urls = [f"https://{url}/" for url in JURUSAN_URLS.keys()]
33
+
34
+ def parse(self, response):
35
+ jurusan_domain = response.url.split("//")[1].split("/")[0]
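+ # e.g. "https://akt.pnp.ac.id/" -> "akt.pnp.ac.id"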
36
+ jurusan_nama = JURUSAN_URLS[jurusan_domain]
37
+ url = response.url
38
+
39
+ # Extract all visible text
40
+ soup = BeautifulSoup(response.text, "html.parser")
41
+ for tag in soup(["script", "style", "noscript"]):
42
+ tag.decompose()
43
+ visible_text = soup.get_text(separator="\n")
44
+
45
+ # Clean text
46
+ lines = [line.strip() for line in visible_text.splitlines()]
47
+ lines = [line for line in lines if line and not re.match(r'^\W+$', line)]
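+ # keep only non-empty lines that contain at least one word character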
48
+ text_cleaned = "\n".join(lines)
49
+
50
+ # Extract Program Studi menu
51
+ program_studi = []
52
+ menu_elements = soup.find_all("a", string=re.compile("program studi", re.I))
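+ # each "Program Studi" anchor is expected to be followed by a <ul> submenu listing the programs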
53
+ for menu in menu_elements:
54
+ ul = menu.find_next("ul")
55
+ if ul:
56
+ lis = ul.find_all("li")
57
+ for li in lis:
58
+ item = li.get_text(strip=True)
59
+ if item and item not in program_studi:
60
+ program_studi.append(item)
61
+
62
+ # Store the results globally for the recap file
63
+ PRODI_LIST.append((jurusan_nama, program_studi))
64
+ global TOTAL_PRODI
65
+ TOTAL_PRODI += len(program_studi)
66
+
67
+ # Build the output filename
68
+ filename = f"{jurusan_nama.upper()}_{timestamp}.txt"
69
+ with open(filename, "w", encoding="utf-8") as f:
70
+ f.write(f"JURUSAN: {jurusan_nama.replace('_', ' ').upper()}\n")
71
+ f.write(f"URL: {url}\n\n")
72
+
73
+ f.write("== PROGRAM STUDI ==\n")
74
+ for i, item in enumerate(program_studi, 1):
75
+ f.write(f"{i}. {item}\n")
76
+ f.write(f"\nTotal Program Studi: {len(program_studi)}\n\n")
77
+
78
+ f.write("== PROFIL ==\n")
79
+ f.write(text_cleaned.strip()[:8000])  # Cap the content so the file does not grow too large
80
+ f.write("\n\n")
81
+
82
+ print(f"📄 Saved {filename}")
83
+
84
+ # Upload to Supabase
85
+ with open(filename, "rb") as f_upload:
86
+ supabase.storage.from_("documents").upload(
87
+ file=f_upload,
88
  path=filename,
89
+ file_options={"content-type": "text/plain", "x-upsert": "true"},
90
+ # upsert is requested via the "x-upsert" header; upload() takes no upsert keyword
91
  )
92
+ print(f" Uploaded to Supabase: {filename}")
93
+
94
+ def run_spider():
95
+ process = CrawlerProcess(settings={
96
+ "LOG_LEVEL": "ERROR",
97
+ "USER_AGENT": "Mozilla/5.0",
98
+ })
99
+ process.crawl(JurusanSpider)
100
+ process.start()
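+ # process.start() blocks until the crawl finishes, so the recap below is written only after all spiders are done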
101
+
102
+ # Save a recap of study programs across all departments
103
+ rekap_file = f"REKAP_PROGRAM_STUDI_{timestamp}.txt"
104
+ with open(rekap_file, "w", encoding="utf-8") as f:
105
+ total = 0
106
+ for jurusan, daftar in PRODI_LIST:
107
+ f.write(f"{jurusan.replace('_', ' ')}:\n")
108
+ for p in daftar:
109
+ f.write(f"- {p}\n")
110
+ f.write(f"Jumlah: {len(daftar)}\n\n")
111
+ total += len(daftar)
112
+ f.write(f"TOTAL PROGRAM STUDI: {total}\n")
113
+
114
+ # Upload the recap
115
+ with open(rekap_file, "rb") as f_rekap:
116
+ supabase.storage.from_("documents").upload(
117
+ file=f_rekap,
118
+ path=rekap_file,
119
+ file_options={"content-type": "text/plain", "x-upsert": "true"},
120
+ # upsert via the "x-upsert" header, as above
121
+ )
122
+ print(f"✅ Uploaded REKAP: {rekap_file}")
123
 
 
124
  if __name__ == "__main__":
125
+ run_spider()