FauziIsyrinApridal committed
Commit a972bc4 · 1 Parent(s): 66353e5

update pnp leadership structure and departments

Files changed (2)
  1. scrapping/jurusan_scrap.py +111 -329
  2. scrapping/pnp_scrap.py +38 -12
scrapping/jurusan_scrap.py CHANGED
@@ -5,7 +5,6 @@ from dotenv import load_dotenv
 from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
-import logging
 
 load_dotenv()
 
@@ -13,26 +12,24 @@ SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
 SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")
 
 def is_valid_prodi(nama):
-    """Validate if a string represents a valid study program name"""
-    if not nama or len(nama.strip()) < 3:
-        return False
-
-    pattern = r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b'
-    return bool(re.match(pattern, nama.strip(), re.I))
 
 class JurusanSpider(scrapy.Spider):
     name = "jurusan"
     custom_settings = {
-        'DOWNLOAD_DELAY': 2,
         'USER_AGENT': 'PNPBot/1.0',
         'ROBOTSTXT_OBEY': True,
         'LOG_LEVEL': 'INFO',
         'CONCURRENT_REQUESTS': 1,
-        'DOWNLOAD_TIMEOUT': 100,
-        'RETRY_TIMES': 3,
-        'DEPTH_LIMIT': 3,  # Prevent infinite crawling
-        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
     }
 
     domain_to_name = {
@@ -45,99 +42,46 @@ class JurusanSpider(scrapy.Spider):
         'ti.pnp.ac.id': 'Teknologi_Informasi',
     }
 
     start_urls = [f"https://{d}/" for d in domain_to_name.keys()]
 
-    def __init__(self, *args, **kwargs):
-        super(JurusanSpider, self).__init__(*args, **kwargs)
-
-        # Validate environment variables
-        if not all([SUPABASE_URL, SUPABASE_KEY]):
-            raise ValueError("Missing required environment variables: SUPABASE_URL, SUPABASE_KEY")
-
-        try:
-            self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
-        except Exception as e:
-            self.logger.error(f"Failed to initialize Supabase client: {e}")
-            raise
-
         self.bucket = SUPABASE_BUCKET
         self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
         self.per_jurusan_pages = {}
         self.rekap_prodi = {}
-        self.processed_urls = set()  # Track processed URLs
 
     def parse(self, response):
-        """Parse main department pages"""
-        if response.status != 200:
-            self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
-            return
-
         domain = response.url.split("//")[1].split("/")[0]
         jurusan = self.domain_to_name.get(domain, domain)
-
-        try:
-            soup = BeautifulSoup(response.text, "html.parser")
-        except Exception as e:
-            self.logger.error(f"Failed to parse HTML from {response.url}: {e}")
-            return
 
         program_studi = []
 
-        # Extract study programs
-        for a_tag in soup.find_all("a"):
-            try:
-                item = a_tag.get_text(strip=True)
-                href = a_tag.get("href")
-
-                if item and is_valid_prodi(item) and item not in program_studi:
-                    program_studi.append(item)
-                    if href:
-                        prodi_url = response.urljoin(href)
-                        if prodi_url not in self.processed_urls:
-                            self.processed_urls.add(prodi_url)
-                            self.logger.info(f"[🧩] Found prodi: {item} ({prodi_url}) in {jurusan}")
-                            yield scrapy.Request(
-                                prodi_url,
-                                callback=self.parse_detail,
-                                meta={"jurusan": jurusan, "url": prodi_url},
-                                dont_filter=False
-                            )
-            except Exception as e:
-                self.logger.warning(f"Error processing link in {response.url}: {e}")
-                continue
 
-        # Store initial results
         self.rekap_prodi[jurusan] = program_studi
 
-        # Follow internal links with better filtering
         for a in soup.find_all("a", href=True):
-            try:
-                href = a["href"]
-                full_url = None
-
-                if href.startswith("http") and domain in href:
-                    full_url = href
-                elif href.startswith("/"):
-                    full_url = response.urljoin(href)
-
-                if full_url and full_url not in self.processed_urls:
-                    # Skip certain file types and external links
-                    if any(ext in full_url.lower() for ext in ['.pdf', '.doc', '.xls', '.ppt', '.jpg', '.png', '.gif']):
-                        continue
-
-                    self.processed_urls.add(full_url)
-                    yield scrapy.Request(
-                        full_url,
-                        callback=self.parse_detail,
-                        meta={"jurusan": jurusan, "url": full_url}
-                    )
-            except Exception as e:
-                self.logger.warning(f"Error processing internal link: {e}")
-                continue
 
-    def clean_html(self, soup):
-        """Clean HTML content by removing unwanted elements"""
-        # Remove unwanted elements
         for selector in [
             'header', 'footer', 'nav', 'aside', 'menu',
             '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
@@ -149,280 +93,118 @@ class JurusanSpider(scrapy.Spider):
             for tag in soup.select(selector):
                 tag.decompose()
 
-        # Remove empty containers
         for element in soup.find_all(True):
             if not element.get_text(strip=True) and not element.find_all(True):
                 element.decompose()
 
-    def parse_detail(self, response):
-        """Parse detailed pages"""
-        if response.status != 200:
-            return
-
-        jurusan = response.meta["jurusan"]
-        url = response.meta["url"]
-
-        try:
-            soup = BeautifulSoup(response.text, "html.parser")
-        except Exception as e:
-            self.logger.error(f"Failed to parse HTML from {url}: {e}")
-            return
-
-        self.clean_html(soup)
-
         title_tag = soup.find("title") or soup.find("h1")
         page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"
 
-        # Handle specific TI pages
-        if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
-            content_text = self.parse_ti_dosen_page(soup, url)
-        elif url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
-            content_text = self.parse_ti_leadership_page(soup, url)
-        else:
-            content_text = self.parse_general_page(soup, url, jurusan, page_title)
-
-        if content_text:
-            self.per_jurusan_pages.setdefault(jurusan, []).append({
-                "url": url,
-                "title": page_title,
-                "content": content_text
-            })
-
-    def parse_ti_dosen_page(self, soup, url):
-        """Parse TI dosen page specifically"""
-        dosen_list = []
-
-        # Find names in gallery captions
-        for nama_tag in soup.find_all("dd", class_="wp-caption-text"):
-            nama = nama_tag.get_text(strip=True)
-            if nama and nama not in dosen_list:
-                dosen_list.append(nama)
-
-        # Create narrative text
-        naratif = ["## Daftar Dosen dan Staf Pengajar"]
-        for nama in dosen_list:
-            naratif.append(f"- {nama}")
-
-        return f"""# Dosen dan Staf Pengajar Teknologi Informasi
-
-URL: {url}
-Jurusan: Teknologi Informasi
-Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
-
-{chr(10).join(naratif)}"""
 
-    def parse_ti_leadership_page(self, soup, url):
-        """Parse TI leadership page specifically"""
-        leadership_data = {
-            "Pimpinan Jurusan": [],
-            "Koordinator Program Studi": [],
-            "Kepala Labor": []
-        }
-
-        # Extract all member items
-        member_items = soup.find_all(class_="member-item")
-
-        for member in member_items:
-            try:
                 name_tag = member.find(class_="item-title")
                 name = name_tag.get_text(strip=True) if name_tag else "N/A"
-
                 position_tag = member.find(class_="small-text")
                 position = position_tag.get_text(strip=True) if position_tag else "N/A"
-
                # Categorize based on position
-                if any(role in position for role in ["Ketua Jurusan", "Sekretaris Jurusan"]):
                     leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
-                elif any(role in position for role in ["Koordinator Program Studi", "Koordinator PSDKU"]):
                     leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
                 elif "Kepala Labor" in position:
                     leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})
-            except Exception as e:
-                self.logger.warning(f"Error parsing member item: {e}")
-                continue
-
-        # Generate narrative
-        naratif = []
-        for section, members in leadership_data.items():
-            if members:
-                naratif.append(f"\n## {section}")
-                for member in members:
-                    naratif.append(f"- {member['jabatan']}: {member['nama']}")
-
-        return f"""# Pimpinan Jurusan Teknologi Informasi
-
-URL: {url}
-Jurusan: Teknologi Informasi
-Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
-
-{chr(10).join(naratif)}"""
-
-    def parse_general_page(self, soup, url, jurusan, page_title):
-        """Parse general pages"""
-        body_text = []
-        for element in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
-            txt = element.get_text(strip=True)
-            if txt and len(txt) > 10:  # Filter out very short text
-                body_text.append(txt)
-
-        content_text = f"""# {page_title}
-
-URL: {url}
-Jurusan: {jurusan.replace('_', ' ')}
-Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
-
-{chr(10).join(body_text)}"""
-
-        # Extract faculty information from tables
-        dosen_entries = self.extract_faculty_info(soup)
-        if dosen_entries:
-            content_text += f"\n\n## Daftar Dosen\n\n{chr(10).join(dosen_entries)}"
 
-        # Add general tables
-        tables_content = self.extract_tables(soup)
-        if tables_content:
-            content_text += f"\n\n## Tabel Data\n\n{tables_content}"
 
-        return content_text
 
-    def extract_faculty_info(self, soup):
-        """Extract faculty information from tables and text"""
-        dosen_entries = []
-
-        # Extract from tables
-        for table in soup.find_all("table"):
-            try:
-                headers = [th.get_text(strip=True).lower() for th in table.find_all("th")]
-                if any(keyword in " ".join(headers) for keyword in ["dosen", "jabatan", "nip", "nama"]):
-                    for row in table.find_all("tr")[1:]:
-                        cols = row.find_all(["td", "th"])
-                        if len(cols) >= 1:
-                            nama_dosen = cols[0].get_text(strip=True)
-                            jabatan = cols[1].get_text(strip=True) if len(cols) > 1 else "-"
-                            if nama_dosen and len(nama_dosen) > 3:
-                                dosen_entries.append(f"Nama: {nama_dosen} | Jabatan: {jabatan}")
-            except Exception as e:
-                self.logger.warning(f"Error extracting faculty from table: {e}")
-                continue
 
-        return list(set(dosen_entries))  # Remove duplicates
 
-    def extract_tables(self, soup):
-        """Extract table data"""
-        tables_content = []
-
-        for i, table in enumerate(soup.find_all("table")):
            try:
-                table_data = [f"### Tabel {i+1}"]
-                for row in table.find_all("tr"):
-                    cols = row.find_all(["td", "th"])
-                    if cols:
-                        row_data = [col.get_text(strip=True) for col in cols]
-                        table_data.append(" | ".join(row_data))
-
-                if len(table_data) > 1:  # Only add if table has content
-                    tables_content.extend(table_data)
-                    tables_content.append("")  # Add spacing
            except Exception as e:
-                self.logger.warning(f"Error extracting table {i}: {e}")
-                continue
-
-        return "\n".join(tables_content)
 
-    def upload_to_supabase(self, filename, content, content_type="text/plain"):
-        """Upload content to Supabase storage"""
        try:
            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
-                f.write(content)
                temp_path = f.name
-
-            result = self.supabase.storage.from_(self.bucket).upload(
-                path=filename,
-                file=temp_path,
-                file_options={"content-type": content_type}
-            )
-
-            self.logger.info(f"✅ Uploaded: {filename}")
-            return True
-
        except Exception as e:
-            self.logger.error(f"❌ Upload failed for {filename}: {e}")
-            return False
        finally:
-            if 'temp_path' in locals() and os.path.exists(temp_path):
                os.remove(temp_path)
 
-    def closed(self, reason):
-        """Called when spider closes"""
-        self.logger.info(f"Spider closed: {reason}")
-
-        # Upload files per department
-        for jurusan, pages in self.per_jurusan_pages.items():
-            if not pages:
-                continue
-
-            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
-            content = ""
-
-            for page in pages:
-                content += f"{page['content']}\n\n---\n\n"
-
-            self.upload_to_supabase(filename, content)
-
-        # Create and upload summary
-        self.create_and_upload_summary()
-
-    def create_and_upload_summary(self):
-        """Create and upload program study summary"""
-        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
-
-        content_lines = [
-            f"# REKAP PROGRAM STUDI PNP",
-            f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}",
-            ""
-        ]
-
-        total_prodi = 0
-        jumlah_jurusan = 0
-
-        for jurusan, daftar in self.rekap_prodi.items():
-            valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
-
-            if not valid_prodi:
-                continue
-
-            jurusan_display = jurusan.replace("_", " ")
-            content_lines.append(f"## {jurusan_display}:")
-
-            for prodi in sorted(set(valid_prodi)):
-                content_lines.append(f"- {prodi}")
-
-            jumlah_prodi = len(set(valid_prodi))
-            content_lines.append(f"Jumlah program studi: {jumlah_prodi}")
-            content_lines.append("")
-
-            total_prodi += jumlah_prodi
-            jumlah_jurusan += 1
-
-        content_lines.extend([
-            f"**Total Jurusan di PNP: {jumlah_jurusan}**",
-            f"**Total Program Studi di PNP: {total_prodi}**"
-        ])
-
-        content = "\n".join(content_lines)
-        self.upload_to_supabase(rekap_filename, content)
 
 if __name__ == "__main__":
-    # Add logging configuration
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
-    )
-
-    try:
-        process = CrawlerProcess()
-        process.crawl(JurusanSpider)
-        process.start()
-    except Exception as e:
-        logging.error(f"Failed to run spider: {e}")
-        raise

 from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
 
 load_dotenv()
 
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
 SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")
 
+
 def is_valid_prodi(nama):
+    return bool(re.match(
+        r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b',
+        nama, re.I
+    ))
+
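
For reference, the tightened validator only checks that a name opens with a recognized degree prefix (D2–D4, Diploma 2–4, Magister, Sarjana Terapan, Teknologi Rekayasa, or Prodi D3). A quick sketch of its behavior; the import path is hypothetical:

    from jurusan_scrap import is_valid_prodi  # hypothetical import, for illustration only

    assert is_valid_prodi("D3 Teknik Mesin")             # "D" + digit prefix matches
    assert is_valid_prodi("Sarjana Terapan Akuntansi")   # listed keyword prefix matches
    assert not is_valid_prodi("Beranda")                 # ordinary menu text is rejected

Unlike the removed version, this one no longer strips the input or rejects None/short strings, so callers are expected to pass non-empty link text.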
 
 class JurusanSpider(scrapy.Spider):
     name = "jurusan"
     custom_settings = {
+        'DOWNLOAD_DELAY': 1,
         'USER_AGENT': 'PNPBot/1.0',
         'ROBOTSTXT_OBEY': True,
         'LOG_LEVEL': 'INFO',
+        'HTTPCACHE_ENABLED': False,
         'CONCURRENT_REQUESTS': 1,
+        'RETRY_TIMES': 3
     }
 
     domain_to_name = {
 
         'ti.pnp.ac.id': 'Teknologi_Informasi',
     }
 
+    allowed_domains = list(domain_to_name.keys())
     start_urls = [f"https://{d}/" for d in domain_to_name.keys()]
 
+    def __init__(self):
+        self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
         self.bucket = SUPABASE_BUCKET
         self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
         self.per_jurusan_pages = {}
         self.rekap_prodi = {}
 
     def parse(self, response):
         domain = response.url.split("//")[1].split("/")[0]
         jurusan = self.domain_to_name.get(domain, domain)
+        soup = BeautifulSoup(response.text, "html.parser")
 
         program_studi = []
 
+        for a_tag in soup.find_all("a", href=True):
+            item = a_tag.get_text(strip=True)
+            href = a_tag["href"]
+            if item and is_valid_prodi(item) and item not in program_studi:
+                program_studi.append(item)
+                prodi_url = response.urljoin(href)
+                self.logger.info(f"[🧩] Ditemukan prodi: {item} ({prodi_url}) di jurusan {jurusan}")
+                yield response.follow(href, callback=self.parse_detail,
+                                      meta={"jurusan": jurusan, "url": prodi_url})
 
         self.rekap_prodi[jurusan] = program_studi
 
+        # Follow all internal links
         for a in soup.find_all("a", href=True):
+            yield response.follow(a["href"], callback=self.parse_detail,
+                                  meta={"jurusan": jurusan, "url": response.urljoin(a["href"])})
+
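
With the manual processed_urls set removed, deduplication and scope control now rest on Scrapy's defaults: the scheduler's RFPDupeFilter (which the old settings named explicitly) drops repeat URLs, and the new allowed_domains list lets the offsite middleware discard external links. A minimal sketch of the same pattern on a hypothetical domain:

    import scrapy

    class FollowAllSpider(scrapy.Spider):
        name = "follow_all"
        allowed_domains = ["example.com"]        # offsite requests are filtered out
        start_urls = ["https://example.com/"]

        def parse(self, response):
            for href in response.css("a::attr(href)").getall():
                # response.follow resolves relative URLs; the default
                # RFPDupeFilter skips any URL already scheduled
                yield response.follow(href, callback=self.parse)

Because every page re-yields every link, the crawl terminates only thanks to that dupe filter; disabling it (or passing dont_filter=True) would loop forever.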
+    def parse_detail(self, response):
+        jurusan = response.meta["jurusan"]
+        url = response.meta["url"]
+        soup = BeautifulSoup(response.text, "html.parser")
 
+        # Strip elements that are not needed
         for selector in [
             'header', 'footer', 'nav', 'aside', 'menu',
             '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
 
             for tag in soup.select(selector):
                 tag.decompose()
 
         for element in soup.find_all(True):
             if not element.get_text(strip=True) and not element.find_all(True):
                 element.decompose()
 
         title_tag = soup.find("title") or soup.find("h1")
         page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"
 
+        # Always initialize content_text
+        content_text = f"# {page_title}\nURL: {url}\nJurusan: {jurusan}\nTanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"
 
+        # Special case: the TI lecturer page
+        if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
+            dosen_list = [n.get_text(strip=True) for n in soup.find_all("dd", class_="wp-caption-text") if n.get_text(strip=True)]
+            naratif = ["## Daftar Dosen dan Staf Pengajar"] + [f"- {n}" for n in dosen_list]
+            content_text += "\n".join(naratif)
+            self.per_jurusan_pages.setdefault(jurusan, []).append({"url": url, "title": "Dosen dan Staf Pengajar Teknologi Informasi", "content": content_text})
+            return
 
+        # Special case: the TI department leadership page
+        if url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
+            leadership_data = {"Pimpinan Jurusan": [], "Koordinator Program Studi": [], "Kepala Labor": []}
+            for member in soup.find_all(class_="member-item"):
                 name_tag = member.find(class_="item-title")
                 name = name_tag.get_text(strip=True) if name_tag else "N/A"
                 position_tag = member.find(class_="small-text")
                 position = position_tag.get_text(strip=True) if position_tag else "N/A"
+                if "Ketua Jurusan" in position or "Sekretaris Jurusan" in position:
                     leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
+                elif "Koordinator Program Studi" in position or "Koordinator PSDKU" in position:
                     leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
                 elif "Kepala Labor" in position:
                     leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})
 
+            naratif = ["## Pimpinan Jurusan"] + [f"- {x['jabatan']}: {x['nama']}" for x in leadership_data["Pimpinan Jurusan"]]
+            naratif += ["\n## Koordinator Program Studi"] + [f"- {x['jabatan']}: {x['nama']}" for x in leadership_data["Koordinator Program Studi"]]
+            naratif += ["\n## Kepala Labor"] + [f"- {x['jabatan']}: {x['nama']}" for x in leadership_data["Kepala Labor"]]
+            content_text += "\n".join(naratif)
+            self.per_jurusan_pages.setdefault(jurusan, []).append({"url": url, "title": "Pimpinan Jurusan Teknologi Informasi", "content": content_text})
+            return
 
+        # Collect the body text
+        for p in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
+            txt = p.get_text(strip=True)
+            if txt:
+                content_text += txt + "\n"
 
+        # Collect tables
+        for i, table in enumerate(soup.find_all("table")):
+            content_text += f"\n\nTabel {i+1}\n\n"
+            for row in table.find_all("tr"):
+                cols = row.find_all(["td", "th"])
+                row_data = [col.get_text(strip=True) for col in cols]
+                content_text += " | ".join(row_data) + "\n"
 
+        self.per_jurusan_pages.setdefault(jurusan, []).append({"url": url, "title": page_title, "content": content_text})
 
+        # Also follow links from detail pages
+        for a in soup.find_all("a", href=True):
+            yield response.follow(a["href"], callback=self.parse_detail,
+                                  meta={"jurusan": jurusan, "url": response.urljoin(a["href"])})
 
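
The table serializer above simply joins cell text with " | ", one row per line. A quick sketch of what it produces for a small, invented table:

    from bs4 import BeautifulSoup

    html = ("<table><tr><th>Nama</th><th>Jabatan</th></tr>"
            "<tr><td>Budi</td><td>Ketua Labor</td></tr></table>")  # hypothetical data
    soup = BeautifulSoup(html, "html.parser")
    out = ""
    for i, table in enumerate(soup.find_all("table")):
        out += f"\n\nTabel {i+1}\n\n"
        for row in table.find_all("tr"):
            cols = row.find_all(["td", "th"])
            out += " | ".join(col.get_text(strip=True) for col in cols) + "\n"
    print(out)
    # Tabel 1
    #
    # Nama | Jabatan
    # Budi | Ketua Labor
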
+    def closed(self, reason):
+        # Save one file per department (jurusan)
+        for jurusan, pages in self.per_jurusan_pages.items():
+            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
             try:
+                with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                    for page in pages:
+                        f.write(page["content"] + "\n\n---\n\n")
+                    temp_path = f.name
+                self.supabase.storage.from_(self.bucket).upload(path=filename, file=temp_path,
+                                                                file_options={"content-type": "text/plain"})
+                self.logger.info(f"✅ Uploaded file jurusan: {filename}")
             except Exception as e:
+                self.logger.error(f"❌ Gagal upload {filename}: {e}")
+            finally:
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
 
+        # Save the program-study summary (rekap)
+        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
         try:
             with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
+                total_prodi = 0
+                jumlah_jurusan = 0
+                for jurusan, daftar in self.rekap_prodi.items():
+                    valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
+                    if not valid_prodi:
+                        continue
+                    jurusan_baca = jurusan.replace("_", " ")
+                    f.write(f"{jurusan_baca}:\n")
+                    for p in sorted(set(valid_prodi)):
+                        f.write(f"- {p}\n")
+                    jumlah_prodi = len(set(valid_prodi))  # count unique names, matching the deduplicated list above
+                    f.write(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")
+                    total_prodi += jumlah_prodi
+                    jumlah_jurusan += 1
+                f.write(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
+                f.write(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
                 temp_path = f.name
+            self.supabase.storage.from_(self.bucket).upload(path=rekap_filename, file=temp_path,
+                                                            file_options={"content-type": "text/plain"})
+            self.logger.info(f"✅ Uploaded file rekap: {rekap_filename}")
         except Exception as e:
+            self.logger.error(f"❌ Gagal upload rekap: {e}")
         finally:
+            if os.path.exists(temp_path):
                 os.remove(temp_path)
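
Both upload sites above repeat the same temp-file dance, and in each of them temp_path is first assigned inside the try block, so the finally clause can hit a NameError if creating the temporary file itself fails. A helper in the spirit of the removed upload_to_supabase would cover both call sites; a sketch, assuming the same supabase-py storage API already used above:

    def upload_text(self, filename, content, content_type="text/plain"):
        # hypothetical consolidation of the two inlined upload blocks
        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
                f.write(content)
                temp_path = f.name
            self.supabase.storage.from_(self.bucket).upload(
                path=filename, file=temp_path,
                file_options={"content-type": content_type})
            self.logger.info(f"✅ Uploaded: {filename}")
        except Exception as e:
            self.logger.error(f"❌ Upload failed for {filename}: {e}")
        finally:
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)
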
 if __name__ == "__main__":
+    process = CrawlerProcess()
+    process.crawl(JurusanSpider)
+    process.start()

scrapping/pnp_scrap.py CHANGED
@@ -192,26 +192,52 @@ class PNPContentSpider(scrapy.Spider):
             # Simple description format
             content = f"## Pimpinan {idx}\n\n{leader['description']}"
         else:
-            # Structured data format
-            position = leader.get("Posisi", f"Pimpinan {idx}")
-            content = f"## {position}\n\n"
-
-            # Format key information in a logical order
-            ordered_keys = ['Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi']
-
-            # Add ordered information first
-            for key in ordered_keys:
-                if key in leader:
-                    content += f"**{key}**: {leader[key]}\n\n"
-
-            # Add remaining information
             for key, value in leader.items():
-                if key not in ordered_keys and key not in ['Posisi', 'description']:
                     content += f"**{key}**: {value}\n\n"
 
         # Add description if exists
         if 'description' in leader:
-            content += f"\n{leader['description']}\n\n"
 
         formatted_content.append(content.strip())
 
             # Simple description format
             content = f"## Pimpinan {idx}\n\n{leader['description']}"
         else:
+            # Structured data format - create narrative
+            position = leader.get("Posisi", "")
+            nama = leader.get("Nama", "")
+            nidn = leader.get("NIDN", "")
+            jabatan_akademik = leader.get("Jabatan Akademik", "")
+            jurusan = leader.get("Jurusan", "")
+            program_studi = leader.get("Program Studi", "")
 
+            # Create narrative starting with position
+            if position and nama:
+                content = f"## {position}\n\n"
+                narrative = f"{position} Politeknik Negeri Padang adalah {nama}."
+            elif nama:
+                content = f"## Pimpinan {idx}\n\n"
+                narrative = f"Pimpinan ini adalah {nama}."
+            else:
+                content = f"## Pimpinan {idx}\n\n"
+                narrative = "Informasi pimpinan:"
+
+            # Add academic position
+            if jabatan_akademik:
+                narrative += f" Secara akademik, beliau menjabat sebagai {jabatan_akademik}."
+
+            # Add department information
+            if jurusan:
+                narrative += f" Beliau berasal dari Jurusan {jurusan}."
+
+            # Add study program
+            if program_studi:
+                narrative += f" Program studi yang diampu adalah {program_studi}."
+
+            # Add NIDN
+            if nidn:
+                narrative += f" NIDN beliau adalah {nidn}."
 
+            content += narrative + "\n\n"
 
+            # Add any remaining information that wasn't included in the narrative
+            used_keys = ['Posisi', 'Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi', 'description']
             for key, value in leader.items():
+                if key not in used_keys:
                     content += f"**{key}**: {value}\n\n"
 
         # Add description if exists
         if 'description' in leader:
+            content += f"**Informasi Tambahan**: {leader['description']}\n\n"
 
         formatted_content.append(content.strip())
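
To see what the narrative branch produces, here is a quick sketch with an invented leader record (all field values hypothetical):

    leader = {
        "Posisi": "Direktur",
        "Nama": "Dr. Contoh Pimpinan",
        "NIDN": "0000000000",
        "Jabatan Akademik": "Lektor Kepala",
        "Jurusan": "Teknologi Informasi",
    }

    narrative = f"{leader['Posisi']} Politeknik Negeri Padang adalah {leader['Nama']}."
    if leader.get("Jabatan Akademik"):
        narrative += f" Secara akademik, beliau menjabat sebagai {leader['Jabatan Akademik']}."
    if leader.get("Jurusan"):
        narrative += f" Beliau berasal dari Jurusan {leader['Jurusan']}."
    if leader.get("NIDN"):
        narrative += f" NIDN beliau adalah {leader['NIDN']}."
    print(narrative)
    # Direktur Politeknik Negeri Padang adalah Dr. Contoh Pimpinan. Secara akademik,
    # beliau menjabat sebagai Lektor Kepala. Beliau berasal dari Jurusan Teknologi
    # Informasi. NIDN beliau adalah 0000000000.

Fields that are absent or empty are simply skipped, and anything outside the known keys still falls through to the "**key**: value" list, so no scraped information is dropped.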