FauziIsyrinApridal committed · a972bc4 · 1 Parent(s): 66353e5
update pnp pimpinan struktur dan jurusan

Browse files:
- scrapping/jurusan_scrap.py (+111 -329)
- scrapping/pnp_scrap.py (+38 -12)
scrapping/jurusan_scrap.py
CHANGED
Old version (removed lines are prefixed with "-"; lines cut off in the diff view are left as they appear):

@@ -5,7 +5,6 @@ from dotenv import load_dotenv
 from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
-import logging

 load_dotenv()

@@ -13,26 +12,24 @@ SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
 SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")

 def is_valid_prodi(nama):
-    return bool(re.match(pattern, nama.strip(), re.I))

 class JurusanSpider(scrapy.Spider):
     name = "jurusan"
     custom_settings = {
-        'DOWNLOAD_DELAY':
         'USER_AGENT': 'PNPBot/1.0',
         'ROBOTSTXT_OBEY': True,
         'LOG_LEVEL': 'INFO',
         'CONCURRENT_REQUESTS': 1,
-        'RETRY_TIMES': 3,
-        'DEPTH_LIMIT': 3,  # Prevent infinite crawling
-        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
     }

     domain_to_name = {
@@ -45,99 +42,46 @@ class JurusanSpider(scrapy.Spider):
         'ti.pnp.ac.id': 'Teknologi_Informasi',
     }

     start_urls = [f"https://{d}/" for d in domain_to_name.keys()]

-    def __init__(self
-        # Validate environment variables
-        if not all([SUPABASE_URL, SUPABASE_KEY]):
-            raise ValueError("Missing required environment variables: SUPABASE_URL, SUPABASE_KEY")
-        try:
-            self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
-        except Exception as e:
-            self.logger.error(f"Failed to initialize Supabase client: {e}")
-            raise
         self.bucket = SUPABASE_BUCKET
         self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
         self.per_jurusan_pages = {}
         self.rekap_prodi = {}
-        self.processed_urls = set()  # Track processed URLs

     def parse(self, response):
-        """Parse main department pages"""
-        if response.status != 200:
-            self.logger.warning(f"Non-200 response from {response.url}: {response.status}")
-            return
         domain = response.url.split("//")[1].split("/")[0]
         jurusan = self.domain_to_name.get(domain, domain)
-        try:
-            soup = BeautifulSoup(response.text, "html.parser")
-        except Exception as e:
-            self.logger.error(f"Failed to parse HTML from {response.url}: {e}")
-            return

         program_studi = []

-                prodi_url = response.urljoin(href)
-                if prodi_url not in self.processed_urls:
-                    self.processed_urls.add(prodi_url)
-                    self.logger.info(f"[🧩] Found prodi: {item} ({prodi_url}) in {jurusan}")
-                    yield scrapy.Request(
-                        prodi_url,
-                        callback=self.parse_detail,
-                        meta={"jurusan": jurusan, "url": prodi_url},
-                        dont_filter=False
-                    )
-            except Exception as e:
-                self.logger.warning(f"Error processing link in {response.url}: {e}")
-                continue

-        # Store initial results
         self.rekap_prodi[jurusan] = program_studi

-        # Follow
         for a in soup.find_all("a", href=True):
-                full_url = response.urljoin(href)
-                if full_url and full_url not in self.processed_urls:
-                    # Skip certain file types and external links
-                    if any(ext in full_url.lower() for ext in ['.pdf', '.doc', '.xls', '.ppt', '.jpg', '.png', '.gif']):
-                        continue
-                    self.processed_urls.add(full_url)
-                    yield scrapy.Request(
-                        full_url,
-                        callback=self.parse_detail,
-                        meta={"jurusan": jurusan, "url": full_url}
-                    )
-            except Exception as e:
-                self.logger.warning(f"Error processing internal link: {e}")
-                continue

-        """Clean HTML content by removing unwanted elements"""
-        # Remove unwanted elements
         for selector in [
             'header', 'footer', 'nav', 'aside', 'menu',
             '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
@@ -149,280 +93,118 @@ class JurusanSpider(scrapy.Spider):
             for tag in soup.select(selector):
                 tag.decompose()

-        # Remove empty containers
         for element in soup.find_all(True):
             if not element.get_text(strip=True) and not element.find_all(True):
                 element.decompose()

-    def parse_detail(self, response):
-        """Parse detailed pages"""
-        if response.status != 200:
-            return
-        jurusan = response.meta["jurusan"]
-        url = response.meta["url"]
-        try:
-            soup = BeautifulSoup(response.text, "html.parser")
-        except Exception as e:
-            self.logger.error(f"Failed to parse HTML from {url}: {e}")
-            return
-        self.clean_html(soup)
         title_tag = soup.find("title") or soup.find("h1")
         page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"

-            content_text = self.parse_ti_dosen_page(soup, url)
-        elif url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
-            content_text = self.parse_ti_leadership_page(soup, url)
-        else:
-            content_text = self.parse_general_page(soup, url, jurusan, page_title)
-        if content_text:
-            self.per_jurusan_pages.setdefault(jurusan, []).append({
-                "url": url,
-                "title": page_title,
-                "content": content_text
-            })

-    def parse_ti_dosen_page(self, soup, url):
-        """Parse TI dosen page specifically"""
-        dosen_list = []
-        # Find names in gallery captions
-        for nama_tag in soup.find_all("dd", class_="wp-caption-text"):
-            nama = nama_tag.get_text(strip=True)
-            if nama and nama not in dosen_list:
-                dosen_list.append(nama)
-        # Create narrative text
-        naratif = ["## Daftar Dosen dan Staf Pengajar"]
-        for nama in dosen_list:
-            naratif.append(f"- {nama}")
-        return f"""# Dosen dan Staf Pengajar Teknologi Informasi

-            "Koordinator Program Studi": [],
-            "Kepala Labor": []
-        }
-        # Extract all member items
-        member_items = soup.find_all(class_="member-item")
-        for member in member_items:
-            try:
                 name_tag = member.find(class_="item-title")
                 name = name_tag.get_text(strip=True) if name_tag else "N/A"
                 position_tag = member.find(class_="small-text")
                 position = position_tag.get_text(strip=True) if position_tag else "N/A"
-                # Categorize based on position
-                if any(role in position for role in ["Ketua Jurusan", "Sekretaris Jurusan"]):
                     leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
-                elif
                     leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
                 elif "Kepala Labor" in position:
                     leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})
-            except Exception as e:
-                self.logger.warning(f"Error parsing member item: {e}")
-                continue
-        # Generate narrative
-        naratif = []
-        for section, members in leadership_data.items():
-            if members:
-                naratif.append(f"\n## {section}")
-                for member in members:
-                    naratif.append(f"- {member['jabatan']}: {member['nama']}")
-        return f"""# Pimpinan Jurusan Teknologi Informasi
-
-URL: {url}
-Jurusan: Teknologi Informasi
-Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
-
-{chr(10).join(naratif)}"""

-    def parse_general_page(self, soup, url, jurusan, page_title):
-        """Parse general pages"""
-        body_text = []
-        for element in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
-            txt = element.get_text(strip=True)
-            if txt and len(txt) > 10:  # Filter out very short text
-                body_text.append(txt)
-        content_text = f"""# {page_title}

-        # Extract faculty information from tables
-        dosen_entries = self.extract_faculty_info(soup)
-        if dosen_entries:
-            content_text += f"\n\n## Daftar Dosen\n\n{chr(10).join(dosen_entries)}"

-        """Extract faculty information from tables and text"""
-        dosen_entries = []
-        # Extract from tables
-        for table in soup.find_all("table"):
-            try:
-                headers = [th.get_text(strip=True).lower() for th in table.find_all("th")]
-                if any(keyword in " ".join(headers) for keyword in ["dosen", "jabatan", "nip", "nama"]):
-                    for row in table.find_all("tr")[1:]:
-                        cols = row.find_all(["td", "th"])
-                        if len(cols) >= 1:
-                            nama_dosen = cols[0].get_text(strip=True)
-                            jabatan = cols[1].get_text(strip=True) if len(cols) > 1 else "-"
-                            if nama_dosen and len(nama_dosen) > 3:
-                                dosen_entries.append(f"Nama: {nama_dosen} | Jabatan: {jabatan}")
-            except Exception as e:
-                self.logger.warning(f"Error extracting faculty from table: {e}")
-                continue

-    def
-        for i, table in enumerate(soup.find_all("table")):
             try:
-                if len(table_data) > 1:  # Only add if table has content
-                    tables_content.extend(table_data)
-                    tables_content.append("")  # Add spacing
             except Exception as e:
-                self.logger.

         try:
             with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
-                f.write(
                 temp_path = f.name
-                file=temp_path,
-                file_options={"content-type": content_type}
-            )
-            self.logger.info(f"✅ Uploaded: {filename}")
-            return True
         except Exception as e:
-            self.logger.error(f"❌
-            return False
         finally:
-            if
                 os.remove(temp_path)

-    def closed(self, reason):
-        """Called when spider closes"""
-        self.logger.info(f"Spider closed: {reason}")
-        # Upload files per department
-        for jurusan, pages in self.per_jurusan_pages.items():
-            if not pages:
-                continue
-            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
-            content = ""
-            for page in pages:
-                content += f"{page['content']}\n\n---\n\n"
-            self.upload_to_supabase(filename, content)
-        # Create and upload summary
-        self.create_and_upload_summary()

-    def create_and_upload_summary(self):
-        """Create and upload program study summary"""
-        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
-        content_lines = [
-            f"# REKAP PROGRAM STUDI PNP",
-            f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}",
-            ""
-        ]
-        total_prodi = 0
-        jumlah_jurusan = 0
-        for jurusan, daftar in self.rekap_prodi.items():
-            valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
-            if not valid_prodi:
-                continue
-            jurusan_display = jurusan.replace("_", " ")
-            content_lines.append(f"## {jurusan_display}:")
-            for prodi in sorted(set(valid_prodi)):
-                content_lines.append(f"- {prodi}")
-            jumlah_prodi = len(set(valid_prodi))
-            content_lines.append(f"Jumlah program studi: {jumlah_prodi}")
-            content_lines.append("")
-            total_prodi += jumlah_prodi
-            jumlah_jurusan += 1
-        content_lines.extend([
-            f"**Total Jurusan di PNP: {jumlah_jurusan}**",
-            f"**Total Program Studi di PNP: {total_prodi}**"
-        ])
-        content = "\n".join(content_lines)
-        self.upload_to_supabase(rekap_filename, content)

 if __name__ == "__main__":
-        format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
-    )
-    try:
-        process = CrawlerProcess()
-        process.crawl(JurusanSpider)
-        process.start()
-    except Exception as e:
-        logging.error(f"Failed to run spider: {e}")
-        raise
New version (added lines are prefixed with "+"):

@@ -5,7 +5,6 @@ from dotenv import load_dotenv
 from supabase import create_client
 from datetime import datetime
 import os, re, tempfile

 load_dotenv()

@@ -13,26 +12,24 @@ SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
 SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")

 def is_valid_prodi(nama):
+    return bool(re.match(
+        r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b',
+        nama, re.I
+    ))

 class JurusanSpider(scrapy.Spider):
     name = "jurusan"
     custom_settings = {
+        'DOWNLOAD_DELAY': 1,
         'USER_AGENT': 'PNPBot/1.0',
         'ROBOTSTXT_OBEY': True,
         'LOG_LEVEL': 'INFO',
+        'HTTPCACHE_ENABLED': False,
         'CONCURRENT_REQUESTS': 1,
+        'RETRY_TIMES': 3
     }

     domain_to_name = {
@@ -45,99 +42,46 @@ class JurusanSpider(scrapy.Spider):
         'ti.pnp.ac.id': 'Teknologi_Informasi',
     }

+    allowed_domains = list(domain_to_name.keys())
     start_urls = [f"https://{d}/" for d in domain_to_name.keys()]

+    def __init__(self):
+        self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
         self.bucket = SUPABASE_BUCKET
         self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
         self.per_jurusan_pages = {}
         self.rekap_prodi = {}

     def parse(self, response):
         domain = response.url.split("//")[1].split("/")[0]
         jurusan = self.domain_to_name.get(domain, domain)
+        soup = BeautifulSoup(response.text, "html.parser")

         program_studi = []

+        for a_tag in soup.find_all("a", href=True):
+            item = a_tag.get_text(strip=True)
+            href = a_tag["href"]
+            if item and is_valid_prodi(item) and item not in program_studi:
+                program_studi.append(item)
+                prodi_url = response.urljoin(href)
+                self.logger.info(f"[🧩] Ditemukan prodi: {item} ({prodi_url}) di jurusan {jurusan}")
+                yield response.follow(href, callback=self.parse_detail,
+                                      meta={"jurusan": jurusan, "url": prodi_url})

         self.rekap_prodi[jurusan] = program_studi

+        # Follow semua link internal
         for a in soup.find_all("a", href=True):
+            yield response.follow(a["href"], callback=self.parse_detail,
+                                  meta={"jurusan": jurusan, "url": response.urljoin(a["href"])})

+    def parse_detail(self, response):
+        jurusan = response.meta["jurusan"]
+        url = response.meta["url"]
+        soup = BeautifulSoup(response.text, "html.parser")

+        # Bersihkan elemen yang tidak diperlukan
         for selector in [
             'header', 'footer', 'nav', 'aside', 'menu',
             '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
@@ -149,280 +93,118 @@ class JurusanSpider(scrapy.Spider):
             for tag in soup.select(selector):
                 tag.decompose()

         for element in soup.find_all(True):
             if not element.get_text(strip=True) and not element.find_all(True):
                 element.decompose()

         title_tag = soup.find("title") or soup.find("h1")
         page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"

+        # Selalu inisialisasi content_text
+        content_text = f"# {page_title}\nURL: {url}\nJurusan: {jurusan}\nTanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"

+        # Special case: dosen TI
+        if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
+            dosen_list = [n.get_text(strip=True) for n in soup.find_all("dd", class_="wp-caption-text") if n.get_text(strip=True)]
+            naratif = ["## Daftar Dosen dan Staf Pengajar"] + [f"- {n}" for n in dosen_list]
+            content_text += "\n".join(naratif)
+            self.per_jurusan_pages.setdefault(jurusan, []).append({"url": url, "title": "Dosen dan Staf Pengajar Teknologi Informasi", "content": content_text})
+            return

+        # Special case: pimpinan jurusan TI
+        if url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
+            leadership_data = {"Pimpinan Jurusan": [], "Koordinator Program Studi": [], "Kepala Labor": []}
+            for member in soup.find_all(class_="member-item"):
                 name_tag = member.find(class_="item-title")
                 name = name_tag.get_text(strip=True) if name_tag else "N/A"
                 position_tag = member.find(class_="small-text")
                 position = position_tag.get_text(strip=True) if position_tag else "N/A"
+                if "Ketua Jurusan" in position or "Sekretaris Jurusan" in position:
                     leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
+                elif "Koordinator Program Studi" in position or "Koordinator PSDKU" in position:
                     leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
                 elif "Kepala Labor" in position:
                     leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})

+            naratif = ["## Pimpinan Jurusan"] + [f"- {x['jabatan']}: {x['nama']}" for x in leadership_data["Pimpinan Jurusan"]]
+            naratif += ["\n## Koordinator Program Studi"] + [f"- {x['jabatan']}: {x['nama']}" for x in leadership_data["Koordinator Program Studi"]]
+            naratif += ["\n## Kepala Labor"] + [f"- {x['jabatan']}: {x['nama']}" for x in leadership_data["Kepala Labor"]]
+            content_text += "\n".join(naratif)
+            self.per_jurusan_pages.setdefault(jurusan, []).append({"url": url, "title": "Pimpinan Jurusan Teknologi Informasi", "content": content_text})
+            return

+        # Ambil body text
+        for p in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
+            txt = p.get_text(strip=True)
+            if txt:
+                content_text += txt + "\n"

+        # Ambil tabel
+        for i, table in enumerate(soup.find_all("table")):
+            content_text += f"\n\nTabel {i+1}\n\n"
+            for row in table.find_all("tr"):
+                cols = row.find_all(["td", "th"])
+                row_data = [col.get_text(strip=True) for col in cols]
+                content_text += " | ".join(row_data) + "\n"

+        self.per_jurusan_pages.setdefault(jurusan, []).append({"url": url, "title": page_title, "content": content_text})

+        # Follow link dari halaman detail juga
+        for a in soup.find_all("a", href=True):
+            yield response.follow(a["href"], callback=self.parse_detail,
+                                  meta={"jurusan": jurusan, "url": response.urljoin(a["href"])})

+    def closed(self, reason):
+        # Simpan per jurusan
+        for jurusan, pages in self.per_jurusan_pages.items():
+            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
             try:
+                with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                    for page in pages:
+                        f.write(page["content"] + "\n\n---\n\n")
+                    temp_path = f.name
+                self.supabase.storage.from_(self.bucket).upload(path=filename, file=temp_path,
+                                                                file_options={"content-type": "text/plain"})
+                self.logger.info(f"✅ Uploaded file jurusan: {filename}")
             except Exception as e:
+                self.logger.error(f"❌ Gagal upload {filename}: {e}")
+            finally:
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)

+        # Simpan rekap
+        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
         try:
             with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
+                total_prodi = 0
+                jumlah_jurusan = 0
+                for jurusan, daftar in self.rekap_prodi.items():
+                    valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
+                    if not valid_prodi:
+                        continue
+                    jurusan_baca = jurusan.replace("_", " ")
+                    f.write(f"{jurusan_baca}:\n")
+                    for p in sorted(set(valid_prodi)):
+                        f.write(f"- {p}\n")
+                    jumlah_prodi = len(valid_prodi)
+                    f.write(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")
+                    total_prodi += jumlah_prodi
+                    jumlah_jurusan += 1
+                f.write(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
+                f.write(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
                 temp_path = f.name
+            self.supabase.storage.from_(self.bucket).upload(path=rekap_filename, file=temp_path,
+                                                            file_options={"content-type": "text/plain"})
+            self.logger.info(f"✅ Uploaded file rekap: {rekap_filename}")
         except Exception as e:
+            self.logger.error(f"❌ Gagal upload rekap: {e}")
         finally:
+            if os.path.exists(temp_path):
                 os.remove(temp_path)

 if __name__ == "__main__":
+    process = CrawlerProcess()
+    process.crawl(JurusanSpider)
+    process.start()
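For reference, a minimal standalone check of the is_valid_prodi filter shown above. Only the regular expression comes from the diff; the sample link texts are invented for illustration and are not scraped values:

import re

# Regex copied from the new is_valid_prodi(); the strings tested below are made-up examples.
PRODI_PATTERN = r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b'

def is_valid_prodi(nama):
    # Same call shape as the spider: case-insensitive match anchored at the start of the link text.
    return bool(re.match(PRODI_PATTERN, nama, re.I))

print(is_valid_prodi("D3 Teknik Listrik"))                       # True
print(is_valid_prodi("D-4 Teknologi Rekayasa Perangkat Lunak"))  # True
print(is_valid_prodi("Sarjana Terapan Akuntansi"))               # True
print(is_valid_prodi("Berita Wisuda September"))                 # False, not a program-studi title

Because the match is anchored at the start of the string, ordinary news or menu links are filtered out before they ever reach rekap_prodi.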
scrapping/pnp_scrap.py
CHANGED
Old version (removed lines are prefixed with "-"; lines cut off in the diff view are left as they appear):

@@ -192,26 +192,52 @@ class PNPContentSpider(scrapy.Spider):
             # Simple description format
             content = f"## Pimpinan {idx}\n\n{leader['description']}"
         else:
-            # Structured data format
-            position = leader.get("Posisi",

-            for key in ordered_keys:
-                if key in leader:
-                    content += f"**{key}**: {leader[key]}\n\n"

-            # Add remaining information
             for key, value in leader.items():
-                if key not in
                     content += f"**{key}**: {value}\n\n"

             # Add description if exists
             if 'description' in leader:
-                content += f"

         formatted_content.append(content.strip())
New version (added lines are prefixed with "+"):

@@ -192,26 +192,52 @@ class PNPContentSpider(scrapy.Spider):
             # Simple description format
             content = f"## Pimpinan {idx}\n\n{leader['description']}"
         else:
+            # Structured data format - create narrative
+            position = leader.get("Posisi", "")
+            nama = leader.get("Nama", "")
+            nidn = leader.get("NIDN", "")
+            jabatan_akademik = leader.get("Jabatan Akademik", "")
+            jurusan = leader.get("Jurusan", "")
+            program_studi = leader.get("Program Studi", "")

+            # Create narrative starting with position
+            if position and nama:
+                content = f"## {position}\n\n"
+                narrative = f"{position} Politeknik Negeri Padang adalah {nama}."
+            elif nama:
+                content = f"## Pimpinan {idx}\n\n"
+                narrative = f"Pimpinan ini adalah {nama}."
+            else:
+                content = f"## Pimpinan {idx}\n\n"
+                narrative = "Informasi pimpinan:"

+            # Add academic position
+            if jabatan_akademik:
+                narrative += f" Secara akademik, beliau menjabat sebagai {jabatan_akademik}."

+            # Add department information
+            if jurusan:
+                narrative += f" Beliau berasal dari Jurusan {jurusan}."

+            # Add study program
+            if program_studi:
+                narrative += f" Program studi yang diampu adalah {program_studi}."

+            # Add NIDN
+            if nidn:
+                narrative += f" NIDN beliau adalah {nidn}."

+            content += narrative + "\n\n"

+            # Add any remaining information that wasn't included in narrative
+            used_keys = ['Posisi', 'Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi', 'description']
             for key, value in leader.items():
+                if key not in used_keys:
                     content += f"**{key}**: {value}\n\n"

             # Add description if exists
             if 'description' in leader:
+                content += f"**Informasi Tambahan**: {leader['description']}\n\n"

         formatted_content.append(content.strip())
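To see what the new narrative branch in pnp_scrap.py emits, here is a small sketch run outside Scrapy. The field names mirror the diff; the leader record itself is invented placeholder data, not real PNP staff information:

# Invented leader record with the same keys the spider extracts (placeholder values only).
leader = {
    "Posisi": "Wakil Direktur I",
    "Nama": "Dr. Fulan, M.T.",
    "NIDN": "0000000000",
    "Jabatan Akademik": "Lektor Kepala",
    "Jurusan": "Teknologi Informasi",
    "Program Studi": "D4 Teknologi Rekayasa Perangkat Lunak",
}

# Follows the same sentence-building steps as the diff's else-branch.
content = f"## {leader['Posisi']}\n\n"
narrative = f"{leader['Posisi']} Politeknik Negeri Padang adalah {leader['Nama']}."
narrative += f" Secara akademik, beliau menjabat sebagai {leader['Jabatan Akademik']}."
narrative += f" Beliau berasal dari Jurusan {leader['Jurusan']}."
narrative += f" Program studi yang diampu adalah {leader['Program Studi']}."
narrative += f" NIDN beliau adalah {leader['NIDN']}."
content += narrative + "\n\n"
print(content)

With every field present, the record renders as one short paragraph under a position heading rather than the old key/value list.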