Commit 04e6021 · revisi 10
Committed by FauziIsyrinApridal
1 Parent(s): 0517d51

Files changed:
- middleware.ts +0 -1
- requirements.txt +3 -1
- scrapping/dosen_scrap.py +10 -9
- scrapping/jadwal_scrap.py +147 -16
- scrapping/jurusan_scrap.py +50 -5
- scrapping/pnp_scrap.py +99 -15
- scrapping/utils/crawl4ai_utils.py +37 -0
middleware.ts CHANGED
@@ -73,4 +73,3 @@ export const config = {
     "/((?!_next/static|_next/image|favicon.ico|.*\\.(?:svg|png|jpg|jpeg|gif|webp)$).*)",
   ],
 };
-
requirements.txt CHANGED
@@ -2,4 +2,6 @@ scrapy
 supabase
 python-dotenv
 requests
-beautifulsoup4
+beautifulsoup4
+crawl4ai
+playwright
scrapping/dosen_scrap.py CHANGED
@@ -5,6 +5,15 @@ import re
 from supabase import create_client
 import os
 import sys
+from typing import List, Dict
+from bs4 import BeautifulSoup
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 # Try import shared dedup upload utility
 try:
@@ -314,12 +323,4 @@ class DosenSpider(scrapy.Spider):
         if item.get('jurusan'):
             paragraph += f" Ia bertugas di {item['jurusan']}."
         if item.get('detail'):
-            paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
-        output.append(paragraph + "\n\n")
-
-    return ''.join(output)
-
-if __name__ == '__main__':
-    process = CrawlerProcess()
-    process.crawl(DosenSpider)
-    process.start()
+            paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
scrapping/jadwal_scrap.py CHANGED
@@ -5,6 +5,24 @@ import re
 from datetime import datetime
 from supabase import create_client
 from io import StringIO
+from typing import Dict, List, Tuple
+from bs4 import BeautifulSoup
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    import sys as _sys
+    _sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
+
+# Shared dedup upload utility
+try:
+    from utils.supabase_utils import upload_if_changed
+except Exception:
+    import sys as _sys2
+    _sys2.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from supabase_utils import upload_if_changed
 
 
 
@@ -398,19 +416,132 @@ class PnpSpider(scrapy.Spider):
         self.process_table_rows(table, schedule_grid, days, time_slots)
         self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Days and time slots
+        days = clean_text_list(table.select('thead th.xAxis'))
+        if not days:
+            days = clean_text_list(table.select('thead th[class*="xAxis"]'))
+        time_slots = clean_text_list(table.select('tbody tr:not(.foot) th.yAxis'))
+        if not time_slots:
+            time_slots = clean_text_list(table.select('tbody th[class*="yAxis"]'))
+        if not days or not time_slots:
+            return
+
+        schedule_grid = build_schedule_grid(days, time_slots)
+
+        # Handle rows with rowspans/colspans
+        rows = table.select('tbody tr:not(.foot)')
+        active_rowspans: Dict[Tuple[int, int], Tuple[int, str]] = {}
+        for row_idx, row in enumerate(rows):
+            if row_idx >= len(time_slots):
+                continue
+            current_time = time_slots[row_idx]
+            filled_cols = set()
+
+            # Apply active rowspans
+            to_remove = []
+            for (rs_col_idx, rs_row_start), (rs_left, content) in list(active_rowspans.items()):
+                if rs_left > 0 and rs_col_idx < len(days):
+                    day = days[rs_col_idx]
+                    schedule_grid[day][current_time] = content
+                    filled_cols.add(rs_col_idx)
+                    active_rowspans[(rs_col_idx, rs_row_start)] = (rs_left - 1, content)
+                    if rs_left - 1 <= 0:
+                        to_remove.append((rs_col_idx, rs_row_start))
+            for k in to_remove:
+                del active_rowspans[k]
+
+            # Process this row cells
+            cells = row.select('td')
+            col_idx = 0
+            for cell in cells:
+                while col_idx < len(days) and col_idx in filled_cols:
+                    col_idx += 1
+                if col_idx >= len(days):
+                    break
+                cell_text = ' '.join(cell.get_text(" ", strip=True).split())
+                cell_text = 'kosong' if not cell_text or cell_text == '---' else cell_text
+                rowspan = int(cell.get('rowspan', '1') or '1')
+                colspan = int(cell.get('colspan', '1') or '1')
+                # update grid
+                for c in range(colspan):
+                    cur_c = col_idx + c
+                    if cur_c < len(days):
+                        schedule_grid[days[cur_c]][current_time] = cell_text
+                # track rowspan
+                if rowspan > 1:
+                    for c in range(colspan):
+                        active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, cell_text)
+                col_idx += colspan
+
+        write_schedule_to_buffer(buffer, schedule_grid, days, time_slots)
+
+        # 1) Special Elektro page
+        try:
+            elektro_html = fetch_html_sync(ELEKTRO_URL)
+            esoup = BeautifulSoup(elektro_html, 'html.parser')
+            tables = esoup.select('table')
+            if tables:
+                jurusan_id = 'teknik_elektro'
+                jurusan_name = 'Jurusan Teknik Elektro'
+                for idx, tbl in enumerate(tables):
+                    process_table(tbl, jurusan_id, jurusan_name, idx)
+        except Exception as e:
+            print(f"[Jadwal] Error fetching Elektro page: {e}")
+
+        # 2) Presensi home traversal -> jurusan pages -> groups_days_horizontal
+        try:
+            home_html = fetch_html_sync(BASE_PRESENSI)
+            hsoup = BeautifulSoup(home_html, 'html.parser')
+            links = set(a.get('href') for a in hsoup.select('article.section a[href]'))
+            for link in links:
+                if not link:
+                    continue
+                if any(ex in link.lower() for ex in EXCLUDED):
+                    continue
+                jurusan_url = link if link.startswith('http') else (BASE_PRESENSI + link.lstrip('/'))
+                # deduce jurusan_id from dep param
+                m = re.search(r'department\?dep=(\d+)', jurusan_url)
+                jurusan_id = m.group(1) if m else f"unknown_{abs(hash(jurusan_url)) % 1000}"
+
+                try:
+                    jur_html = fetch_html_sync(jurusan_url)
+                    jsoup = BeautifulSoup(jur_html, 'html.parser')
+                    title = jsoup.title.get_text(strip=True) if jsoup.title else f"Jurusan_{jurusan_id}"
+                    # find groups_days_horizontal (not subgroups)
+                    g_link = None
+                    for a in jsoup.select('td a[href]'):
+                        href = a.get('href')
+                        if href and 'groups_days_horizontal' in href and 'subgroups_days_horizontal' not in href:
+                            g_link = href
+                            break
+                    if not g_link:
+                        continue
+                    g_url = g_link if g_link.startswith('http') else (BASE_PRESENSI + g_link.lstrip('/'))
+                    g_html = fetch_html_sync(g_url)
+                    gsoup = BeautifulSoup(g_html, 'html.parser')
+                    gtables = gsoup.select('table[id^="table_"], table')
+                    for idx, tbl in enumerate(gtables):
+                        process_table(tbl, jurusan_id=title.replace(' ', '_'), jurusan_name=title, idx=idx)
+                except Exception as inner:
+                    print(f"[Jadwal] Error processing jurusan page {jurusan_url}: {inner}")
+        except Exception as e:
+            print(f"[Jadwal] Error fetching presensi home: {e}")
+
+        # Upload all buffers with dedup
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+        for jurusan_id, buffer in file_buffers.items():
+            filename = f"{jurusan_id}_{ts}.txt"
+            content = buffer.getvalue()
+            try:
+                result = upload_if_changed(supabase, bucket, filename, content)
+                status = result.get('result')
+                if status == 'uploaded':
+                    print(f"✅ Successfully uploaded {filename}")
+                elif status == 'skipped':
+                    print(f"⏭️ Skipped upload for {filename} (content unchanged)")
+                else:
+                    print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
+            except Exception as e:
+                print(f"❌ Error uploading {filename}: {e}")
+            finally:
+                buffer.close()
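The heart of the new jadwal logic is expanding rowspan/colspan cells into a flat day-by-time grid before the schedule is written to a buffer. A minimal, self-contained sketch of that technique is below; the sample HTML and the expand_table helper are illustrative only and are not code from this repository.

from bs4 import BeautifulSoup

SAMPLE = """
<table>
  <thead><tr><th></th><th class="xAxis">Senin</th><th class="xAxis">Selasa</th></tr></thead>
  <tbody>
    <tr><th class="yAxis">07:00</th><td rowspan="2">Matematika</td><td>---</td></tr>
    <tr><th class="yAxis">08:00</th><td>Fisika</td></tr>
  </tbody>
</table>
"""

def expand_table(table):
    days = [th.get_text(strip=True) for th in table.select('thead th.xAxis')]
    times = [th.get_text(strip=True) for th in table.select('tbody th.yAxis')]
    grid = {d: {t: 'kosong' for t in times} for d in days}
    active = {}  # column index -> (rows still covered, cell text)
    for row_idx, row in enumerate(table.select('tbody tr')):
        filled = set()
        # Carry down cells whose rowspan covers this row.
        for col, (left, content) in list(active.items()):
            grid[days[col]][times[row_idx]] = content
            filled.add(col)
            active[col] = (left - 1, content)
            if left - 1 <= 0:
                del active[col]
        col = 0
        for cell in row.select('td'):
            while col in filled:
                col += 1
            if col >= len(days):
                break
            text = cell.get_text(' ', strip=True)
            text = 'kosong' if not text or text == '---' else text
            rowspan = int(cell.get('rowspan', 1))
            colspan = int(cell.get('colspan', 1))
            for c in range(colspan):
                if col + c < len(days):
                    grid[days[col + c]][times[row_idx]] = text
                    if rowspan > 1:
                        active[col + c] = (rowspan - 1, text)
            col += colspan
    return grid

table = BeautifulSoup(SAMPLE, 'html.parser').select_one('table')
print(expand_table(table))
# {'Senin': {'07:00': 'Matematika', '08:00': 'Matematika'},
#  'Selasa': {'07:00': 'kosong', '08:00': 'Fisika'}}

The carried-over entries in active play the same role as active_rowspans in the diff: a course that spans several time slots is copied down its column, and subsequent cells in those rows are shifted to the next free column.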
scrapping/jurusan_scrap.py CHANGED
@@ -6,6 +6,14 @@ from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
 import sys
+from typing import Dict, List
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 # Try import shared dedup upload utility
 try:
@@ -317,10 +325,47 @@
         except Exception as e:
             self.logger.error(f"❌ Gagal upload rekap: {e}")
         finally:
-            if os.path.exists(temp_path):
+            if temp_path and os.path.exists(temp_path):
                 os.remove(temp_path)
 
-
-
-
-
+        # Build and upload REKAP file
+        rekap_filename = f"REKAP_PROGRAM_STUDI_{timestamp}.txt"
+        temp_path = None
+        try:
+            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
+                total_prodi = 0
+                jumlah_jurusan = 0
+                for jurusan_key, daftar in rekap_prodi.items():
+                    valid_prodi = []
+                    for p in daftar:
+                        if is_valid_prodi(p):
+                            valid_prodi.append(p.strip())
+                    if not valid_prodi:
+                        continue
+                    jurusan_baca = jurusan_key.replace("_", " ")
+                    f.write(f"{jurusan_baca}:\n")
+                    for p in sorted(set(valid_prodi)):
+                        f.write(f"- {p}\n")
+                    jumlah_prodi = len(valid_prodi)
+                    f.write(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")
+                    total_prodi += jumlah_prodi
+                    jumlah_jurusan += 1
+                f.write(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
+                f.write(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
+                temp_path = f.name
+            with open(temp_path, 'r', encoding='utf-8') as rf:
+                rekap_text = rf.read()
+            result = upload_if_changed(supabase, bucket, rekap_filename, rekap_text)
+            status = result.get('result')
+            if status == 'uploaded':
+                print(f"✅ Uploaded file rekap: {rekap_filename}")
+            elif status == 'skipped':
+                print(f"⏭️ Skipped upload for rekap {rekap_filename} (content unchanged)")
+            else:
+                print(f"❌ Gagal upload rekap {rekap_filename}: {result.get('error')}")
+        except Exception as e:
+            print(f"❌ Gagal upload rekap: {e}")
+        finally:
+            if temp_path and os.path.exists(temp_path):
+                os.remove(temp_path)
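Both the per-jurusan uploads and the new REKAP upload go through upload_if_changed from scrapping/utils/supabase_utils.py, which is not part of this commit. Judging only from the call sites, it is expected to return a dict whose 'result' key is 'uploaded' or 'skipped', with an 'error' key when something fails. The sketch below is an assumed implementation of that contract, using a content hash for the dedup check; it is not the repository's actual code.

import hashlib
from typing import Any, Dict

def upload_if_changed(supabase, bucket: str, filename: str, content: str) -> Dict[str, Any]:
    """Assumed contract inferred from the spiders: skip the upload when the
    stored object already holds identical content, otherwise upload it."""
    data = content.encode('utf-8')
    try:
        existing = supabase.storage.from_(bucket).download(filename)
        if hashlib.sha256(existing).digest() == hashlib.sha256(data).digest():
            return {'result': 'skipped'}
    except Exception:
        pass  # object missing or download failed: treat as changed
    try:
        supabase.storage.from_(bucket).upload(filename, data, {'content-type': 'text/plain'})
        return {'result': 'uploaded'}
    except Exception as e:
        return {'result': 'error', 'error': str(e)}

Because the filenames embed a timestamp, the real helper likely compares against the most recently uploaded file rather than an object with the exact same name; that detail is outside this commit.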
scrapping/pnp_scrap.py CHANGED
@@ -5,6 +5,16 @@ import re
 import os
 from supabase import create_client, Client
 import html
+from typing import List
+
+# New: HTML parsing and Crawl4AI rendering
+from bs4 import BeautifulSoup
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    import sys
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
@@ -427,21 +437,95 @@
             yield scrapy.Request(
                 url=full_link,
                 callback=self.parse_content,
-                meta={'page_title': '
+                meta={'page_title': '', 'menu_path': menu_path}
             )
 
-
 if __name__ == '__main__':
-
-
-
-
-
-
-    '
-    '
-    '
-
-
-
-
+    # Crawl4AI-based lightweight runner to fetch and upload core pages
+    START_URLS = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']
+
+    def _clean_text(text: str) -> str:
+        if not text:
+            return ''
+        t = html.unescape(' '.join(text.split()))
+        t = t.replace('“', '"').replace('â€', '"').replace('’', "'")
+        t = t.replace('â€"', '—').replace('â€"', '–')
+        return t.strip()
+
+    def _extract_paragraphs(html_text: str, base_url: str) -> List[str]:
+        soup = BeautifulSoup(html_text, 'html.parser')
+        selectors = [
+            'div.entry-content', 'article.post', 'main.site-main',
+            'div.content', 'div.main-content', 'div#content', 'div.page-content'
+        ]
+        content_area = None
+        for sel in selectors:
+            content_area = soup.select_one(sel)
+            if content_area:
+                break
+        nodes = content_area.select('p, h1, h2, h3, h4, h5, h6, li') if content_area else soup.select('p, h1, h2, h3, h4, h5, h6, li')
+        out: List[str] = []
+        for node in nodes:
+            text = _clean_text(node.get_text(' ', strip=True))
+            if text and len(text.split()) >= 5:
+                for a in node.find_all('a', href=True):
+                    href = a['href']
+                    if href and not href.startswith('#'):
+                        abs_url = href if href.startswith('http') else os.path.join(base_url, href)
+                        text += f" (Link: {abs_url})"
+                out.append(text)
+        return out
+
+    def _extract_tables(html_text: str, base_url: str) -> str:
+        soup = BeautifulSoup(html_text, 'html.parser')
+        blocks: List[str] = []
+        for ti, table in enumerate(soup.select('table')):
+            rows = []
+            for tr in table.select('tr'):
+                cells = []
+                for c in tr.select('th, td'):
+                    tx = _clean_text(c.get_text(' ', strip=True))
+                    a = c.find('a', href=True)
+                    if a and a['href']:
+                        href = a['href']
+                        abs_url = href if href.startswith('http') else os.path.join(base_url, href)
+                        tx += f" (Link: {abs_url})"
+                    if tx:
+                        cells.append(tx)
+                if cells:
+                    rows.append(' | '.join(cells))
+            if rows:
+                blocks.append(f"### Tabel {ti + 1}\n\n" + "\n".join(rows))
+        return "\n\n".join(blocks)
+
+    def _final_md(title: str, url: str, paras: List[str], tables: str) -> str:
+        md = f"# {title}\n\n**Tanggal**: {datetime.now().strftime('%d %B %Y')}\n**URL**: {url}\n\n" + "\n".join(paras)
+        if tables:
+            md += "\n\n## Data Tabel\n\n" + tables
+        return md
+
+    def _upload(page_title: str, content_text: str) -> str:
+        safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
+        safe_title = re.sub(r'[-\s]+', '-', safe_title)
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        filename = f"{safe_title}_{timestamp}.txt"
+        try:
+            result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
+            return filename if result.get('result') == 'uploaded' else f"skipped_{filename}"
+        except Exception as e:
+            print(f"Upload error: {e}")
+            return f"failed_{filename}"
+
+    for url in START_URLS:
+        try:
+            html_text = fetch_html_sync(url)
+            soup = BeautifulSoup(html_text, 'html.parser')
+            title_node = soup.select_one('h1.entry-title, h1.page-title')
+            page_title = title_node.get_text(strip=True) if title_node else (soup.title.string.strip() if soup.title and soup.title.string else 'Unknown Page')
+            paras = _extract_paragraphs(html_text, url)
+            tables = _extract_tables(html_text, url)
+            content = _final_md(page_title, url, paras, tables)
+            up = _upload(page_title, content)
+            print(f"[PNP crawl] {url} -> {up}")
+        except Exception as e:
+            print(f"[PNP crawl] Error processing {url}: {e}")
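The standalone runner keeps only text nodes of five words or more and appends outbound link targets to the extracted text. A compact illustration of that filtering on a made-up HTML snippet (the snippet and base URL handling below are illustrative, not a real page from pnp.ac.id):

from bs4 import BeautifulSoup

SNIPPET = """
<div class="entry-content">
  <h2>Penerimaan</h2>
  <p>Pendaftaran mahasiswa baru dibuka hingga akhir bulan ini,
     lihat <a href="/jadwal">jadwal lengkap</a></p>
</div>
"""

soup = BeautifulSoup(SNIPPET, 'html.parser')
area = soup.select_one('div.entry-content')
for node in area.select('p, h1, h2, h3, h4, h5, h6, li'):
    text = ' '.join(node.get_text(' ', strip=True).split())
    if len(text.split()) < 5:      # short headings such as "Penerimaan" are dropped
        continue
    for a in node.find_all('a', href=True):
        if not a['href'].startswith('#'):
            text += f" (Link: https://www.pnp.ac.id{a['href']})"
    print(text)
# Pendaftaran mahasiswa baru dibuka hingga akhir bulan ini, lihat jadwal lengkap (Link: https://www.pnp.ac.id/jadwal)

One caveat on the diff itself: for relative hrefs the runner builds absolute URLs with os.path.join(base_url, href), and os.path.join discards the base whenever the href starts with '/'; urllib.parse.urljoin is the usual tool for that job.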
scrapping/utils/crawl4ai_utils.py ADDED
@@ -0,0 +1,37 @@
+import asyncio
+from typing import Optional
+
+try:
+    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+except Exception as e:
+    AsyncWebCrawler = None  # type: ignore
+    BrowserConfig = None  # type: ignore
+    CrawlerRunConfig = None  # type: ignore
+    CacheMode = None  # type: ignore
+
+
+class Crawl4AIUnavailable(Exception):
+    pass
+
+
+async def fetch_html(url: str, timeout: int = 30, headless: bool = True) -> str:
+    """Fetch rendered HTML using Crawl4AI. Raises Crawl4AIUnavailable if not installed."""
+    if AsyncWebCrawler is None:
+        raise Crawl4AIUnavailable(
+            "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
+        )
+    browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
+    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        result = await crawler.arun(url=url, config=run_conf)
+        # Prefer original HTML when available; fallback to markdown->html isn't provided, so use result.html
+        html = getattr(result, "html", None)
+        if not html:
+            # Some versions expose "content" or only markdown. Fallback to markdown as plain text if needed.
+            html = getattr(result, "content", None) or getattr(result, "markdown", "")
+        return html
+
+
+def fetch_html_sync(url: str, timeout: int = 30, headless: bool = True) -> str:
+    """Synchronous wrapper for fetch_html."""
+    return asyncio.run(fetch_html(url, timeout=timeout, headless=headless))
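For reference, a minimal way to exercise the new helper from the scrapping/ directory. The target URL is just an example; crawl4ai and playwright from requirements.txt must be installed, plus the one-time `python -m playwright install chromium` that the error message above mentions.

import os
import sys

# Same path fallback the spiders use, so this also works as a loose script
# placed next to the utils/ folder.
sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
from crawl4ai_utils import Crawl4AIUnavailable, fetch_html_sync

try:
    html_text = fetch_html_sync('https://www.pnp.ac.id', timeout=30)
    print(f"Rendered HTML length: {len(html_text)} characters")
except Crawl4AIUnavailable as err:
    print(f"Crawl4AI is not available: {err}")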