Commit 04e6021 · revisi 10
Committed by FauziIsyrinApridal
1 Parent(s): 0517d51

Files changed:
- middleware.ts +0 -1
- requirements.txt +3 -1
- scrapping/dosen_scrap.py +10 -9
- scrapping/jadwal_scrap.py +147 -16
- scrapping/jurusan_scrap.py +50 -5
- scrapping/pnp_scrap.py +99 -15
- scrapping/utils/crawl4ai_utils.py +37 -0
middleware.ts CHANGED
@@ -73,4 +73,3 @@ export const config = {
     "/((?!_next/static|_next/image|favicon.ico|.*\\.(?:svg|png|jpg|jpeg|gif|webp)$).*)",
   ],
 };
-
requirements.txt CHANGED
@@ -2,4 +2,6 @@ scrapy
 supabase
 python-dotenv
 requests
-beautifulsoup4
+beautifulsoup4
+crawl4ai
+playwright
scrapping/dosen_scrap.py CHANGED
@@ -5,6 +5,15 @@ import re
 from supabase import create_client
 import os
 import sys
+from typing import List, Dict
+from bs4 import BeautifulSoup
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 # Try import shared dedup upload utility
 try:
@@ -314,12 +323,4 @@ class DosenSpider(scrapy.Spider):
         if item.get('jurusan'):
             paragraph += f" Ia bertugas di {item['jurusan']}."
         if item.get('detail'):
-            paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
-        output.append(paragraph + "\n\n")
-
-    return ''.join(output)
-
-if __name__ == '__main__':
-    process = CrawlerProcess()
-    process.crawl(DosenSpider)
-    process.start()
+            paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
scrapping/jadwal_scrap.py CHANGED
@@ -5,6 +5,24 @@ import re
 from datetime import datetime
 from supabase import create_client
 from io import StringIO
+from typing import Dict, List, Tuple
+from bs4 import BeautifulSoup
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    import sys as _sys
+    _sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
+
+# Shared dedup upload utility
+try:
+    from utils.supabase_utils import upload_if_changed
+except Exception:
+    import sys as _sys2
+    _sys2.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from supabase_utils import upload_if_changed
 
 
 
@@ -398,19 +416,132 @@ class PnpSpider(scrapy.Spider):
         self.process_table_rows(table, schedule_grid, days, time_slots)
         self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Days and time slots
+        days = clean_text_list(table.select('thead th.xAxis'))
+        if not days:
+            days = clean_text_list(table.select('thead th[class*="xAxis"]'))
+        time_slots = clean_text_list(table.select('tbody tr:not(.foot) th.yAxis'))
+        if not time_slots:
+            time_slots = clean_text_list(table.select('tbody th[class*="yAxis"]'))
+        if not days or not time_slots:
+            return
+
+        schedule_grid = build_schedule_grid(days, time_slots)
+
+        # Handle rows with rowspans/colspans
+        rows = table.select('tbody tr:not(.foot)')
+        active_rowspans: Dict[Tuple[int, int], Tuple[int, str]] = {}
+        for row_idx, row in enumerate(rows):
+            if row_idx >= len(time_slots):
+                continue
+            current_time = time_slots[row_idx]
+            filled_cols = set()
+
+            # Apply active rowspans
+            to_remove = []
+            for (rs_col_idx, rs_row_start), (rs_left, content) in list(active_rowspans.items()):
+                if rs_left > 0 and rs_col_idx < len(days):
+                    day = days[rs_col_idx]
+                    schedule_grid[day][current_time] = content
+                    filled_cols.add(rs_col_idx)
+                    active_rowspans[(rs_col_idx, rs_row_start)] = (rs_left - 1, content)
+                    if rs_left - 1 <= 0:
+                        to_remove.append((rs_col_idx, rs_row_start))
+            for k in to_remove:
+                del active_rowspans[k]
+
+            # Process this row cells
+            cells = row.select('td')
+            col_idx = 0
+            for cell in cells:
+                while col_idx < len(days) and col_idx in filled_cols:
+                    col_idx += 1
+                if col_idx >= len(days):
+                    break
+                cell_text = ' '.join(cell.get_text(" ", strip=True).split())
+                cell_text = 'kosong' if not cell_text or cell_text == '---' else cell_text
+                rowspan = int(cell.get('rowspan', '1') or '1')
+                colspan = int(cell.get('colspan', '1') or '1')
+                # update grid
+                for c in range(colspan):
+                    cur_c = col_idx + c
+                    if cur_c < len(days):
+                        schedule_grid[days[cur_c]][current_time] = cell_text
+                # track rowspan
+                if rowspan > 1:
+                    for c in range(colspan):
+                        active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, cell_text)
+                col_idx += colspan
+
+        write_schedule_to_buffer(buffer, schedule_grid, days, time_slots)
+
+        # 1) Special Elektro page
+        try:
+            elektro_html = fetch_html_sync(ELEKTRO_URL)
+            esoup = BeautifulSoup(elektro_html, 'html.parser')
+            tables = esoup.select('table')
+            if tables:
+                jurusan_id = 'teknik_elektro'
+                jurusan_name = 'Jurusan Teknik Elektro'
+                for idx, tbl in enumerate(tables):
+                    process_table(tbl, jurusan_id, jurusan_name, idx)
+        except Exception as e:
+            print(f"[Jadwal] Error fetching Elektro page: {e}")
+
+        # 2) Presensi home traversal -> jurusan pages -> groups_days_horizontal
+        try:
+            home_html = fetch_html_sync(BASE_PRESENSI)
+            hsoup = BeautifulSoup(home_html, 'html.parser')
+            links = set(a.get('href') for a in hsoup.select('article.section a[href]'))
+            for link in links:
+                if not link:
+                    continue
+                if any(ex in link.lower() for ex in EXCLUDED):
+                    continue
+                jurusan_url = link if link.startswith('http') else (BASE_PRESENSI + link.lstrip('/'))
+                # deduce jurusan_id from dep param
+                m = re.search(r'department\?dep=(\d+)', jurusan_url)
+                jurusan_id = m.group(1) if m else f"unknown_{abs(hash(jurusan_url)) % 1000}"
+
+                try:
+                    jur_html = fetch_html_sync(jurusan_url)
+                    jsoup = BeautifulSoup(jur_html, 'html.parser')
+                    title = jsoup.title.get_text(strip=True) if jsoup.title else f"Jurusan_{jurusan_id}"
+                    # find groups_days_horizontal (not subgroups)
+                    g_link = None
+                    for a in jsoup.select('td a[href]'):
+                        href = a.get('href')
+                        if href and 'groups_days_horizontal' in href and 'subgroups_days_horizontal' not in href:
+                            g_link = href
+                            break
+                    if not g_link:
+                        continue
+                    g_url = g_link if g_link.startswith('http') else (BASE_PRESENSI + g_link.lstrip('/'))
+                    g_html = fetch_html_sync(g_url)
+                    gsoup = BeautifulSoup(g_html, 'html.parser')
+                    gtables = gsoup.select('table[id^="table_"], table')
+                    for idx, tbl in enumerate(gtables):
+                        process_table(tbl, jurusan_id=title.replace(' ', '_'), jurusan_name=title, idx=idx)
+                except Exception as inner:
+                    print(f"[Jadwal] Error processing jurusan page {jurusan_url}: {inner}")
+        except Exception as e:
+            print(f"[Jadwal] Error fetching presensi home: {e}")
+
+        # Upload all buffers with dedup
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+        for jurusan_id, buffer in file_buffers.items():
+            filename = f"{jurusan_id}_{ts}.txt"
+            content = buffer.getvalue()
+            try:
+                result = upload_if_changed(supabase, bucket, filename, content)
+                status = result.get('result')
+                if status == 'uploaded':
+                    print(f"✅ Successfully uploaded {filename}")
+                elif status == 'skipped':
+                    print(f"⏭️ Skipped upload for {filename} (content unchanged)")
+                else:
+                    print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
+            except Exception as e:
+                print(f"❌ Error uploading {filename}: {e}")
+            finally:
+                buffer.close()
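The heart of the new jadwal logic is expanding rowspan/colspan cells into a flat day-by-time grid before the schedule is written to a buffer. A minimal, self-contained sketch of that technique is below; the sample HTML and the expand_table helper are illustrative only and are not code from this repository.

from bs4 import BeautifulSoup

SAMPLE = """
<table>
  <thead><tr><th></th><th class="xAxis">Senin</th><th class="xAxis">Selasa</th></tr></thead>
  <tbody>
    <tr><th class="yAxis">07:00</th><td rowspan="2">Matematika</td><td>---</td></tr>
    <tr><th class="yAxis">08:00</th><td>Fisika</td></tr>
  </tbody>
</table>
"""

def expand_table(table):
    days = [th.get_text(strip=True) for th in table.select('thead th.xAxis')]
    times = [th.get_text(strip=True) for th in table.select('tbody th.yAxis')]
    grid = {d: {t: 'kosong' for t in times} for d in days}
    active = {}  # column index -> (rows still covered, cell text)
    for row_idx, row in enumerate(table.select('tbody tr')):
        filled = set()
        # Carry down cells whose rowspan covers this row.
        for col, (left, content) in list(active.items()):
            grid[days[col]][times[row_idx]] = content
            filled.add(col)
            active[col] = (left - 1, content)
            if left - 1 <= 0:
                del active[col]
        col = 0
        for cell in row.select('td'):
            while col in filled:
                col += 1
            if col >= len(days):
                break
            text = cell.get_text(' ', strip=True)
            text = 'kosong' if not text or text == '---' else text
            rowspan = int(cell.get('rowspan', 1))
            colspan = int(cell.get('colspan', 1))
            for c in range(colspan):
                if col + c < len(days):
                    grid[days[col + c]][times[row_idx]] = text
                    if rowspan > 1:
                        active[col + c] = (rowspan - 1, text)
            col += colspan
    return grid

table = BeautifulSoup(SAMPLE, 'html.parser').select_one('table')
print(expand_table(table))
# {'Senin': {'07:00': 'Matematika', '08:00': 'Matematika'},
#  'Selasa': {'07:00': 'kosong', '08:00': 'Fisika'}}

The carried-over entries in active play the same role as active_rowspans in the diff: a course that spans several time slots is copied down its column, and subsequent cells in those rows are shifted to the next free column.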
scrapping/jurusan_scrap.py CHANGED
@@ -6,6 +6,14 @@ from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
 import sys
+from typing import Dict, List
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 # Try import shared dedup upload utility
 try:
@@ -317,10 +325,47 @@
         except Exception as e:
             self.logger.error(f"❌ Gagal upload rekap: {e}")
         finally:
-            if os.path.exists(temp_path):
+            if temp_path and os.path.exists(temp_path):
                 os.remove(temp_path)
 
-
-
-
-
+        # Build and upload REKAP file
+        rekap_filename = f"REKAP_PROGRAM_STUDI_{timestamp}.txt"
+        temp_path = None
+        try:
+            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
+                total_prodi = 0
+                jumlah_jurusan = 0
+                for jurusan_key, daftar in rekap_prodi.items():
+                    valid_prodi = []
+                    for p in daftar:
+                        if is_valid_prodi(p):
+                            valid_prodi.append(p.strip())
+                    if not valid_prodi:
+                        continue
+                    jurusan_baca = jurusan_key.replace("_", " ")
+                    f.write(f"{jurusan_baca}:\n")
+                    for p in sorted(set(valid_prodi)):
+                        f.write(f"- {p}\n")
+                    jumlah_prodi = len(valid_prodi)
+                    f.write(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")
+                    total_prodi += jumlah_prodi
+                    jumlah_jurusan += 1
+                f.write(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
+                f.write(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
+                temp_path = f.name
+            with open(temp_path, 'r', encoding='utf-8') as rf:
+                rekap_text = rf.read()
+            result = upload_if_changed(supabase, bucket, rekap_filename, rekap_text)
+            status = result.get('result')
+            if status == 'uploaded':
+                print(f"✅ Uploaded file rekap: {rekap_filename}")
+            elif status == 'skipped':
+                print(f"⏭️ Skipped upload for rekap {rekap_filename} (content unchanged)")
+            else:
+                print(f"❌ Gagal upload rekap {rekap_filename}: {result.get('error')}")
+        except Exception as e:
+            print(f"❌ Gagal upload rekap: {e}")
+        finally:
+            if temp_path and os.path.exists(temp_path):
+                os.remove(temp_path)
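Both the per-jurusan uploads and the new REKAP upload go through upload_if_changed from scrapping/utils/supabase_utils.py, which is not part of this commit. Judging only from the call sites, it is expected to return a dict whose 'result' key is 'uploaded' or 'skipped', with an 'error' key when something fails. The sketch below is an assumed implementation of that contract, using a content hash for the dedup check; it is not the repository's actual code.

import hashlib
from typing import Any, Dict

def upload_if_changed(supabase, bucket: str, filename: str, content: str) -> Dict[str, Any]:
    """Assumed contract inferred from the spiders: skip the upload when the
    stored object already holds identical content, otherwise upload it."""
    data = content.encode('utf-8')
    try:
        existing = supabase.storage.from_(bucket).download(filename)
        if hashlib.sha256(existing).digest() == hashlib.sha256(data).digest():
            return {'result': 'skipped'}
    except Exception:
        pass  # object missing or download failed: treat as changed
    try:
        supabase.storage.from_(bucket).upload(filename, data, {'content-type': 'text/plain'})
        return {'result': 'uploaded'}
    except Exception as e:
        return {'result': 'error', 'error': str(e)}

Because the filenames embed a timestamp, the real helper likely compares against the most recently uploaded file rather than an object with the exact same name; that detail is outside this commit.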
scrapping/pnp_scrap.py CHANGED
@@ -5,6 +5,16 @@ import re
 import os
 from supabase import create_client, Client
 import html
+from typing import List
+
+# New: HTML parsing and Crawl4AI rendering
+from bs4 import BeautifulSoup
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    import sys
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
@@ -427,21 +437,95 @@
             yield scrapy.Request(
                 url=full_link,
                 callback=self.parse_content,
-                meta={'page_title': '
+                meta={'page_title': '', 'menu_path': menu_path}
             )
 
-
 if __name__ == '__main__':
-
-
-
-
-
-
-    '
-    '
-    '
-
-
-
-
+    # Crawl4AI-based lightweight runner to fetch and upload core pages
+    START_URLS = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']
+
+    def _clean_text(text: str) -> str:
+        if not text:
+            return ''
+        t = html.unescape(' '.join(text.split()))
+        t = t.replace('“', '"').replace('â€', '"').replace('’', "'")
+        t = t.replace('â€"', '—').replace('â€"', '–')
+        return t.strip()
+
+    def _extract_paragraphs(html_text: str, base_url: str) -> List[str]:
+        soup = BeautifulSoup(html_text, 'html.parser')
+        selectors = [
+            'div.entry-content', 'article.post', 'main.site-main',
+            'div.content', 'div.main-content', 'div#content', 'div.page-content'
+        ]
+        content_area = None
+        for sel in selectors:
+            content_area = soup.select_one(sel)
+            if content_area:
+                break
+        nodes = content_area.select('p, h1, h2, h3, h4, h5, h6, li') if content_area else soup.select('p, h1, h2, h3, h4, h5, h6, li')
+        out: List[str] = []
+        for node in nodes:
+            text = _clean_text(node.get_text(' ', strip=True))
+            if text and len(text.split()) >= 5:
+                for a in node.find_all('a', href=True):
+                    href = a['href']
+                    if href and not href.startswith('#'):
+                        abs_url = href if href.startswith('http') else os.path.join(base_url, href)
+                        text += f" (Link: {abs_url})"
+                out.append(text)
+        return out
+
+    def _extract_tables(html_text: str, base_url: str) -> str:
+        soup = BeautifulSoup(html_text, 'html.parser')
+        blocks: List[str] = []
+        for ti, table in enumerate(soup.select('table')):
+            rows = []
+            for tr in table.select('tr'):
+                cells = []
+                for c in tr.select('th, td'):
+                    tx = _clean_text(c.get_text(' ', strip=True))
+                    a = c.find('a', href=True)
+                    if a and a['href']:
+                        href = a['href']
+                        abs_url = href if href.startswith('http') else os.path.join(base_url, href)
+                        tx += f" (Link: {abs_url})"
+                    if tx:
+                        cells.append(tx)
+                if cells:
+                    rows.append(' | '.join(cells))
+            if rows:
+                blocks.append(f"### Tabel {ti + 1}\n\n" + "\n".join(rows))
+        return "\n\n".join(blocks)
+
+    def _final_md(title: str, url: str, paras: List[str], tables: str) -> str:
+        md = f"# {title}\n\n**Tanggal**: {datetime.now().strftime('%d %B %Y')}\n**URL**: {url}\n\n" + "\n".join(paras)
+        if tables:
+            md += "\n\n## Data Tabel\n\n" + tables
+        return md
+
+    def _upload(page_title: str, content_text: str) -> str:
+        safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
+        safe_title = re.sub(r'[-\s]+', '-', safe_title)
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        filename = f"{safe_title}_{timestamp}.txt"
+        try:
+            result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
+            return filename if result.get('result') == 'uploaded' else f"skipped_{filename}"
+        except Exception as e:
+            print(f"Upload error: {e}")
+            return f"failed_{filename}"
+
+    for url in START_URLS:
+        try:
+            html_text = fetch_html_sync(url)
+            soup = BeautifulSoup(html_text, 'html.parser')
+            title_node = soup.select_one('h1.entry-title, h1.page-title')
+            page_title = title_node.get_text(strip=True) if title_node else (soup.title.string.strip() if soup.title and soup.title.string else 'Unknown Page')
+            paras = _extract_paragraphs(html_text, url)
+            tables = _extract_tables(html_text, url)
+            content = _final_md(page_title, url, paras, tables)
+            up = _upload(page_title, content)
+            print(f"[PNP crawl] {url} -> {up}")
+        except Exception as e:
+            print(f"[PNP crawl] Error processing {url}: {e}")
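The standalone runner keeps only text nodes of five words or more and appends outbound link targets to the extracted text. A compact illustration of that filtering on a made-up HTML snippet (the snippet and base URL handling below are illustrative, not a real page from pnp.ac.id):

from bs4 import BeautifulSoup

SNIPPET = """
<div class="entry-content">
  <h2>Penerimaan</h2>
  <p>Pendaftaran mahasiswa baru dibuka hingga akhir bulan ini,
     lihat <a href="/jadwal">jadwal lengkap</a></p>
</div>
"""

soup = BeautifulSoup(SNIPPET, 'html.parser')
area = soup.select_one('div.entry-content')
for node in area.select('p, h1, h2, h3, h4, h5, h6, li'):
    text = ' '.join(node.get_text(' ', strip=True).split())
    if len(text.split()) < 5:      # short headings such as "Penerimaan" are dropped
        continue
    for a in node.find_all('a', href=True):
        if not a['href'].startswith('#'):
            text += f" (Link: https://www.pnp.ac.id{a['href']})"
    print(text)
# Pendaftaran mahasiswa baru dibuka hingga akhir bulan ini, lihat jadwal lengkap (Link: https://www.pnp.ac.id/jadwal)

One caveat on the diff itself: for relative hrefs the runner builds absolute URLs with os.path.join(base_url, href), and os.path.join discards the base whenever the href starts with '/'; urllib.parse.urljoin is the usual tool for that job.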
scrapping/utils/crawl4ai_utils.py ADDED
@@ -0,0 +1,37 @@
+import asyncio
+from typing import Optional
+
+try:
+    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+except Exception as e:
+    AsyncWebCrawler = None  # type: ignore
+    BrowserConfig = None  # type: ignore
+    CrawlerRunConfig = None  # type: ignore
+    CacheMode = None  # type: ignore
+
+
+class Crawl4AIUnavailable(Exception):
+    pass
+
+
+async def fetch_html(url: str, timeout: int = 30, headless: bool = True) -> str:
+    """Fetch rendered HTML using Crawl4AI. Raises Crawl4AIUnavailable if not installed."""
+    if AsyncWebCrawler is None:
+        raise Crawl4AIUnavailable(
+            "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
+        )
+    browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
+    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        result = await crawler.arun(url=url, config=run_conf)
+        # Prefer original HTML when available; fallback to markdown->html isn't provided, so use result.html
+        html = getattr(result, "html", None)
+        if not html:
+            # Some versions expose "content" or only markdown. Fallback to markdown as plain text if needed.
+            html = getattr(result, "content", None) or getattr(result, "markdown", "")
+        return html
+
+
+def fetch_html_sync(url: str, timeout: int = 30, headless: bool = True) -> str:
+    """Synchronous wrapper for fetch_html."""
+    return asyncio.run(fetch_html(url, timeout=timeout, headless=headless))
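For reference, a minimal way to exercise the new helper from the scrapping/ directory. The target URL is just an example; crawl4ai and playwright from requirements.txt must be installed, plus the one-time `python -m playwright install chromium` that the error message above mentions.

import os
import sys

# Same path fallback the spiders use, so this also works as a loose script
# placed next to the utils/ folder.
sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
from crawl4ai_utils import Crawl4AIUnavailable, fetch_html_sync

try:
    html_text = fetch_html_sync('https://www.pnp.ac.id', timeout=30)
    print(f"Rendered HTML length: {len(html_text)} characters")
except Crawl4AIUnavailable as err:
    print(f"Crawl4AI is not available: {err}")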