FauziIsyrinApridal committed on
Commit 04e6021 · 1 Parent(s): 0517d51
middleware.ts CHANGED
@@ -73,4 +73,3 @@ export const config = {
     "/((?!_next/static|_next/image|favicon.ico|.*\\.(?:svg|png|jpg|jpeg|gif|webp)$).*)",
   ],
 };
-
requirements.txt CHANGED
@@ -2,4 +2,6 @@ scrapy
 supabase
 python-dotenv
 requests
-beautifulsoup4
+beautifulsoup4
+crawl4ai
+playwright
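
Note on the two new runtime dependencies: crawl4ai drives a real browser through Playwright, so the Python wheels alone are not enough — the browser binaries are a separate download, and the helper added in scrapping/utils/crawl4ai_utils.py raises Crawl4AIUnavailable with the exact install hint. A minimal, hypothetical pre-flight check (not part of this commit) that only assumes the package names listed above:

# check_env.py -- hypothetical pre-flight check, not part of this commit
import importlib.util
import sys

def crawl_env_ready() -> bool:
    """Report whether crawl4ai and playwright are importable in this environment."""
    missing = [name for name in ("crawl4ai", "playwright") if importlib.util.find_spec(name) is None]
    if missing:
        print("Missing: " + ", ".join(missing))
        print("Run: pip install crawl4ai playwright && python -m playwright install chromium")
        return False
    # The Chromium download is separate; importability alone does not prove the browser is present.
    return True

if __name__ == "__main__":
    sys.exit(0 if crawl_env_ready() else 1)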
scrapping/dosen_scrap.py CHANGED
@@ -5,6 +5,15 @@ import re
 from supabase import create_client
 import os
 import sys
+from typing import List, Dict
+from bs4 import BeautifulSoup
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 # Try import shared dedup upload utility
 try:
@@ -314,12 +323,4 @@ class DosenSpider(scrapy.Spider):
         if item.get('jurusan'):
             paragraph += f" Ia bertugas di {item['jurusan']}."
         if item.get('detail'):
-            paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
-            output.append(paragraph + "\n\n")
-
-        return ''.join(output)
-
-if __name__ == '__main__':
-    process = CrawlerProcess()
-    process.crawl(DosenSpider)
-    process.start()
+            paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
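The new imports give dosen_scrap.py a rendered-HTML path alongside Scrapy: fetch_html_sync is resolved either as utils.crawl4ai_utils or via a sys.path fallback when the script is run from inside scrapping/. A small sketch of how that helper pairs with BeautifulSoup outside the spider; the URL argument and the 'td a' selector are placeholders for illustration, not the spider's real targets:

import os
import sys

from bs4 import BeautifulSoup

# Same fallback this commit adds: prefer the package-style import, else extend sys.path.
try:
    from utils.crawl4ai_utils import fetch_html_sync
except Exception:
    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
    from crawl4ai_utils import fetch_html_sync


def fetch_link_texts(url: str) -> list:
    """Render the page with Crawl4AI, then extract text the same way the spider would."""
    html_text = fetch_html_sync(url)
    soup = BeautifulSoup(html_text, 'html.parser')
    return [a.get_text(strip=True) for a in soup.select('td a')]  # placeholder selector
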
scrapping/jadwal_scrap.py CHANGED
@@ -5,6 +5,24 @@ import re
 from datetime import datetime
 from supabase import create_client
 from io import StringIO
+from typing import Dict, List, Tuple
+from bs4 import BeautifulSoup
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    import sys as _sys
+    _sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
+
+# Shared dedup upload utility
+try:
+    from utils.supabase_utils import upload_if_changed
+except Exception:
+    import sys as _sys2
+    _sys2.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from supabase_utils import upload_if_changed
 
 
 
@@ -398,19 +416,132 @@ class PnpSpider(scrapy.Spider):
         self.process_table_rows(table, schedule_grid, days, time_slots)
         self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
 
-    def extract_title_jurusan_name(self, response):
-        title = response.xpath('//title/text()').get()
-        return title.strip() if title else f"Jurusan_{response.meta.get('jurusan_id')}"
-
-if __name__ == "__main__":
-    process = CrawlerProcess(settings={
-        'DOWNLOAD_DELAY': 1,
-        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
-        'ROBOTSTXT_OBEY': True,
-        'LOG_LEVEL': 'INFO',
-        'HTTPCACHE_ENABLED': False,
-        'CONCURRENT_REQUESTS': 1,
-        'RETRY_TIMES': 3
-    })
-    process.crawl(PnpSpider)
-    process.start()
+        # Days and time slots
+        days = clean_text_list(table.select('thead th.xAxis'))
+        if not days:
+            days = clean_text_list(table.select('thead th[class*="xAxis"]'))
+        time_slots = clean_text_list(table.select('tbody tr:not(.foot) th.yAxis'))
+        if not time_slots:
+            time_slots = clean_text_list(table.select('tbody th[class*="yAxis"]'))
+        if not days or not time_slots:
+            return
+
+        schedule_grid = build_schedule_grid(days, time_slots)
+
+        # Handle rows with rowspans/colspans
+        rows = table.select('tbody tr:not(.foot)')
+        active_rowspans: Dict[Tuple[int, int], Tuple[int, str]] = {}
+        for row_idx, row in enumerate(rows):
+            if row_idx >= len(time_slots):
+                continue
+            current_time = time_slots[row_idx]
+            filled_cols = set()
+
+            # Apply active rowspans
+            to_remove = []
+            for (rs_col_idx, rs_row_start), (rs_left, content) in list(active_rowspans.items()):
+                if rs_left > 0 and rs_col_idx < len(days):
+                    day = days[rs_col_idx]
+                    schedule_grid[day][current_time] = content
+                    filled_cols.add(rs_col_idx)
+                    active_rowspans[(rs_col_idx, rs_row_start)] = (rs_left - 1, content)
+                    if rs_left - 1 <= 0:
+                        to_remove.append((rs_col_idx, rs_row_start))
+            for k in to_remove:
+                del active_rowspans[k]
+
+            # Process this row's cells
+            cells = row.select('td')
+            col_idx = 0
+            for cell in cells:
+                while col_idx < len(days) and col_idx in filled_cols:
+                    col_idx += 1
+                if col_idx >= len(days):
+                    break
+                cell_text = ' '.join(cell.get_text(" ", strip=True).split())
+                cell_text = 'kosong' if not cell_text or cell_text == '---' else cell_text
+                rowspan = int(cell.get('rowspan', '1') or '1')
+                colspan = int(cell.get('colspan', '1') or '1')
+                # update grid
+                for c in range(colspan):
+                    cur_c = col_idx + c
+                    if cur_c < len(days):
+                        schedule_grid[days[cur_c]][current_time] = cell_text
+                # track rowspan
+                if rowspan > 1:
+                    for c in range(colspan):
+                        active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, cell_text)
+                col_idx += colspan
+
+        write_schedule_to_buffer(buffer, schedule_grid, days, time_slots)
+
+# 1) Special Elektro page
+try:
+    elektro_html = fetch_html_sync(ELEKTRO_URL)
+    esoup = BeautifulSoup(elektro_html, 'html.parser')
+    tables = esoup.select('table')
+    if tables:
+        jurusan_id = 'teknik_elektro'
+        jurusan_name = 'Jurusan Teknik Elektro'
+        for idx, tbl in enumerate(tables):
+            process_table(tbl, jurusan_id, jurusan_name, idx)
+except Exception as e:
+    print(f"[Jadwal] Error fetching Elektro page: {e}")
+
+# 2) Presensi home traversal -> jurusan pages -> groups_days_horizontal
+try:
+    home_html = fetch_html_sync(BASE_PRESENSI)
+    hsoup = BeautifulSoup(home_html, 'html.parser')
+    links = set(a.get('href') for a in hsoup.select('article.section a[href]'))
+    for link in links:
+        if not link:
+            continue
+        if any(ex in link.lower() for ex in EXCLUDED):
+            continue
+        jurusan_url = link if link.startswith('http') else (BASE_PRESENSI + link.lstrip('/'))
+        # deduce jurusan_id from dep param
+        m = re.search(r'department\?dep=(\d+)', jurusan_url)
+        jurusan_id = m.group(1) if m else f"unknown_{abs(hash(jurusan_url)) % 1000}"
+
+        try:
+            jur_html = fetch_html_sync(jurusan_url)
+            jsoup = BeautifulSoup(jur_html, 'html.parser')
+            title = jsoup.title.get_text(strip=True) if jsoup.title else f"Jurusan_{jurusan_id}"
+            # find groups_days_horizontal (not subgroups)
+            g_link = None
+            for a in jsoup.select('td a[href]'):
+                href = a.get('href')
+                if href and 'groups_days_horizontal' in href and 'subgroups_days_horizontal' not in href:
+                    g_link = href
+                    break
+            if not g_link:
+                continue
+            g_url = g_link if g_link.startswith('http') else (BASE_PRESENSI + g_link.lstrip('/'))
+            g_html = fetch_html_sync(g_url)
+            gsoup = BeautifulSoup(g_html, 'html.parser')
+            gtables = gsoup.select('table[id^="table_"], table')
+            for idx, tbl in enumerate(gtables):
+                process_table(tbl, jurusan_id=title.replace(' ', '_'), jurusan_name=title, idx=idx)
+        except Exception as inner:
+            print(f"[Jadwal] Error processing jurusan page {jurusan_url}: {inner}")
+except Exception as e:
+    print(f"[Jadwal] Error fetching presensi home: {e}")
+
+# Upload all buffers with dedup
+ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+for jurusan_id, buffer in file_buffers.items():
+    filename = f"{jurusan_id}_{ts}.txt"
+    content = buffer.getvalue()
+    try:
+        result = upload_if_changed(supabase, bucket, filename, content)
+        status = result.get('result')
+        if status == 'uploaded':
+            print(f"✅ Successfully uploaded {filename}")
+        elif status == 'skipped':
+            print(f"⏭️ Skipped upload for {filename} (content unchanged)")
+        else:
+            print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
+    except Exception as e:
+        print(f"❌ Error uploading {filename}: {e}")
+    finally:
+        buffer.close()
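
The heart of the new jadwal logic is the rowspan/colspan bookkeeping: a cell with rowspan=N is written into its column for the current time slot and then carried into the next N-1 rows through the active_rowspans map keyed by (column index, starting row). A self-contained sketch of that idea on a toy timetable; the markup, the fill_grid helper, and the simplified handling (no colspan, no .foot rows) are illustration only, not the script's real helpers:

from bs4 import BeautifulSoup

# Toy timetable: Monday's first cell spans two time slots (rowspan=2).
TOY = """
<table>
  <thead><tr><th class="yAxis"></th><th class="xAxis">Senin</th><th class="xAxis">Selasa</th></tr></thead>
  <tbody>
    <tr><th class="yAxis">07:00</th><td rowspan="2">Matematika</td><td>---</td></tr>
    <tr><th class="yAxis">08:00</th><td>Fisika</td></tr>
  </tbody>
</table>
"""

def fill_grid(table) -> dict:
    days = [th.get_text(strip=True) for th in table.select('thead th.xAxis')]
    time_slots = [th.get_text(strip=True) for th in table.select('tbody th.yAxis')]
    grid = {d: {t: 'kosong' for t in time_slots} for d in days}
    active = {}  # (col_idx, start_row) -> (rows_left, text)
    for row_idx, row in enumerate(table.select('tbody tr')):
        current_time = time_slots[row_idx]
        filled = set()
        # Carry cells from earlier rowspans into this row first.
        for key, (left, text) in list(active.items()):
            col = key[0]
            grid[days[col]][current_time] = text
            filled.add(col)
            active[key] = (left - 1, text)
            if left - 1 <= 0:
                del active[key]
        col_idx = 0
        for cell in row.select('td'):
            while col_idx in filled:
                col_idx += 1
            if col_idx >= len(days):
                break
            text = cell.get_text(' ', strip=True) or '---'
            text = 'kosong' if text == '---' else text
            rowspan = int(cell.get('rowspan', '1'))
            grid[days[col_idx]][current_time] = text
            if rowspan > 1:
                active[(col_idx, row_idx)] = (rowspan - 1, text)
            col_idx += 1
    return grid

if __name__ == '__main__':
    grid = fill_grid(BeautifulSoup(TOY, 'html.parser').select_one('table'))
    # Matematika appears at both 07:00 and 08:00 under Senin: the carry-down behaviour above.
    print(grid)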
scrapping/jurusan_scrap.py CHANGED
@@ -6,6 +6,14 @@ from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
 import sys
+from typing import Dict, List
+
+# Crawl4AI helper for rendered fetching
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 # Try import shared dedup upload utility
 try:
@@ -317,10 +325,47 @@ class JurusanSpider(scrapy.Spider):
         except Exception as e:
             self.logger.error(f"❌ Gagal upload rekap: {e}")
         finally:
-            if os.path.exists(temp_path):
+            if temp_path and os.path.exists(temp_path):
                 os.remove(temp_path)
 
-if __name__ == "__main__":
-    process = CrawlerProcess()
-    process.crawl(JurusanSpider)
-    process.start()
+        # Build and upload REKAP file
+        rekap_filename = f"REKAP_PROGRAM_STUDI_{timestamp}.txt"
+        temp_path = None
+        try:
+            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
+                total_prodi = 0
+                jumlah_jurusan = 0
+                for jurusan_key, daftar in rekap_prodi.items():
+                    valid_prodi = []
+                    for p in daftar:
+                        if is_valid_prodi(p):
+                            valid_prodi.append(p.strip())
+                    if not valid_prodi:
+                        continue
+                    jurusan_baca = jurusan_key.replace("_", " ")
+                    f.write(f"{jurusan_baca}:\n")
+                    for p in sorted(set(valid_prodi)):
+                        f.write(f"- {p}\n")
+                    jumlah_prodi = len(valid_prodi)
+                    f.write(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")
+                    total_prodi += jumlah_prodi
+                    jumlah_jurusan += 1
+                f.write(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
+                f.write(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
+                temp_path = f.name
+            with open(temp_path, 'r', encoding='utf-8') as rf:
+                rekap_text = rf.read()
+            result = upload_if_changed(supabase, bucket, rekap_filename, rekap_text)
+            status = result.get('result')
+            if status == 'uploaded':
+                print(f"✅ Uploaded file rekap: {rekap_filename}")
+            elif status == 'skipped':
+                print(f"⏭️ Skipped upload for rekap {rekap_filename} (content unchanged)")
+            else:
+                print(f"❌ Gagal upload rekap {rekap_filename}: {result.get('error')}")
+        except Exception as e:
+            print(f"❌ Gagal upload rekap: {e}")
+        finally:
+            if temp_path and os.path.exists(temp_path):
+                os.remove(temp_path)
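
The added REKAP block folds rekap_prodi (jurusan name -> list of program studi) into one summary file with per-jurusan counts plus grand totals. A stripped-down sketch of just that aggregation, using toy data and a permissive stand-in for is_valid_prodi; the real spider writes the same layout through a temporary file and upload_if_changed:

from datetime import datetime

# Toy data; the real spider fills rekap_prodi while crawling jurusan pages.
rekap_prodi = {
    "Teknologi_Informasi": ["D3 Teknik Komputer", "D4 Teknologi Rekayasa Perangkat Lunak"],
    "Teknik_Elektro": ["D3 Teknik Listrik"],
}

def is_valid_prodi(name: str) -> bool:
    # Stand-in for the spider's real validation rule.
    return bool(name and name.strip())

def build_rekap(rekap: dict) -> str:
    lines = ["# REKAP PROGRAM STUDI PNP",
             f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}", ""]
    total_prodi = 0
    jumlah_jurusan = 0
    for jurusan_key, daftar in rekap.items():
        valid = sorted({p.strip() for p in daftar if is_valid_prodi(p)})
        if not valid:
            continue
        jurusan_baca = jurusan_key.replace("_", " ")
        lines.append(f"{jurusan_baca}:")
        lines.extend(f"- {p}" for p in valid)
        lines.append(f"Jumlah program studi jurusan {jurusan_baca}: {len(valid)}")
        lines.append("")
        total_prodi += len(valid)
        jumlah_jurusan += 1
    lines.append(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}")
    lines.append(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}")
    return "\n".join(lines)

if __name__ == "__main__":
    print(build_rekap(rekap_prodi))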
scrapping/pnp_scrap.py CHANGED
@@ -5,6 +5,16 @@ import re
 import os
 from supabase import create_client, Client
 import html
+from typing import List
+
+# New: HTML parsing and Crawl4AI rendering
+from bs4 import BeautifulSoup
+try:
+    from utils.crawl4ai_utils import fetch_html_sync
+except Exception:
+    import sys
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
+    from crawl4ai_utils import fetch_html_sync
 
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
@@ -427,21 +437,95 @@ class PNPContentSpider(scrapy.Spider):
             yield scrapy.Request(
                 url=full_link,
                 callback=self.parse_content,
-                meta={'page_title': 'Header Link', 'menu_path': f"{menu_path} > Header"}
+                meta={'page_title': '', 'menu_path': menu_path}
             )
 
-
 if __name__ == '__main__':
-    process = CrawlerProcess({
-        'USER_AGENT': 'PNPBot/1.0',
-        'DOWNLOAD_DELAY': 2,
-        'ROBOTSTXT_OBEY': True,
-        'LOG_LEVEL': 'INFO',
-        'CONCURRENT_REQUESTS': 1,
-        'DOWNLOAD_TIMEOUT': 100,
-        'RETRY_TIMES': 3,
-        'HTTPCACHE_ENABLED': False,
-        'FEED_EXPORT_ENCODING': 'utf-8'
-    })
-    process.crawl(PNPContentSpider)
-    process.start()
+    # Crawl4AI-based lightweight runner to fetch and upload core pages
+    START_URLS = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']
+
+    def _clean_text(text: str) -> str:
+        if not text:
+            return ''
+        t = html.unescape(' '.join(text.split()))
+        t = t.replace('“', '"').replace('â€', '"').replace('’', "'")
+        t = t.replace('â€"', '—').replace('â€"', '–')
+        return t.strip()
+
+    def _extract_paragraphs(html_text: str, base_url: str) -> List[str]:
+        soup = BeautifulSoup(html_text, 'html.parser')
+        selectors = [
+            'div.entry-content', 'article.post', 'main.site-main',
+            'div.content', 'div.main-content', 'div#content', 'div.page-content'
+        ]
+        content_area = None
+        for sel in selectors:
+            content_area = soup.select_one(sel)
+            if content_area:
+                break
+        nodes = content_area.select('p, h1, h2, h3, h4, h5, h6, li') if content_area else soup.select('p, h1, h2, h3, h4, h5, h6, li')
+        out: List[str] = []
+        for node in nodes:
+            text = _clean_text(node.get_text(' ', strip=True))
+            if text and len(text.split()) >= 5:
+                for a in node.find_all('a', href=True):
+                    href = a['href']
+                    if href and not href.startswith('#'):
+                        abs_url = href if href.startswith('http') else os.path.join(base_url, href)
+                        text += f" (Link: {abs_url})"
+                out.append(text)
+        return out
+
+    def _extract_tables(html_text: str, base_url: str) -> str:
+        soup = BeautifulSoup(html_text, 'html.parser')
+        blocks: List[str] = []
+        for ti, table in enumerate(soup.select('table')):
+            rows = []
+            for tr in table.select('tr'):
+                cells = []
+                for c in tr.select('th, td'):
+                    tx = _clean_text(c.get_text(' ', strip=True))
+                    a = c.find('a', href=True)
+                    if a and a['href']:
+                        href = a['href']
+                        abs_url = href if href.startswith('http') else os.path.join(base_url, href)
+                        tx += f" (Link: {abs_url})"
+                    if tx:
+                        cells.append(tx)
+                if cells:
+                    rows.append(' | '.join(cells))
+            if rows:
+                blocks.append(f"### Tabel {ti + 1}\n\n" + "\n".join(rows))
+        return "\n\n".join(blocks)
+
+    def _final_md(title: str, url: str, paras: List[str], tables: str) -> str:
+        md = f"# {title}\n\n**Tanggal**: {datetime.now().strftime('%d %B %Y')}\n**URL**: {url}\n\n" + "\n".join(paras)
+        if tables:
+            md += "\n\n## Data Tabel\n\n" + tables
+        return md
+
+    def _upload(page_title: str, content_text: str) -> str:
+        safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
+        safe_title = re.sub(r'[-\s]+', '-', safe_title)
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        filename = f"{safe_title}_{timestamp}.txt"
+        try:
+            result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
+            return filename if result.get('result') == 'uploaded' else f"skipped_{filename}"
+        except Exception as e:
+            print(f"Upload error: {e}")
+            return f"failed_{filename}"
+
+    for url in START_URLS:
+        try:
+            html_text = fetch_html_sync(url)
+            soup = BeautifulSoup(html_text, 'html.parser')
+            title_node = soup.select_one('h1.entry-title, h1.page-title')
+            page_title = title_node.get_text(strip=True) if title_node else (soup.title.string.strip() if soup.title and soup.title.string else 'Unknown Page')
+            paras = _extract_paragraphs(html_text, url)
+            tables = _extract_tables(html_text, url)
+            content = _final_md(page_title, url, paras, tables)
+            up = _upload(page_title, content)
+            print(f"[PNP crawl] {url} -> {up}")
+        except Exception as e:
+            print(f"[PNP crawl] Error processing {url}: {e}")
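
The runner's nested helpers all follow one extraction pattern: pick the first matching content container, keep only text nodes with at least five words, and annotate outgoing links inline as "(Link: ...)". A compact, self-contained rendition of that pattern on an inline snippet; the shortened selector list and the simple string-based URL join are illustration choices, not the committed code (which uses os.path.join):

from typing import List

from bs4 import BeautifulSoup

SAMPLE = """
<div class="entry-content">
  <h2>Penerimaan Mahasiswa Baru</h2>
  <p>Pendaftaran jalur mandiri dibuka hingga akhir bulan, lihat
     <a href="/pengumuman">pengumuman resmi</a> untuk rincian jadwal.</p>
  <p>OK</p>  <!-- dropped: fewer than five words -->
</div>
"""

def extract_paragraphs(html_text: str, base_url: str) -> List[str]:
    soup = BeautifulSoup(html_text, 'html.parser')
    area = soup.select_one('div.entry-content, div.content') or soup
    out: List[str] = []
    for node in area.select('p, h1, h2, h3, li'):
        text = ' '.join(node.get_text(' ', strip=True).split())
        if len(text.split()) < 5:
            continue  # same minimum-length filter the runner applies
        for a in node.find_all('a', href=True):
            href = a['href']
            if not href.startswith('#'):
                abs_url = href if href.startswith('http') else base_url.rstrip('/') + '/' + href.lstrip('/')
                text += f" (Link: {abs_url})"
        out.append(text)
    return out

if __name__ == '__main__':
    for p in extract_paragraphs(SAMPLE, 'https://www.pnp.ac.id'):
        print(p)

For absolutizing relative links, urllib.parse.urljoin would be more robust than either join shown here.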
scrapping/utils/crawl4ai_utils.py ADDED
@@ -0,0 +1,37 @@
+import asyncio
+from typing import Optional
+
+try:
+    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+except Exception as e:
+    AsyncWebCrawler = None  # type: ignore
+    BrowserConfig = None  # type: ignore
+    CrawlerRunConfig = None  # type: ignore
+    CacheMode = None  # type: ignore
+
+
+class Crawl4AIUnavailable(Exception):
+    pass
+
+
+async def fetch_html(url: str, timeout: int = 30, headless: bool = True) -> str:
+    """Fetch rendered HTML using Crawl4AI. Raises Crawl4AIUnavailable if not installed."""
+    if AsyncWebCrawler is None:
+        raise Crawl4AIUnavailable(
+            "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
+        )
+    browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
+    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        result = await crawler.arun(url=url, config=run_conf)
+        # Prefer original HTML when available; fallback to markdown->html isn't provided, so use result.html
+        html = getattr(result, "html", None)
+        if not html:
+            # Some versions expose "content" or only markdown. Fallback to markdown as plain text if needed.
+            html = getattr(result, "content", None) or getattr(result, "markdown", "")
+        return html
+
+
+def fetch_html_sync(url: str, timeout: int = 30, headless: bool = True) -> str:
+    """Synchronous wrapper for fetch_html."""
+    return asyncio.run(fetch_html(url, timeout=timeout, headless=headless))
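
A usage sketch for the new helper, assuming scrapping/utils is importable the same way the spiders arrange it (package path or sys.path fallback). Note that fetch_html_sync wraps asyncio.run, so it must be called from plain synchronous code; inside an already-running event loop, await fetch_html directly instead. The URL is just an example taken from the spiders:

from bs4 import BeautifulSoup

# Assumes scrapping/utils is on sys.path, as the spiders set up.
from crawl4ai_utils import Crawl4AIUnavailable, fetch_html_sync


def page_title(url: str) -> str:
    """Render a page with Crawl4AI and return its <title>, or an error marker."""
    try:
        html_text = fetch_html_sync(url, timeout=30)
    except Crawl4AIUnavailable as exc:
        return f"unavailable: {exc}"
    soup = BeautifulSoup(html_text, 'html.parser')
    return soup.title.get_text(strip=True) if soup.title else ''


if __name__ == '__main__':
    print(page_title('https://www.pnp.ac.id'))  # example URL used by the spiders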