FauziIsyrinApridal committed on
Commit 06b1c18 · 1 Parent(s): d45677d
Files changed (1)
  1. scrapping/jurusan_scrap.py +71 -65
scrapping/jurusan_scrap.py CHANGED
@@ -6,7 +6,7 @@ from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
 
-# Load environment variables from .env
+# Load environment variables
 load_dotenv()
 SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
@@ -14,99 +14,105 @@ SUPABASE_BUCKET = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-stor
 
 class JurusanSpider(scrapy.Spider):
     name = "jurusan"
-    start_urls = [
-        f"https://{url}/" for url in {
-            'akt.pnp.ac.id': 'Akuntansi',
-            'an.pnp.ac.id': 'Administrasi_Niaga',
-            'bing.pnp.ac.id': 'Bahasa_Inggris',
-            'elektro.pnp.ac.id': 'Teknik_Elektro',
-            'me.pnp.ac.id': 'Teknik_Mesin',
-            'sipil.pnp.ac.id': 'Teknik_Sipil',
-            'ti.pnp.ac.id': 'Teknologi_Informasi',
-        }.keys()
-    ]
-
-    custom_settings = {
-        "LOG_LEVEL": "INFO",
-        "USER_AGENT": "Mozilla/5.0",
+    custom_settings = {"LOG_LEVEL": "INFO", "USER_AGENT": "Mozilla/5.0"}
+
+    domain_to_name = {
+        'akt.pnp.ac.id': 'Akuntansi',
+        'an.pnp.ac.id': 'Administrasi_Niaga',
+        'bing.pnp.ac.id': 'Bahasa_Inggris',
+        'elektro.pnp.ac.id': 'Teknik_Elektro',
+        'me.pnp.ac.id': 'Teknik_Mesin',
+        'sipil.pnp.ac.id': 'Teknik_Sipil',
+        'ti.pnp.ac.id': 'Teknologi_Informasi',
     }
 
+    start_urls = [f"https://{d}/" for d in domain_to_name.keys()]
+
     def __init__(self):
         self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
         self.bucket = SUPABASE_BUCKET
         self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        self.collected_data = []
-        self.domain_to_name = {
-            'akt.pnp.ac.id': 'Akuntansi',
-            'an.pnp.ac.id': 'Administrasi_Niaga',
-            'bing.pnp.ac.id': 'Bahasa_Inggris',
-            'elektro.pnp.ac.id': 'Teknik_Elektro',
-            'me.pnp.ac.id': 'Teknik_Mesin',
-            'sipil.pnp.ac.id': 'Teknik_Sipil',
-            'ti.pnp.ac.id': 'Teknologi_Informasi',
-        }
+        self.rekap_prodi = {}
+        self.per_jurusan_pages = {}
 
     def parse(self, response):
         domain = response.url.split("//")[1].split("/")[0]
         jurusan = self.domain_to_name.get(domain, domain)
-        url = response.url
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Find the important links
+        all_links = set()
+        for a in soup.find_all("a", href=True):
+            href = a["href"]
+            if href.startswith("http") and domain in href:
+                all_links.add(href)
+            elif href.startswith("/"):
+                all_links.add(response.urljoin(href))
+
+        # Grab the Program Studi links
+        program_studi = []
+        menu_elements = soup.find_all("a", string=re.compile("program studi", re.I))
+        for menu in menu_elements:
+            ul = menu.find_next("ul")
+            if ul:
+                for li in ul.find_all("li"):
+                    item = li.get_text(strip=True)
+                    if item and item not in program_studi:
+                        program_studi.append(item)
+
+        self.rekap_prodi[jurusan] = program_studi
+
+        for link in all_links:
+            yield scrapy.Request(link, callback=self.parse_detail, meta={"jurusan": jurusan, "url": link})
+
+    def parse_detail(self, response):
+        jurusan = response.meta["jurusan"]
+        url = response.meta["url"]
 
         soup = BeautifulSoup(response.text, "html.parser")
         for tag in soup(["script", "style", "noscript"]):
             tag.decompose()
         visible_text = soup.get_text(separator="\n")
-
         lines = [line.strip() for line in visible_text.splitlines()]
         lines = [line for line in lines if line and not re.match(r'^\W+$', line)]
         text_cleaned = "\n".join(lines)[:8000]
 
-        program_studi = []
-        menu_elements = soup.find_all("a", string=re.compile("program studi", re.I))
-        for menu in menu_elements:
-            program_studi.append(menu.get_text(strip=True))
-
-        self.collected_data.append({
-            "jurusan": jurusan,
+        self.per_jurusan_pages.setdefault(jurusan, []).append({
             "url": url,
-            "prodi": program_studi,
-            "profil": text_cleaned,
+            "content": text_cleaned
         })
 
-        filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
-        try:
-            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
-                f.write(f"[Jurusan: {jurusan.replace('_', ' ')}]\n\n")
-                f.write(f"URL: {url}\n\n")
-                f.write("Program Studi:\n")
-                for i, p in enumerate(program_studi, 1):
-                    f.write(f"{i}. {p}\n")
-                f.write(f"\nTotal Program Studi: {len(program_studi)}\n\n")
-                f.write("Profil Jurusan:\n")
-                f.write(text_cleaned + "\n\n")
-                temp_path = f.name
+    def closed(self, reason):
+        # Upload one file per jurusan
+        for jurusan, pages in self.per_jurusan_pages.items():
+            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
+            try:
+                with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                    for page in pages:
+                        f.write(f"=== [JURUSAN: {jurusan.replace('_', ' ').upper()}] ===\n")
+                        f.write(f"=== [HALAMAN: {page['url']}] ===\n\n")
+                        f.write(page["content"] + "\n\n")
+                    temp_path = f.name
 
-            self.supabase.storage.from_(self.bucket).upload(
-                path=filename,
-                file=temp_path,
-                file_options={"content-type": "text/plain"}
-            )
-            self.logger.info(f"✅ Uploaded jurusan file: {filename}")
-        except Exception as e:
-            self.logger.error(f"❌ Failed to upload {filename}: {e}")
-        finally:
-            if os.path.exists(temp_path):
-                os.remove(temp_path)
+                self.supabase.storage.from_(self.bucket).upload(
+                    path=filename,
+                    file=temp_path,
+                    file_options={"content-type": "text/plain"}
+                )
+                self.logger.info(f"✅ Uploaded jurusan file: {filename}")
+            except Exception as e:
+                self.logger.error(f"❌ Failed to upload {filename}: {e}")
+            finally:
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
 
-    def closed(self, reason):
-        """Called when the spider finishes: builds and uploads the recap file."""
+        # Upload the Program Studi recap file
         filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
         try:
             with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
                 f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
                 total = 0
-                for item in self.collected_data:
-                    jurusan = item["jurusan"]
-                    daftar = item["prodi"]
+                for jurusan, daftar in self.rekap_prodi.items():
                     f.write(f"{jurusan.replace('_', ' ')}:\n")
                     for p in daftar:
                         f.write(f"- {p}\n")