FauziIsyrinApridal committed
Commit: 06b1c18
Parent(s): d45677d
Files changed: scrapping/jurusan_scrap.py (+71 -65)
scrapping/jurusan_scrap.py
CHANGED
@@ -6,7 +6,7 @@ from supabase import create_client
 from datetime import datetime
 import os, re, tempfile
 
-# Load environment variables
+# Load environment variables
 load_dotenv()
 SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
@@ -14,99 +14,105 @@ SUPABASE_BUCKET = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-stor
 
 class JurusanSpider(scrapy.Spider):
     name = "jurusan"
-    ]
-
-    custom_settings = {
-        "LOG_LEVEL": "INFO",
-        "USER_AGENT": "Mozilla/5.0",
-    }
+    custom_settings = {"LOG_LEVEL": "INFO", "USER_AGENT": "Mozilla/5.0"}
+
+    domain_to_name = {
+        'akt.pnp.ac.id': 'Akuntansi',
+        'an.pnp.ac.id': 'Administrasi_Niaga',
+        'bing.pnp.ac.id': 'Bahasa_Inggris',
+        'elektro.pnp.ac.id': 'Teknik_Elektro',
+        'me.pnp.ac.id': 'Teknik_Mesin',
+        'sipil.pnp.ac.id': 'Teknik_Sipil',
+        'ti.pnp.ac.id': 'Teknologi_Informasi',
+    }
+
+    start_urls = [f"https://{d}/" for d in domain_to_name.keys()]
 
     def __init__(self):
         self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
         self.bucket = SUPABASE_BUCKET
         self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        self.collected_data = []
-        self.domain_to_name = {
-            'akt.pnp.ac.id': 'Akuntansi',
-            'an.pnp.ac.id': 'Administrasi_Niaga',
-            'bing.pnp.ac.id': 'Bahasa_Inggris',
-            'elektro.pnp.ac.id': 'Teknik_Elektro',
-            'me.pnp.ac.id': 'Teknik_Mesin',
-            'sipil.pnp.ac.id': 'Teknik_Sipil',
-            'ti.pnp.ac.id': 'Teknologi_Informasi',
-        }
+        self.rekap_prodi = {}
+        self.per_jurusan_pages = {}
 
     def parse(self, response):
         domain = response.url.split("//")[1].split("/")[0]
         jurusan = self.domain_to_name.get(domain, domain)
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Cari link penting
+        all_links = set()
+        for a in soup.find_all("a", href=True):
+            href = a["href"]
+            if href.startswith("http") and domain in href:
+                all_links.add(href)
+            elif href.startswith("/"):
+                all_links.add(response.urljoin(href))
+
+        # Ambil link Program Studi
+        program_studi = []
+        menu_elements = soup.find_all("a", string=re.compile("program studi", re.I))
+        for menu in menu_elements:
+            ul = menu.find_next("ul")
+            if ul:
+                for li in ul.find_all("li"):
+                    item = li.get_text(strip=True)
+                    if item and item not in program_studi:
+                        program_studi.append(item)
+
+        self.rekap_prodi[jurusan] = program_studi
+
+        for link in all_links:
+            yield scrapy.Request(link, callback=self.parse_detail, meta={"jurusan": jurusan, "url": link})
+
+    def parse_detail(self, response):
+        jurusan = response.meta["jurusan"]
+        url = response.meta["url"]
 
         soup = BeautifulSoup(response.text, "html.parser")
         for tag in soup(["script", "style", "noscript"]):
             tag.decompose()
         visible_text = soup.get_text(separator="\n")
         lines = [line.strip() for line in visible_text.splitlines()]
         lines = [line for line in lines if line and not re.match(r'^\W+$', line)]
         text_cleaned = "\n".join(lines)[:8000]
 
-        menu_elements = soup.find_all("a", string=re.compile("program studi", re.I))
-        for menu in menu_elements:
-            program_studi.append(menu.get_text(strip=True))
-
-        self.collected_data.append({
-            "jurusan": jurusan,
+        self.per_jurusan_pages.setdefault(jurusan, []).append({
             "url": url,
-            "profil": text_cleaned,
+            "content": text_cleaned
         })
 
-        temp_path = f.name
-        """Dipanggil saat spider selesai - membuat dan upload file rekap"""
+    def closed(self, reason):
+        # Upload file per jurusan
+        for jurusan, pages in self.per_jurusan_pages.items():
+            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
+            try:
+                with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                    for page in pages:
+                        f.write(f"=== [JURUSAN: {jurusan.replace('_', ' ').upper()}] ===\n")
+                        f.write(f"=== [HALAMAN: {page['url']}] ===\n\n")
+                        f.write(page["content"] + "\n\n")
+                    temp_path = f.name
+
+                self.supabase.storage.from_(self.bucket).upload(
+                    path=filename,
+                    file=temp_path,
+                    file_options={"content-type": "text/plain"}
+                )
+                self.logger.info(f"✅ Uploaded file jurusan: {filename}")
+            except Exception as e:
+                self.logger.error(f"❌ Gagal upload {filename}: {e}")
+            finally:
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
+
+        # Upload file rekap program studi
         filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
         try:
             with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
                 f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
                 total = 0
-                for item in self.collected_data:
-                    jurusan = item["jurusan"]
-                    daftar = item["prodi"]
+                for jurusan, daftar in self.rekap_prodi.items():
                     f.write(f"{jurusan.replace('_', ' ')}:\n")
                     for p in daftar:
                         f.write(f"- {p}\n")
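For local testing, a minimal sketch of how the updated spider could be driven programmatically is shown below. It assumes the NEXT_PUBLIC_SUPABASE_* variables are set in the environment (or loaded from a local .env) and that scrapping/jurusan_scrap.py is importable from the project root; the helper script name is hypothetical and is not part of this commit.

# run_jurusan_local.py - hypothetical local driver, assumes scrapping/ is importable
from scrapy.crawler import CrawlerProcess
from scrapping.jurusan_scrap import JurusanSpider

if __name__ == "__main__":
    # CrawlerProcess applies the spider's own custom_settings (LOG_LEVEL, USER_AGENT)
    process = CrawlerProcess()
    process.crawl(JurusanSpider)
    process.start()  # blocks until all start_urls and followed links are scraped

When the crawl finishes, the spider's closed() hook uploads one text file per jurusan plus the REKAP_PROGRAM_STUDI file to the configured Supabase storage bucket.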