Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import pandas as pd
|
4 |
+
from typing import List, Tuple
|
5 |
+
import time
|
6 |
+
from pandas.io.formats.style import Styler
|
7 |
+
import streamlit as st
|
8 |
+
import os
|
9 |
+
from datetime import datetime
|
10 |
+
import io
|
11 |
+
|
12 |
+
# Endpoint requested by fetch_page; paging is controlled through query parameters.
BASE_URL = "https://cgc.twse.com.tw/front/chPage"
|
13 |
+
|
14 |
+
def fetch_page(offset: int, max_per: int = 30, fmt: str = "") -> str:
    """Download one listing page and return its body as text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        max_per: Value sent as the ``max`` query parameter.
        fmt: Value sent as the ``format`` query parameter.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    query = dict(offset=offset, max=max_per, format=fmt)
    response = requests.get(BASE_URL, timeout=10, params=query)
    # Turn 4xx/5xx answers into exceptions so callers never parse an error page.
    response.raise_for_status()
    return response.text
|
19 |
+
|
20 |
+
def parse_companies(html: str) -> List[Tuple[str, str, str]]:
    """Extract ``(code, name, url)`` tuples from the table rows of a page.

    Only rows with at least three ``<td>`` cells are considered. The second
    cell supplies the code and the third cell supplies the name and, when it
    contains an ``<a href=...>``, the URL (otherwise an empty string). Rows
    whose code is not purely digits are dropped.

    Args:
        html: Markup of one listing page.

    Returns:
        The matching rows in document order.
    """
    document = BeautifulSoup(html, "html.parser")
    companies = []
    for row in document.select("table tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            continue

        name_cell = cells[2]
        code = cells[1].get_text(strip=True)
        name = name_cell.get_text(strip=True)

        anchor = name_cell.find("a")
        href = anchor.get("href") if anchor is not None else None
        url = "" if href is None else href.strip()

        # Non-numeric codes are discarded.
        if code.isdigit():
            companies.append((code, name, url))
    return companies
|
33 |
+
|
34 |
+
def collect_all(start_offset: int = 0, max_per: int = 30, max_pages: int = 100, progress_bar=None, status_text=None) -> pd.DataFrame:
    """Fetch consecutive pages and gather every parsed company into a DataFrame.

    Paging stops at the first page that yields no rows, after ``max_pages``
    requests, or at the first exception. Any exception is reported through
    ``status_text`` (when given) and ends the loop; rows collected so far are
    still returned.

    Args:
        start_offset: Offset used for the first request.
        max_per: Page size; also the amount the offset advances per page.
        max_pages: Upper bound on the number of requests.
        progress_bar: Optional object exposing ``progress(fraction)``.
        status_text: Optional object exposing ``text(message)``.

    Returns:
        DataFrame with the columns 編號 (1-based running number), 公司代碼,
        公司名稱 and 公司網址.
    """
    collected = []
    offset = start_offset

    for page_number in range(1, max_pages + 1):
        try:
            # Report progress before the request so the UI updates while waiting.
            if progress_bar:
                progress_bar.progress(page_number / max_pages)
            if status_text:
                status_text.text(f"正在爬取第 {page_number} 頁,偏移量: {offset}")

            page_rows = parse_companies(fetch_page(offset, max_per))
            if not page_rows:
                # An empty page ends the crawl.
                if status_text:
                    status_text.text(f"已完成爬取,共處理 {page_number} 頁")
                break

            collected.extend(page_rows)
            offset += max_per
            # Pause between requests.
            time.sleep(0.5)
        except Exception as e:
            if status_text:
                status_text.text(f"錯誤發生於偏移量 {offset}: {e}")
            break

    # Prepend a 1-based running number column.
    frame = pd.DataFrame(collected, columns=["公司代碼", "公司名稱", "公司網址"])
    frame.insert(0, "編號", range(1, len(frame) + 1))
    return frame
|
64 |
+
|
65 |
+
def style_dataframe(df: pd.DataFrame) -> Styler:
|
66 |
+
"""
|
67 |
+
設定DataFrame的樣式:
|
68 |
+
- 編號、公司代碼、公司名稱欄位標題為藍色背景
|
69 |
+
- 每個欄位的值交替黃色背景
|
70 |
+
"""
|
71 |
+
def header_style(s):
|
72 |
+
"""設定標題樣式"""
|
73 |
+
styles = []
|
74 |
+
for col in s.index:
|
75 |
+
if col in ["編號", "公司代碼", "公司名稱"]:
|
76 |
+
styles.append('background-color: #4472C4; color: white; font-weight: bold')
|
77 |
+
else:
|
78 |
+
styles.append('background-color: #D9D9D9; color: black; font-weight: bold')
|
79 |
+
return styles
|
80 |
+
|
81 |
+
def alternating_rows(s):
|
82 |
+
"""設定交替行顏色"""
|
83 |
+
styles = []
|
84 |
+
for i, col in enumerate(s.index):
|
85 |
+
if col in ["編號", "公司代碼", "公司名稱"]:
|
86 |
+
if s.name % 2 == 0: # 偶數行
|
87 |
+
styles.append('background-color: #FFF2CC') # 淺黃色
|
88 |
+
else: # 奇數行
|
89 |
+
styles.append('background-color: #FFFFFF') # 白色
|
90 |
+
else:
|
91 |
+
styles.append('background-color: #F8F8F8') # 淺灰色
|
92 |
+
return styles
|
93 |
+
|
94 |
+
# 應用樣式
|
95 |
+
styled = df.style.apply(alternating_rows, axis=1).apply(header_style, axis=0)
|
96 |
+
|
97 |
+
# 設定表格整體樣式
|
98 |
+
styled = styled.set_table_styles([
|
99 |
+
{'selector': 'th', 'props': [('text-align', 'center'), ('padding', '8px')]},
|
100 |
+
{'selector': 'td', 'props': [('text-align', 'center'), ('padding', '6px')]},
|
101 |
+
{'selector': 'table', 'props': [('border-collapse', 'collapse'), ('margin', 'auto')]},
|
102 |
+
{'selector': 'th, td', 'props': [('border', '1px solid #CCCCCC')]}
|
103 |
+
])
|
104 |
+
|
105 |
+
return styled
|
106 |
+
|
107 |
+
def save_to_excel(df: pd.DataFrame) -> bytes:
    """Serialise ``df`` to a formatted ``.xlsx`` workbook held in memory.

    The sheet is named 公司資料 and written without the index. Formatting:

    - the first four columns get fixed widths (8, 12, 25, 40);
    - header cells are bold, centred and bordered; the 編號 / 公司代碼 /
      公司名稱 headers are white on blue, and any other header is black on
      gray so the text stays readable;
    - data cells are centred and bordered; cells in those three columns
      alternate light yellow / white by row.

    Styling follows ``df.columns`` rather than fixed positions, so it stays
    correct if the frame's columns are reordered or extended.

    Args:
        df: Frame to export.

    Returns:
        The bytes of the generated workbook.
    """
    from openpyxl.styles import PatternFill, Font, Alignment, Border, Side

    key_columns = ("編號", "公司代碼", "公司名稱")

    # Fills
    blue_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    yellow_fill = PatternFill(start_color="FFF2CC", end_color="FFF2CC", fill_type="solid")
    white_fill = PatternFill(start_color="FFFFFF", end_color="FFFFFF", fill_type="solid")
    gray_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")

    # Fonts: white text only where the background is dark enough to show it.
    key_header_font = Font(bold=True, color="FFFFFF")
    other_header_font = Font(bold=True, color="000000")
    normal_font = Font(color="000000")

    center_alignment = Alignment(horizontal="center", vertical="center")
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin')
    )

    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        # Write the raw data first, then format the resulting worksheet.
        df.to_excel(writer, sheet_name='公司資料', index=False)
        worksheet = writer.sheets['公司資料']

        # Fixed widths for the first four sheet columns.
        for letter, width in zip("ABCD", (8, 12, 25, 40)):
            worksheet.column_dimensions[letter].width = width

        # Header row (row 1).
        for col_num, col_name in enumerate(df.columns, 1):
            cell = worksheet.cell(row=1, column=col_num)
            cell.alignment = center_alignment
            cell.border = thin_border
            if col_name in key_columns:
                cell.font = key_header_font
                cell.fill = blue_fill
            else:
                cell.font = other_header_font
                cell.fill = gray_fill

        # Data rows start at sheet row 2.
        for row_num in range(2, len(df) + 2):
            stripe_fill = yellow_fill if (row_num - 2) % 2 == 0 else white_fill
            for col_num, col_name in enumerate(df.columns, 1):
                cell = worksheet.cell(row=row_num, column=col_num)
                cell.font = normal_font
                cell.alignment = center_alignment
                cell.border = thin_border
                # Only the key columns are striped; others keep the default fill.
                if col_name in key_columns:
                    cell.fill = stripe_fill

    # The workbook is flushed into the buffer when the writer context exits.
    return output.getvalue()
|
178 |
+
|
179 |
+
def save_to_csv(df: pd.DataFrame) -> str:
    """Serialise ``df`` to CSV text that spreadsheet tools read as UTF-8.

    ``DataFrame.to_csv`` ignores ``encoding`` when it returns a string instead
    of writing to a file, so asking for ``utf-8-sig`` there does not produce a
    byte-order mark. The BOM character is therefore prepended explicitly; once
    the text is encoded as UTF-8 it becomes the ``EF BB BF`` signature that
    lets Excel detect the encoding of the Chinese text.

    Args:
        df: Frame to export; the index is not written.

    Returns:
        CSV text beginning with a single U+FEFF character.
    """
    return "\ufeff" + df.to_csv(index=False)
|
182 |
+
|
183 |
+
def main():
    """Render the Streamlit page: sidebar parameters, scrape trigger, results.

    Clicking the start button runs ``collect_all`` with the sidebar values and
    stores the outcome in ``st.session_state``. The preview, download buttons
    and statistics are rendered from that stored outcome on every script run.
    This matters because clicking a download button makes Streamlit rerun the
    script with the start button reset to ``False``; without the stored
    outcome the results and the remaining download button would disappear.
    """
    result_key = "scrape_result"

    st.set_page_config(
        page_title="台灣證交所公司資料爬取工具",
        page_icon="🏢",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.title("🏢 台灣證交所公司資料爬取工具")
    st.markdown("這個工具可以幫您從台灣證交所網站爬取上市公司資料,並提供CSV或Excel格式下載。")

    # Sidebar: scraping parameters and usage notes.
    with st.sidebar:
        st.header("⚙️ 參數設定")

        start_offset = st.number_input(
            "起始偏移量",
            min_value=0,
            value=0,
            step=1,
            help="從第幾筆資料開始爬取"
        )

        max_per = st.slider(
            "每頁筆數",
            min_value=1,
            max_value=100,
            value=30,
            step=1,
            help="每次請求爬取的資料筆數"
        )

        max_pages = st.slider(
            "最大頁數",
            min_value=1,
            max_value=100,
            value=50,
            step=1,
            help="最多爬取幾頁資料"
        )

        output_format = st.radio(
            "輸出格式",
            options=["CSV", "Excel", "兩者都要"],
            index=1,
            help="選擇要下載的檔案格式"
        )

        st.markdown("---")

        # Usage notes
        with st.expander("📖 使用說明"):
            st.markdown("""
            ### 參數說明:
            - **起始偏移量**:從第幾筆資料開始爬取,通常設為0
            - **每頁筆數**:每次API請求的資料筆數,建議30-50
            - **最大頁數**:最多爬取幾頁,避免設定太大導致執行時間過長
            - **輸出格式**:
              - CSV:純文字格式,適合後續程式處理
              - Excel:包含樣式格式的Excel檔案
              - 兩者都要:同時產生CSV和Excel檔案

            ### 注意事項:
            - 爬取過程中請勿關閉瀏覽器
            - 建議先用較小的參數測試
            - 檔案會自動加上時間戳記避免重複
            """)

    # Main area: the start button sits in the narrower right-hand column.
    _, action_col = st.columns([2, 1])

    with action_col:
        start_scraping = st.button("🚀 開始爬取", type="primary", use_container_width=True)

    if start_scraping:
        # Defensive validation; the widgets above already constrain these values.
        if start_offset < 0:
            st.error("起始偏移量不能小於0")
            return
        if max_per <= 0 or max_per > 100:
            st.error("每頁筆數必須在1-100之間")
            return
        if max_pages <= 0 or max_pages > 1000:
            st.error("最大頁數必須在1-1000之間")
            return

        try:
            progress_bar = st.progress(0)
            status_text = st.empty()

            status_text.text("開始爬取資料...")
            df = collect_all(start_offset, max_per, max_pages, progress_bar, status_text)

            if df.empty:
                # Drop any earlier outcome so stale results are not shown.
                st.session_state.pop(result_key, None)
                st.warning("未爬取到任何資料")
                return

            progress_bar.progress(1.0)
            status_text.text(f"✅ 成功爬取 {len(df)} 筆公司資料!")

            # Keep the outcome, together with the parameters that produced
            # it, so it survives the reruns triggered by later widget clicks.
            st.session_state[result_key] = {
                "df": df,
                "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
                "max_per": max_per,
                "max_pages": max_pages,
            }
        except Exception as e:
            st.error(f"❌ 爬取過程中發生錯誤:{str(e)}")
            return

    result = st.session_state.get(result_key)
    if result is None:
        return

    df = result["df"]
    timestamp = result["timestamp"]

    try:
        # Preview
        st.subheader("📊 資料預覽(前10筆)")
        st.dataframe(df.head(10), use_container_width=True)

        # Downloads
        st.subheader("📁 檔案下載")

        download_col1, download_col2 = st.columns(2)

        if output_format in ["CSV", "兩者都要"]:
            csv_data = save_to_csv(df)
            with download_col1:
                st.download_button(
                    label="⬇️ 下載 CSV 檔案",
                    data=csv_data,
                    file_name=f"companies_{timestamp}.csv",
                    mime="text/csv",
                    use_container_width=True
                )

        if output_format in ["Excel", "兩者都要"]:
            excel_data = save_to_excel(df)
            with download_col2:
                st.download_button(
                    label="⬇️ 下載 Excel 檔案",
                    data=excel_data,
                    file_name=f"companies_styled_{timestamp}.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    use_container_width=True
                )

        # Statistics
        st.subheader("📈 統計資訊")
        stat_col1, stat_col2, stat_col3 = st.columns(3)

        with stat_col1:
            st.metric("總公司數量", len(df))

        with stat_col2:
            st.metric("有網址的公司", len(df[df['公司網址'] != '']))

        with stat_col3:
            # Estimated from the row count using the parameters of the scrape
            # that produced this result, not the current sidebar values.
            st.metric("執行頁數", min(result["max_pages"], (len(df) // result["max_per"]) + 1))

    except Exception as e:
        st.error(f"❌ 爬取過程中發生錯誤:{str(e)}")
|
336 |
+
|
337 |
+
# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()
|