# -*- coding: utf-8 -*-
import requests
import pandas as pd
from bs4 import BeautifulSoup
import gradio as gr
from io import BytesIO
import tempfile
import os
def scrap_naver_news(keyword):
"""
[๋””๋ฒ„๊น… ๋ฉ”์„ธ์ง€ ์ถ”๊ฐ€]
1) ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ๋กœ ๋„ค์ด๋ฒ„๋‰ด์Šค ๊ฒ€์ƒ‰URL ๊ตฌ์„ฑ
2) requests๋ฅผ ํ†ตํ•ด HTML ๋ฐ์ดํ„ฐ ์ˆ˜์‹ 
3) BeautifulSoup๋กœ ํŒŒ์‹ฑํ•˜์—ฌ ๊ธฐ์‚ฌ ์ •๋ณด ์ˆ˜์ง‘
4) HTML ํ‘œ๋กœ ์ •๋ฆฌ
5) BytesIO -> ์ž„์‹œํŒŒ์ผ๋กœ ์ €์žฅ -> ๋‹ค์šด๋กœ๋“œ ๊ฐ€๋Šฅํ•˜๋„๋ก ๋ฐ˜ํ™˜
"""
    debug_msgs = []

    base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
    target_url = base_url + keyword
    debug_msgs.append(f"[๋””๋ฒ„๊ทธ] ์š”์ฒญ URL: {target_url}")

    response = requests.get(target_url)
    if response.status_code != 200:
        debug_msgs.append(f"[์˜ค๋ฅ˜] ๋„ค์ด๋ฒ„ ๋‰ด์Šค ๊ฒ€์ƒ‰ ์‹คํŒจ (์‘๋‹ต ์ฝ”๋“œ: {response.status_code})")
        return "์˜ค๋ฅ˜ ๋ฐœ์ƒ", None, debug_msgs
    soup = BeautifulSoup(response.text, "html.parser")

    # The article information lives inside div.news_area (see the example markup in the request)
    news_list = soup.select("div.news_area")
    debug_msgs.append(f"[๋””๋ฒ„๊ทธ] news_area ์ถ”์ถœ ๊ฐœ์ˆ˜: {len(news_list)}")
    results = []
    for idx, news in enumerate(news_list):
        # Press (publisher)
        try:
            press = news.select_one(".info.press").get_text(strip=True)
        except Exception:
            press = "ํ™•์ธ๋ถˆ๊ฐ€"

        # Date / publication time
        # - Often appears in the form .info_group > .info:nth-last-child(1)
        # - If there are several date-like entries, the last (or a middle) one may be the right one
        try:
            info_group = news.select_one(".info_group")
            # There can be several .info tags, so it is possible to keep only values
            # such as '1์ผ ์ „' or '3์‹œ๊ฐ„ ์ „'. Here we simply take the last .info, which
            # may be the date or a label such as '๋„ค์ด๋ฒ„๋‰ด์Šค'; in practice the date sits
            # next to the press name, so the parsing may need to change with the layout.
            info_all = info_group.select(".info")
            date = info_all[-1].get_text(strip=True) if info_all else "ํ™•์ธ๋ถˆ๊ฐ€"
            # The extracted text may be something like '๋„ค์ด๋ฒ„๋‰ด์Šค' rather than a date,
            # in which case it could be re-processed; this simple example leaves it as is.
        except Exception:
            date = "ํ™•์ธ๋ถˆ๊ฐ€"
        # Title & link
        try:
            title_elem = news.select_one(".news_tit")
            title = title_elem.get("title", "").strip()
            link = title_elem.get("href", "").strip()
        except Exception:
            title = "์ œ๋ชฉํ™•์ธ๋ถˆ๊ฐ€"
            link = ""

        # Short article summary
        try:
            desc_elem = news.select_one(".news_dsc .api_txt_lines")
            desc = desc_elem.get_text(strip=True) if desc_elem else "๋‚ด์šฉํ™•์ธ๋ถˆ๊ฐ€"
        except Exception:
            desc = "๋‚ด์šฉํ™•์ธ๋ถˆ๊ฐ€"

        debug_msgs.append(
            f"[๋””๋ฒ„๊ทธ] {idx+1}๋ฒˆ์งธ ๊ธฐ์‚ฌ ํŒŒ์‹ฑ๊ฒฐ๊ณผ -> ์‹ ๋ฌธ์‚ฌ: {press}, ๋ฐœํ–‰์ผ: {date}, ์ œ๋ชฉ: {title}, ๋งํฌ: {link}"
        )

        results.append({
            "์‹ ๋ฌธ์‚ฌ": press,
            "๋ฐœํ–‰์ผ": date,
            "์ œ๋ชฉ": title,
            "๋‰ด์Šค๊ฐ„๋žต์ •๋ณด": desc,
            "๋งํฌ": link
        })
    # ------------------------------
    # Build the HTML table
    # ------------------------------
    table_html = """
    <table border="1" style="border-collapse: collapse; width: 100%;">
      <thead>
        <tr>
          <th style="padding: 5px;">์‹ ๋ฌธ์‚ฌ</th>
          <th style="padding: 5px;">๋ฐœํ–‰์ผ</th>
          <th style="padding: 5px;">์ œ๋ชฉ</th>
          <th style="padding: 5px;">๋‰ด์Šค๊ฐ„๋žต์ •๋ณด</th>
          <th style="padding: 5px;">๋งํฌ</th>
        </tr>
      </thead>
      <tbody>
    """
    for row in results:
        table_html += "<tr>"
        table_html += f"<td style='padding: 5px;'>{row['์‹ ๋ฌธ์‚ฌ']}</td>"
        table_html += f"<td style='padding: 5px;'>{row['๋ฐœํ–‰์ผ']}</td>"
        table_html += f"<td style='padding: 5px;'>{row['์ œ๋ชฉ']}</td>"
        table_html += f"<td style='padding: 5px;'>{row['๋‰ด์Šค๊ฐ„๋žต์ •๋ณด']}</td>"
        # Insert an <a> tag so the link is clickable
        table_html += f"<td style='padding: 5px;'><a href='{row['๋งํฌ']}' target='_blank'>๋ฐ”๋กœ๊ฐ€๊ธฐ</a></td>"
        table_html += "</tr>"
    table_html += """
      </tbody>
    </table>
    """
    # ------------------------------
    # Build the Excel file
    # ------------------------------
    df = pd.DataFrame(results)
    output_io = BytesIO()
    with pd.ExcelWriter(output_io, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="๋„ค์ด๋ฒ„๋‰ด์Šค")
    output_io.seek(0)  # Rewind the in-memory buffer before reading it back

    # Write the buffer to a temporary .xlsx file so it can be offered for download
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp.write(output_io.read())
        tmp_path = tmp.name
    debug_msgs.append(f"[๋””๋ฒ„๊ทธ] ์—‘์…€ ์ž„์‹œํŒŒ์ผ ์ƒ์„ฑ ์™„๋ฃŒ -> {tmp_path}")

    return table_html, tmp_path, debug_msgs
def run_search(keyword):
"""
Gradio ์ธํ„ฐํŽ˜์ด์Šค์—์„œ ํ˜ธ์ถœ๋˜๋Š” ํ•จ์ˆ˜
- HTML ์ถœ๋ ฅ
- ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ
- ๋””๋ฒ„๊ทธ ๋ฉ”์„ธ์ง€
"""
table_html, file_path, debug_info = scrap_naver_news(keyword)
if file_path is None:
return table_html, None, "\n".join(debug_info)
# Gradio์—์„œ ํŒŒ์ผ์„ ์—…๋ฐ์ดํŠธํ•  ๋•Œ๋Š” ๋ฐ˜ํ™˜๊ฐ’์œผ๋กœ ํŒŒ์ผ ๊ฒฝ๋กœ ์ง€์ •
return table_html, file_path, "\n".join(debug_info)
def launch_app():
    with gr.Blocks() as demo:
        gr.Markdown("## ๋„ค์ด๋ฒ„ ๋‰ด์Šค ์Šคํฌ๋ž˜ํ•‘ ์˜ˆ์‹œ")
        with gr.Row():
            keyword_input = gr.Textbox(label="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ", placeholder="์˜ˆ: ์˜ค์ง•์–ด")
        with gr.Row():
            search_button = gr.Button("์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘")

        # Result output (HTML table)
        result_html = gr.HTML(label="์Šคํฌ๋ž˜ํ•‘ ๊ฒฐ๊ณผ (ํ‘œ ํ˜•์‹)")
        # Excel download
        download_file = gr.File(label="์—‘์…€ ๋‹ค์šด๋กœ๋“œ")
        # Debug messages
        debug_box = gr.Textbox(label="๋””๋ฒ„๊ทธ ๋กœ๊ทธ", lines=10)

        # Wire the button to the scraping function
        search_button.click(
            fn=run_search,
            inputs=[keyword_input],
            outputs=[result_html, download_file, debug_box]
        )

    demo.launch()
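    # Note (assumption, not in the original file): when hosting outside a local machine,
    # launch() also accepts options such as server_name="0.0.0.0" and server_port=7860
    # to control the bind address and port.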
if __name__ == "__main__":
    launch_app()
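
# Usage sketch (assumption: a quick manual test without the Gradio UI). Calling the
# scraper directly returns the HTML table, the path of the generated .xlsx file, and
# the list of debug messages:
#
#     table_html, xlsx_path, logs = scrap_naver_news("์˜ค์ง•์–ด")
#     print("\n".join(logs))
#     print("Excel written to:", xlsx_path)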