|
|
|
import html
import os
import tempfile
from io import BytesIO
from urllib.parse import quote

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
|
# Naver News search endpoint; the URL-encoded keyword is appended as the
# value of the trailing ``query=`` parameter.
_NAVER_NEWS_SEARCH_URL = (
    "https://search.naver.com/search.naver"
    "?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
)

# Seconds to wait for the HTTP response so a stalled connection cannot hang
# the UI callback forever.
_REQUEST_TIMEOUT = 10

# Column names, in display order, shared by the HTML table and the Excel sheet.
_COLUMNS = ("신문사", "발행일", "제목", "뉴스간략정보", "링크")

# Placeholder used when the press name or the date cannot be found.
_UNKNOWN = "확인불가"

# First element of the returned tuple whenever scraping cannot proceed.
_ERROR_HTML = "오류 발생"


def _parse_news_item(news):
    """Extract one article's fields from a ``div.news_area`` element.

    Missing elements yield placeholder values instead of raising, so a
    single malformed article never aborts the whole scrape.
    """
    press_elem = news.select_one(".info.press")
    press = press_elem.get_text(strip=True) if press_elem is not None else _UNKNOWN

    # The date is read from the last ``.info`` entry inside ``.info_group``.
    info_group = news.select_one(".info_group")
    info_all = info_group.select(".info") if info_group is not None else []
    date = info_all[-1].get_text(strip=True) if info_all else _UNKNOWN

    title_elem = news.select_one(".news_tit")
    if title_elem is not None:
        title = title_elem.get("title", "").strip()
        link = title_elem.get("href", "").strip()
    else:
        title, link = "제목확인불가", ""

    desc_elem = news.select_one(".news_dsc .api_txt_lines")
    desc = desc_elem.get_text(strip=True) if desc_elem is not None else "내용확인불가"

    return {
        "신문사": press,
        "발행일": date,
        "제목": title,
        "뉴스간략정보": desc,
        "링크": link,
    }


def _build_table_html(rows):
    """Render the article rows as an HTML table, escaping all scraped text."""
    header_cells = "".join(
        f'<th style="padding: 5px;">{name}</th>' for name in _COLUMNS
    )

    body_rows = []
    for row in rows:
        # Scraped content is untrusted: escape it so it cannot inject markup.
        text_cells = "".join(
            f"<td style='padding: 5px;'>{html.escape(row[name])}</td>"
            for name in _COLUMNS[:-1]
        )
        # quote=True also escapes quotes, so the URL cannot break out of the
        # single-quoted href attribute.
        href = html.escape(row["링크"], quote=True)
        link_cell = (
            "<td style='padding: 5px;'>"
            f"<a href='{href}' target='_blank'>바로가기</a></td>"
        )
        body_rows.append(f"<tr>{text_cells}{link_cell}</tr>")

    return (
        '<table border="1" style="border-collapse: collapse; width: 100%;">\n'
        f"<thead>\n<tr>{header_cells}</tr>\n</thead>\n"
        "<tbody>\n" + "\n".join(body_rows) + "\n</tbody>\n"
        "</table>\n"
    )


def _write_excel_tempfile(rows):
    """Write the rows to a temporary ``.xlsx`` file and return its path.

    The file is created with ``delete=False`` so it still exists when the
    caller hands the path to the download component.
    """
    # Passing the columns explicitly keeps the header row even with no rows.
    frame = pd.DataFrame(rows, columns=list(_COLUMNS))
    buffer = BytesIO()
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        frame.to_excel(writer, index=False, sheet_name="네이버뉴스")

    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp.write(buffer.getvalue())
        return tmp.name


def scrap_naver_news(keyword):
    """Scrape Naver News search results for *keyword*.

    Steps:
      1) Build the Naver News search URL from the (URL-encoded) keyword.
      2) Download the HTML with ``requests``.
      3) Parse it with BeautifulSoup and collect each article's fields.
      4) Render the articles as an HTML table.
      5) Save the same data to a temporary Excel file for download.

    Args:
        keyword: Search keyword. ``None`` or a blank string is rejected.

    Returns:
        A ``(table_html, file_path, debug_msgs)`` tuple. On success
        ``table_html`` is the rendered table and ``file_path`` is the path of
        the temporary ``.xlsx`` file. On failure (blank keyword, network
        error, non-200 response) ``table_html`` is a short error string and
        ``file_path`` is ``None``. ``debug_msgs`` is always a list of log
        lines describing what happened.
    """
    debug_msgs = []

    keyword = (keyword or "").strip()
    if not keyword:
        debug_msgs.append("[오류] 검색 키워드가 비어 있습니다.")
        return _ERROR_HTML, None, debug_msgs

    # Percent-encode the keyword so spaces, '&', '#', and non-ASCII text
    # cannot corrupt the query string.
    target_url = _NAVER_NEWS_SEARCH_URL + quote(keyword, safe="")
    debug_msgs.append(f"[디버그] 요청 URL: {target_url}")

    try:
        response = requests.get(target_url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure
        # instead of propagating into the UI callback.
        debug_msgs.append(f"[오류] 네이버 뉴스 검색 실패 ({exc})")
        return _ERROR_HTML, None, debug_msgs

    if response.status_code != 200:
        debug_msgs.append(
            f"[오류] 네이버 뉴스 검색 실패 (응답 코드: {response.status_code})"
        )
        return _ERROR_HTML, None, debug_msgs

    soup = BeautifulSoup(response.text, "html.parser")
    news_list = soup.select("div.news_area")
    debug_msgs.append(f"[디버그] news_area 추출 개수: {len(news_list)}")

    results = []
    for idx, news in enumerate(news_list, start=1):
        item = _parse_news_item(news)
        debug_msgs.append(
            f"[디버그] {idx}번째 기사 파싱결과 -> 신문사: {item['신문사']}, "
            f"발행일: {item['발행일']}, 제목: {item['제목']}, 링크: {item['링크']}"
        )
        results.append(item)

    table_html = _build_table_html(results)
    tmp_path = _write_excel_tempfile(results)
    debug_msgs.append(f"[디버그] 엑셀 임시파일 생성 완료 -> {tmp_path}")

    return table_html, tmp_path, debug_msgs
|
|
|
def run_search(keyword):
    """Callback invoked by the Gradio interface.

    Runs the scraper for *keyword* and returns, in order:
      - the HTML output,
      - the path of the downloadable file (may be ``None``),
      - the debug messages joined into one newline-separated string.
    """
    html_output, excel_path, debug_lines = scrap_naver_news(keyword)
    debug_text = "\n".join(debug_lines)

    # The file path is forwarded unchanged, whether it is None or a real path.
    return html_output, excel_path, debug_text
|
|
|
def launch_app():
    """Build the Gradio UI and start the app.

    The layout has a keyword textbox, a search button, an HTML area for the
    result table, a file component for the Excel download, and a textbox for
    the debug log. Clicking the button calls ``run_search`` with the keyword
    and routes its three return values to those three output components.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## 네이버 뉴스 스크래핑 예제")

        with gr.Row():
            keyword_input = gr.Textbox(
                label="검색 키워드 입력",
                placeholder="예: 오징어",
            )

        with gr.Row():
            search_button = gr.Button("스크래핑 시작")

        # Scraping result rendered as an HTML table.
        result_html = gr.HTML(label="스크래핑 결과 (표 형식)")

        # Excel file offered for download.
        download_file = gr.File(label="엑셀 다운로드")

        # Debug log lines, one per row.
        debug_box = gr.Textbox(label="디버그 로그", lines=10)

        # Output order must match the tuple returned by run_search.
        search_button.click(
            fn=run_search,
            inputs=[keyword_input],
            outputs=[result_html, download_file, debug_box],
        )

    demo.launch()
|
|
|
# Start the Gradio app only when this file is run as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    launch_app()