Spaces:

taeyeol
/

news

Sleeping

File size: 6,460 Bytes

8a0aec6

# -*- coding: utf-8 -*-
import html
import os
import tempfile
from io import BytesIO
from urllib.parse import quote

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Seconds to wait for Naver before giving up, so the UI never hangs forever.
_REQUEST_TIMEOUT_SECONDS = 10

# Column order shared by the result dicts, the HTML table and the Excel sheet.
_NEWS_COLUMNS = ["신문사", "발행일", "제목", "뉴스간략정보", "링크"]

_TABLE_HEAD = """
    <table border="1" style="border-collapse: collapse; width: 100%;">
      <thead>
        <tr>
          <th style="padding: 5px;">신문사</th>
          <th style="padding: 5px;">발행일</th>
          <th style="padding: 5px;">제목</th>
          <th style="padding: 5px;">뉴스간략정보</th>
          <th style="padding: 5px;">링크</th>
        </tr>
      </thead>
      <tbody>
    """

_TABLE_TAIL = """
      </tbody>
    </table>
    """


def _parse_news_item(news):
    """Extract one article's fields from a ``div.news_area`` tag.

    Every lookup is guarded with an explicit ``None`` check instead of a bare
    ``except``; a missing element yields the placeholder text for that field.
    """
    press_elem = news.select_one(".info.press")
    press = press_elem.get_text(strip=True) if press_elem is not None else "확인불가"

    # The last ``.info`` inside ``.info_group`` is taken as the date. It may
    # hold other text (e.g. a "네이버뉴스" label); no further filtering is done.
    info_group = news.select_one(".info_group")
    info_all = info_group.select(".info") if info_group is not None else []
    date = info_all[-1].get_text(strip=True) if info_all else "확인불가"

    title_elem = news.select_one(".news_tit")
    if title_elem is None:
        title = "제목확인불가"
        link = ""
    else:
        # Prefer the ``title`` attribute; fall back to the anchor text when
        # the attribute is absent or empty.
        title = title_elem.get("title", "").strip() or title_elem.get_text(strip=True)
        link = title_elem.get("href", "").strip()

    desc_elem = news.select_one(".news_dsc .api_txt_lines")
    desc = desc_elem.get_text(strip=True) if desc_elem is not None else "내용확인불가"

    return {
        "신문사": press,
        "발행일": date,
        "제목": title,
        "뉴스간략정보": desc,
        "링크": link,
    }


def _build_table_html(results):
    """Render the article dicts as an HTML table, escaping all scraped text."""
    rows = []
    for row in results:
        cells = "".join(
            f"<td style='padding: 5px;'>{html.escape(row[column])}</td>"
            for column in _NEWS_COLUMNS[:-1]
        )
        # quote=True also escapes ' and ", so the value cannot break out of
        # the single-quoted href attribute.
        href = html.escape(row["링크"], quote=True)
        rows.append(
            f"<tr>{cells}"
            f"<td style='padding: 5px;'><a href='{href}' target='_blank'>바로가기</a></td>"
            "</tr>"
        )
    return _TABLE_HEAD + "".join(rows) + _TABLE_TAIL


def _write_excel_tempfile(results):
    """Write the article dicts to a temporary ``.xlsx`` file and return its path."""
    # Passing the columns explicitly keeps the header row even with no results.
    df = pd.DataFrame(results, columns=_NEWS_COLUMNS)
    buffer = BytesIO()
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="네이버뉴스")

    # delete=False: the file must outlive this function so it can be downloaded.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp.write(buffer.getvalue())
        return tmp.name


def scrap_naver_news(keyword):
    """Scrape Naver news search results for *keyword*.

    Steps:
      1. Build the Naver news search URL (keyword is percent-encoded).
      2. Fetch the HTML with ``requests``.
      3. Parse every ``div.news_area`` with BeautifulSoup.
      4. Render the articles as an HTML table (scraped text is escaped).
      5. Save the articles to a temporary ``.xlsx`` file for download.

    Args:
        keyword: Search keyword. ``None`` or blank input is treated as an error.

    Returns:
        A tuple ``(table_html, tmp_path, debug_msgs)``. On failure (blank
        keyword, network error, or non-200 response) the tuple is
        ``("오류 발생", None, debug_msgs)``.
    """
    debug_msgs = []

    keyword = (keyword or "").strip()
    if not keyword:
        debug_msgs.append("[오류] 검색 키워드가 비어 있습니다.")
        return "오류 발생", None, debug_msgs

    base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
    # Percent-encode so spaces, '&', '#', etc. cannot corrupt the query string.
    target_url = base_url + quote(keyword, safe="")
    debug_msgs.append(f"[디버그] 요청 URL: {target_url}")

    try:
        response = requests.get(target_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Report network failures through the debug log instead of crashing.
        debug_msgs.append(f"[오류] 네이버 뉴스 요청 실패: {exc}")
        return "오류 발생", None, debug_msgs

    if response.status_code != 200:
        debug_msgs.append(f"[오류] 네이버 뉴스 검색 실패 (응답 코드: {response.status_code})")
        return "오류 발생", None, debug_msgs

    soup = BeautifulSoup(response.text, "html.parser")

    # Each article's information lives inside a div.news_area element.
    news_list = soup.select("div.news_area")
    debug_msgs.append(f"[디버그] news_area 추출 개수: {len(news_list)}")

    results = []
    for idx, news in enumerate(news_list, start=1):
        item = _parse_news_item(news)
        debug_msgs.append(
            f"[디버그] {idx}번째 기사 파싱결과 -> 신문사: {item['신문사']}, "
            f"발행일: {item['발행일']}, 제목: {item['제목']}, 링크: {item['링크']}"
        )
        results.append(item)

    table_html = _build_table_html(results)
    tmp_path = _write_excel_tempfile(results)
    debug_msgs.append(f"[디버그] 엑셀 임시파일 생성 완료 -> {tmp_path}")

    return table_html, tmp_path, debug_msgs

def run_search(keyword):
    """Gradio click handler: scrape news for *keyword* and shape the outputs.

    Returns a 3-tuple matching the UI outputs:
      - the HTML string to display,
      - the path of the Excel file to offer for download (``None`` when the
        scrape failed),
      - the debug messages joined into one newline-separated string.
    """
    rendered, xlsx_path, log_lines = scrap_naver_news(keyword)
    debug_text = "\n".join(log_lines)

    # xlsx_path is passed through unchanged; it is None when scraping failed.
    return rendered, xlsx_path, debug_text

def launch_app():
    """Build the Gradio Blocks UI for the news scraper and launch it.

    The page holds a keyword textbox, a start button, an HTML area for the
    result table, a file component for the Excel download, and a textbox for
    the debug log. Clicking the button runs ``run_search``.
    """
    with gr.Blocks() as app:
        gr.Markdown("## 네이버 뉴스 스크래핑 예시")

        # Keyword entry on its own row.
        with gr.Row():
            query_box = gr.Textbox(
                label="검색 키워드 입력",
                placeholder="예: 오징어",
            )

        # Trigger button on the next row.
        with gr.Row():
            start_btn = gr.Button("스크래핑 시작")

        # Output components, in the order run_search returns its values.
        table_view = gr.HTML(label="스크래핑 결과 (표 형식)")
        excel_file = gr.File(label="엑셀 다운로드")
        log_box = gr.Textbox(label="디버그 로그", lines=10)
        output_components = [table_view, excel_file, log_box]

        # Wire the button to the scraping handler.
        start_btn.click(
            fn=run_search,
            inputs=[query_box],
            outputs=output_components,
        )

    app.launch()

# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    launch_app()