from newspaper import Article, build
from pathlib import Path
import pandas as pd
import datetime

# Output path
BASE_DIR = Path(__file__).resolve().parent
# OUTPUT_PATH = BASE_DIR / "scraped_real.csv"
OUTPUT_PATH = Path("/tmp/scraped_real.csv")

# News sources
NEWS_SITES = [
    ("Reuters", "https://www.reuters.com/"),
    ("BBC", "https://www.bbc.com/news"),
    ("NPR", "https://www.npr.org/")
]

MAX_ARTICLES = 15


def scrape_articles():
    all_articles = []
    total_scraped = 0

    for source_name, url in NEWS_SITES:
        print(f"📡 Scraping from {source_name}...")
        paper = build(url, memoize_articles=False)

        for article in paper.articles:
            if total_scraped >= MAX_ARTICLES:
                break
            try:
                article.download()
                article.parse()
                if len(article.text.strip()) < 100:
                    continue  # Skip very short ones
                text = article.title + ". " + article.text
                all_articles.append({
                    "text": text,
                    "label": 0,
                    "source": source_name,
                    "timestamp": datetime.datetime.now().isoformat()
                })
                total_scraped += 1
            except Exception:
                continue  # Skip failed downloads

        if total_scraped >= MAX_ARTICLES:
            break  # Stop scraping once target reached

    if all_articles:
        df = pd.DataFrame(all_articles)
        if OUTPUT_PATH.exists():
            df_existing = pd.read_csv(OUTPUT_PATH)
            df = pd.concat([df_existing, df], ignore_index=True)
        df.to_csv(OUTPUT_PATH, index=False)
        print(f"✅ Scraped and saved {len(all_articles)} new articles.")
    else:
        print("⚠️ No articles scraped.")


if __name__ == "__main__":
    scrape_articles()