Ahmedik95316 committed
Commit 3f77053 · 1 Parent(s): 8f6f5f7

Update data/scrape_real_news.py


Renamed the `scrape_articles` function to `scrape_real_articles`
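Since only the definition changes, any caller still using the old name will raise a `NameError` at call time. A minimal sketch of the matching caller update, assuming the module is imported as `data.scrape_real_news` (the import path is an assumption):

```python
# Hypothetical caller update after the rename (import path assumed)
from data.scrape_real_news import scrape_real_articles  # was: scrape_articles

scrape_real_articles()  # same behavior, new name
```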

Files changed (1)
  1. data/scrape_real_news.py +67 -67
data/scrape_real_news.py CHANGED
@@ -1,67 +1,67 @@
-from newspaper import Article, build
-from pathlib import Path
-import pandas as pd
-import datetime
-
-# Output path
-BASE_DIR = Path(__file__).resolve().parent
-OUTPUT_PATH = BASE_DIR / "scraped_real.csv"
-
-# News sources
-NEWS_SITES = [
-    ("Reuters", "https://www.reuters.com/"),
-    ("BBC", "https://www.bbc.com/news"),
-    ("NPR", "https://www.npr.org/")
-]
-
-MAX_ARTICLES = 15
-
-def scrape_articles():
-    all_articles = []
-    total_scraped = 0
-
-    for source_name, url in NEWS_SITES:
-        print(f"📡 Scraping from {source_name}...")
-        paper = build(url, memoize_articles=False)
-
-        for article in paper.articles:
-            if total_scraped >= MAX_ARTICLES:
-                break
-
-            try:
-                article.download()
-                article.parse()
-
-                if len(article.text.strip()) < 100:
-                    continue  # Skip very short ones
-
-                text = article.title + ". " + article.text
-                all_articles.append({
-                    "text": text,
-                    "label": 0,
-                    "source": source_name,
-                    "timestamp": datetime.datetime.now().isoformat()
-                })
-
-                total_scraped += 1
-
-            except Exception:
-                continue  # Skip failed downloads
-
-        if total_scraped >= MAX_ARTICLES:
-            break  # Stop scraping once target reached
-
-    if all_articles:
-        df = pd.DataFrame(all_articles)
-
-        if OUTPUT_PATH.exists():
-            df_existing = pd.read_csv(OUTPUT_PATH)
-            df = pd.concat([df_existing, df], ignore_index=True)
-
-        df.to_csv(OUTPUT_PATH, index=False)
-        print(f"✅ Scraped and saved {len(all_articles)} new articles.")
-    else:
-        print("⚠️ No articles scraped.")
-
-if __name__ == "__main__":
-    scrape_articles()
+from newspaper import Article, build
+from pathlib import Path
+import pandas as pd
+import datetime
+
+# Output path
+BASE_DIR = Path(__file__).resolve().parent
+OUTPUT_PATH = BASE_DIR / "scraped_real.csv"
+
+# News sources
+NEWS_SITES = [
+    ("Reuters", "https://www.reuters.com/"),
+    ("BBC", "https://www.bbc.com/news"),
+    ("NPR", "https://www.npr.org/")
+]
+
+MAX_ARTICLES = 15
+
+def scrape_real_articles():
+    all_articles = []
+    total_scraped = 0
+
+    for source_name, url in NEWS_SITES:
+        print(f"📡 Scraping from {source_name}...")
+        paper = build(url, memoize_articles=False)
+
+        for article in paper.articles:
+            if total_scraped >= MAX_ARTICLES:
+                break
+
+            try:
+                article.download()
+                article.parse()
+
+                if len(article.text.strip()) < 100:
+                    continue  # Skip very short ones
+
+                text = article.title + ". " + article.text
+                all_articles.append({
+                    "text": text,
+                    "label": 0,
+                    "source": source_name,
+                    "timestamp": datetime.datetime.now().isoformat()
+                })
+
+                total_scraped += 1
+
+            except Exception:
+                continue  # Skip failed downloads
+
+        if total_scraped >= MAX_ARTICLES:
+            break  # Stop scraping once target reached
+
+    if all_articles:
+        df = pd.DataFrame(all_articles)
+
+        if OUTPUT_PATH.exists():
+            df_existing = pd.read_csv(OUTPUT_PATH)
+            df = pd.concat([df_existing, df], ignore_index=True)
+
+        df.to_csv(OUTPUT_PATH, index=False)
+        print(f"✅ Scraped and saved {len(all_articles)} new articles.")
+    else:
+        print("⚠️ No articles scraped.")
+
+if __name__ == "__main__":
+    scrape_real_articles()
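For context, the script relies on `newspaper.build()` to discover article links from each homepage, while the `Article` class it also imports can fetch a single known URL directly. A minimal sketch of that direct path, assuming newspaper3k is installed (the URL is illustrative):

```python
from newspaper import Article

# Download and parse one known article URL (URL is illustrative)
article = Article("https://www.bbc.com/news/some-article")
article.download()
article.parse()

print(article.title)
print(article.text[:200])  # first 200 characters of body text
```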