Commit · 2e2b497
1 Parent(s): bfc4267

Update data/scrape_real_news.py

data/scrape_real_news.py  CHANGED  (+72 -29)
@@ -1,17 +1,20 @@
-import
+import re
 import time
+import json
+import random
+import hashlib
 import logging
+import requests
 import pandas as pd
 from pathlib import Path
-from datetime import datetime, timedelta
-from typing import List, Dict, Optional, Tuple
 from newspaper import Article, build
-import
-import json
+from datetime import datetime, timedelta
 from urllib.parse import urljoin, urlparse
-import
+from typing import List, Dict, Optional, Tuple
+from data.validation_schemas import ValidationLevel
+from data.data_validator import DataValidationPipeline
 from concurrent.futures import ThreadPoolExecutor, as_completed
-
+
 
 # Configure logging
 logging.basicConfig(
@@ -324,40 +327,42 @@ class RobustNewsScraper:
         return all_articles
 
     def save_scraped_articles(self, articles: List[Dict]) -> bool:
-        """Save scraped articles
+        """Save scraped articles with validation"""
        try:
            if not articles:
-                logger.info("No articles to save")
                return True
 
-            #
-
+            # Validate articles first
+            valid_articles, validation_summary = self.validate_scraped_articles(articles)
 
-
+            logger.info(f"Validation: {len(valid_articles)}/{len(articles)} articles passed validation")
+
+            if not valid_articles:
+                logger.warning("No valid articles to save after validation")
+                return True
+
+            # Create DataFrame and save
+            df_new = pd.DataFrame(valid_articles)
+
+            # Existing file handling logic...
            if self.output_path.exists():
-
-
-
-
-                # Remove duplicates based on URL
-                df_combined = df_combined.drop_duplicates(subset=['url'], keep='last')
-
-                logger.info(f"Combined with existing data. Total: {len(df_combined)} articles")
-
-            except Exception as e:
-                logger.warning(f"Failed to load existing data: {e}")
-                df_combined = df_new
+                df_existing = pd.read_csv(self.output_path)
+                df_combined = pd.concat([df_existing, df_new], ignore_index=True)
+                df_combined = df_combined.drop_duplicates(subset=['text'], keep='first')
            else:
                df_combined = df_new
 
-            # Save to CSV
            df_combined.to_csv(self.output_path, index=False)
 
-
-
+            # Save validation report
+            validation_report_path = self.data_dir / "scraping_validation_report.json"
+            with open(validation_report_path, 'w') as f:
+                json.dump(validation_summary, f, indent=2)
 
+            logger.info(f"Saved {len(valid_articles)} validated articles to {self.output_path}")
+            return True
        except Exception as e:
-            logger.error(f"Failed to save articles: {
+            logger.error(f"Failed to save validated articles: {e}")
            return False
 
    def generate_scraping_metadata(self, articles: List[Dict]) -> Dict:
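Note on the hunk above: besides routing saves through the new validation step, it also changes deduplication from URL-based (subset=['url'], keep='last') to text-based (subset=['text'], keep='first'). A minimal sketch with made-up rows (not part of the commit) showing that the two strategies keep different records:

import pandas as pd

# Toy frame: two URLs that scraped the same body, plus a re-scrape of one URL with new text.
df = pd.DataFrame([
    {"url": "https://example.com/a", "text": "same body"},
    {"url": "https://example.com/b", "text": "same body"},
    {"url": "https://example.com/a", "text": "updated body"},
])

by_url = df.drop_duplicates(subset=["url"], keep="last")     # previous behaviour: keeps rows 2 and 3
by_text = df.drop_duplicates(subset=["text"], keep="first")  # new behaviour: keeps rows 1 and 3
print(by_url)
print(by_text)

Text-based dedup collapses the same article syndicated under different URLs, at the cost of keeping both versions when a URL is re-scraped with changed text.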
@@ -435,7 +440,45 @@ class RobustNewsScraper:
             error_msg = f"Scraping process failed: {str(e)}"
             logger.error(error_msg)
             return False, error_msg
-
+
+    def validate_scraped_articles(self, articles: List[Dict]) -> Tuple[List[Dict], Dict]:
+        """Validate scraped articles using validation schemas"""
+        if not articles:
+            return articles, {}
+
+        validator = DataValidationPipeline()
+
+        # Ensure required fields for validation
+        enhanced_articles = []
+        for article in articles:
+            enhanced_article = article.copy()
+            if 'source' not in enhanced_article:
+                enhanced_article['source'] = 'scraped_real'
+            if 'label' not in enhanced_article:
+                enhanced_article['label'] = 0  # Real news
+            enhanced_articles.append(enhanced_article)
+
+        # Validate batch
+        validation_result = validator.validate_scraped_data(enhanced_articles, "web_scraping")
+
+        # Filter valid articles
+        valid_articles = []
+        for i, result in enumerate(validation_result.validation_results):
+            if result.is_valid:
+                article = enhanced_articles[i].copy()
+                article['validation_quality_score'] = result.quality_metrics.get('overall_quality_score', 0.0)
+                valid_articles.append(article)
+
+        validation_summary = {
+            'original_count': len(articles),
+            'valid_count': len(valid_articles),
+            'success_rate': validation_result.success_rate,
+            'overall_quality_score': validation_result.overall_quality_score
+        }
+
+        return valid_articles, validation_summary
+
+
 def scrape_articles():
     """Main function for external calls"""
     scraper = RobustNewsScraper()
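The new validate_scraped_articles method assumes that DataValidationPipeline.validate_scraped_data(articles, source) returns an object exposing validation_results (each item with is_valid and a quality_metrics dict), success_rate, and overall_quality_score. The real pipeline lives in data/data_validator.py and is not shown in this commit; the stub below is a hypothetical stand-in that only mimics that assumed interface, useful for exercising the new method in isolation:

from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class _StubResult:
    # Mirrors only the attributes the scraper reads (assumed interface).
    is_valid: bool
    quality_metrics: Dict[str, float] = field(default_factory=dict)

@dataclass
class _StubBatch:
    validation_results: List[_StubResult]
    success_rate: float
    overall_quality_score: float

class StubValidationPipeline:
    """Hypothetical stand-in for data.data_validator.DataValidationPipeline."""

    def validate_scraped_data(self, articles: List[Dict], source: str) -> _StubBatch:
        # Trivial rule for illustration: an article is valid if it has non-empty text.
        results = [
            _StubResult(is_valid=bool(a.get("text")), quality_metrics={"overall_quality_score": 0.9})
            for a in articles
        ]
        rate = sum(r.is_valid for r in results) / len(results) if results else 0.0
        return _StubBatch(results, rate, 0.9)

# Exercising the stub the same way validate_scraped_articles uses the real pipeline:
articles = [{"url": "https://example.com/a", "text": "Body of a real article."},
            {"url": "https://example.com/b", "text": ""}]
batch = StubValidationPipeline().validate_scraped_data(articles, "web_scraping")
kept = [a for a, r in zip(articles, batch.validation_results) if r.is_valid]
print(len(kept), batch.success_rate)  # -> 1 0.5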
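For reference, the new save path also writes data/scraping_validation_report.json from validation_summary. Given the keys built in validate_scraped_articles, the report should look roughly like this (the numbers below are illustrative, not from a real run):

import json

example_report = {
    "original_count": 50,           # articles passed in
    "valid_count": 47,              # articles that passed validation
    "success_rate": 0.94,           # from validation_result.success_rate
    "overall_quality_score": 0.86,  # from validation_result.overall_quality_score
}
print(json.dumps(example_report, indent=2))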