Ahmedik95316 committed on
Commit 2e2b497 · 1 Parent(s): bfc4267

Update data/scrape_real_news.py

Files changed (1)
  1. data/scrape_real_news.py +72 -29
data/scrape_real_news.py CHANGED
@@ -1,17 +1,20 @@
-import requests
+import re
 import time
+import json
+import random
+import hashlib
 import logging
+import requests
 import pandas as pd
 from pathlib import Path
-from datetime import datetime, timedelta
-from typing import List, Dict, Optional, Tuple
 from newspaper import Article, build
-import hashlib
-import json
+from datetime import datetime, timedelta
 from urllib.parse import urljoin, urlparse
-import random
+from typing import List, Dict, Optional, Tuple
+from data.validation_schemas import ValidationLevel
+from data.data_validator import DataValidationPipeline
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import re
+

 # Configure logging
 logging.basicConfig(
@@ -324,40 +327,42 @@ class RobustNewsScraper:
         return all_articles

     def save_scraped_articles(self, articles: List[Dict]) -> bool:
-        """Save scraped articles to CSV with error handling"""
+        """Save scraped articles with validation"""
         try:
             if not articles:
-                logger.info("No articles to save")
                 return True

-            # Create DataFrame
-            df_new = pd.DataFrame(articles)
+            # Validate articles first
+            valid_articles, validation_summary = self.validate_scraped_articles(articles)

-            # Load existing data if present
+            logger.info(f"Validation: {len(valid_articles)}/{len(articles)} articles passed validation")
+
+            if not valid_articles:
+                logger.warning("No valid articles to save after validation")
+                return True
+
+            # Create DataFrame and save
+            df_new = pd.DataFrame(valid_articles)
+
+            # Existing file handling logic...
             if self.output_path.exists():
-                try:
-                    df_existing = pd.read_csv(self.output_path)
-                    df_combined = pd.concat([df_existing, df_new], ignore_index=True)
-
-                    # Remove duplicates based on URL
-                    df_combined = df_combined.drop_duplicates(subset=['url'], keep='last')
-
-                    logger.info(f"Combined with existing data. Total: {len(df_combined)} articles")
-
-                except Exception as e:
-                    logger.warning(f"Failed to load existing data: {e}")
-                    df_combined = df_new
+                df_existing = pd.read_csv(self.output_path)
+                df_combined = pd.concat([df_existing, df_new], ignore_index=True)
+                df_combined = df_combined.drop_duplicates(subset=['text'], keep='first')
             else:
                 df_combined = df_new

-            # Save to CSV
             df_combined.to_csv(self.output_path, index=False)

-            logger.info(f"Successfully saved {len(articles)} new articles to {self.output_path}")
-            return True
+            # Save validation report
+            validation_report_path = self.data_dir / "scraping_validation_report.json"
+            with open(validation_report_path, 'w') as f:
+                json.dump(validation_summary, f, indent=2)

+            logger.info(f"Saved {len(valid_articles)} validated articles to {self.output_path}")
+            return True
         except Exception as e:
-            logger.error(f"Failed to save articles: {str(e)}")
+            logger.error(f"Failed to save validated articles: {e}")
             return False

     def generate_scraping_metadata(self, articles: List[Dict]) -> Dict:
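In this hunk the duplicate key also changes, from 'url' with keep='last' to 'text' with keep='first': re-scraping the same URL no longer replaces the stored row, while identical texts arriving under different URLs now collapse to the first copy. A minimal pandas sketch of the difference, using made-up rows purely for illustration:

import pandas as pd

# Two rows already on disk and two freshly scraped rows.
existing = pd.DataFrame([
    {'url': 'https://example.com/a', 'text': 'Article A', 'label': 0},
    {'url': 'https://example.com/b', 'text': 'Article B', 'label': 0},
])
scraped = pd.DataFrame([
    {'url': 'https://example.com/a', 'text': 'Article A (updated)', 'label': 0},   # same URL, new text
    {'url': 'https://mirror.example.com/b', 'text': 'Article B', 'label': 0},      # new URL, same text
])
combined = pd.concat([existing, scraped], ignore_index=True)

old_behaviour = combined.drop_duplicates(subset=['url'], keep='last')    # re-scrapes replaced older rows
new_behaviour = combined.drop_duplicates(subset=['text'], keep='first')  # syndicated copies now collapse

print(old_behaviour)  # keeps the updated A, plus both B rows merged by URL
print(new_behaviour)  # keeps the original A, original B, and updated A (its text differs)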
@@ -435,7 +440,45 @@ class RobustNewsScraper:
             error_msg = f"Scraping process failed: {str(e)}"
             logger.error(error_msg)
             return False, error_msg
-
+
+    def validate_scraped_articles(self, articles: List[Dict]) -> Tuple[List[Dict], Dict]:
+        """Validate scraped articles using validation schemas"""
+        if not articles:
+            return articles, {}
+
+        validator = DataValidationPipeline()
+
+        # Ensure required fields for validation
+        enhanced_articles = []
+        for article in articles:
+            enhanced_article = article.copy()
+            if 'source' not in enhanced_article:
+                enhanced_article['source'] = 'scraped_real'
+            if 'label' not in enhanced_article:
+                enhanced_article['label'] = 0  # Real news
+            enhanced_articles.append(enhanced_article)
+
+        # Validate batch
+        validation_result = validator.validate_scraped_data(enhanced_articles, "web_scraping")
+
+        # Filter valid articles
+        valid_articles = []
+        for i, result in enumerate(validation_result.validation_results):
+            if result.is_valid:
+                article = enhanced_articles[i].copy()
+                article['validation_quality_score'] = result.quality_metrics.get('overall_quality_score', 0.0)
+                valid_articles.append(article)
+
+        validation_summary = {
+            'original_count': len(articles),
+            'valid_count': len(valid_articles),
+            'success_rate': validation_result.success_rate,
+            'overall_quality_score': validation_result.overall_quality_score
+        }
+
+        return valid_articles, validation_summary
+
+
 def scrape_articles():
     """Main function for external calls"""
     scraper = RobustNewsScraper()
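The new validate_scraped_articles() assumes DataValidationPipeline.validate_scraped_data() returns a batch result exposing validation_results (each entry with is_valid and quality_metrics), success_rate and overall_quality_score. A self-contained sketch of the filtering step under that assumed contract; FakeRecordResult and FakeBatchResult are hypothetical stand-ins, not classes from this repository:

from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class FakeRecordResult:
    # Stand-in for a per-article validation result.
    is_valid: bool
    quality_metrics: Dict[str, float] = field(default_factory=dict)

@dataclass
class FakeBatchResult:
    # Stand-in for the batch object validate_scraped_data() is assumed to return.
    validation_results: List[FakeRecordResult]
    success_rate: float
    overall_quality_score: float

def filter_valid(articles: List[Dict], batch: FakeBatchResult) -> List[Dict]:
    """Mirror of the filtering loop in the new validate_scraped_articles()."""
    valid = []
    for article, result in zip(articles, batch.validation_results):
        if result.is_valid:
            kept = article.copy()
            kept['validation_quality_score'] = result.quality_metrics.get('overall_quality_score', 0.0)
            valid.append(kept)
    return valid

# Example: two scraped articles, one failing validation.
articles = [
    {'text': 'Long factual article body...', 'url': 'https://example.com/a'},
    {'text': 'too short', 'url': 'https://example.com/b'},
]
batch = FakeBatchResult(
    validation_results=[
        FakeRecordResult(True, {'overall_quality_score': 0.91}),
        FakeRecordResult(False),
    ],
    success_rate=0.5,
    overall_quality_score=0.455,
)
print(filter_valid(articles, batch))  # only the first article survives, tagged with its quality score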
 
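For reference, the scraping_validation_report.json written by the new save path contains only the four summary keys assembled in validate_scraped_articles(). An illustrative dump with made-up numbers, not output from a real run:

import json

validation_summary = {
    'original_count': 25,          # articles handed to the validator
    'valid_count': 21,             # articles that passed validation
    'success_rate': 0.84,
    'overall_quality_score': 0.78,
}
print(json.dumps(validation_summary, indent=2))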