import os
from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup

from google_drive_handle import authenticate_google_drive

# Authenticate with Google Drive once at module load.
drive = authenticate_google_drive()

# Browser-style User-Agent so the target site is less likely to block requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def scrape_article(article_url):
    """Fetch one article page and return its title and body text."""
    response = requests.get(article_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else 'No Title'

    # The article body is expected inside <div id="article_holder">.
    content_div = soup.find('div', id='article_holder')
    if content_div:
        content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))
    else:
        content = 'Content not found'

    return {
        'Title': title,
        'Content': content
    }
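
# Quick sanity check for scrape_article. The URL below is a hypothetical
# placeholder; the selectors above (h1 for the title, div#article_holder for
# the body) must match the target site's markup.
# article = scrape_article('https://www.example.com/news/sample-article.html')
# print(article['Title'])
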

def scrape_category(category_url, num_articles):
    """Walk a category's paginated index, scrape up to num_articles articles,
    and save them to a CSV in the current working directory."""
    articles_scraped = 0
    all_articles = []
    page_num = 1

    # Derive CSV name parts from the URL, e.g. https://www.example.com/news
    # yields site_name 'example.com' and category_name 'news'.
    site_name = category_url.split('/')[2].replace('www.', '')
    category_name = category_url.split('/')[-1]

    while articles_scraped < num_articles:
        paginated_url = f"{category_url}/index.{page_num}.html"

        response = requests.get(paginated_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        article_links = soup.find_all('h2', class_='article_title')
        if not article_links:
            # The category is exhausted; stop instead of paging forever.
            break

        for article_link in article_links:
            a_tag = article_link.find('a')
            if a_tag and 'href' in a_tag.attrs:
                # Resolve relative hrefs against the page they were found on.
                full_article_url = urljoin(paginated_url, a_tag['href'])
                article_data = scrape_article(full_article_url)

                all_articles.append(article_data)
                articles_scraped += 1

                if articles_scraped >= num_articles:
                    break

        if articles_scraped >= num_articles:
            break

        page_num += 1
        print(f"Going to next page: {category_url}/index.{page_num}.html")

    df = pd.DataFrame(all_articles)
    csv_file_name = f"{site_name}_{category_name}_articles.csv"
    csv_file_path = os.path.join(os.getcwd(), csv_file_name)
    df.to_csv(csv_file_path, index=False)
    print(f"Articles saved to {csv_file_path}")

    return csv_file_path
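
# Minimal usage sketch. The category URL and article count are hypothetical
# placeholders, and the upload assumes authenticate_google_drive() returns a
# PyDrive GoogleDrive client (an assumption; google_drive_handle isn't shown
# here).
if __name__ == '__main__':
    csv_path = scrape_category('https://www.example.com/news', num_articles=20)

    # Upload the resulting CSV to Google Drive (PyDrive-style calls).
    uploaded = drive.CreateFile({'title': os.path.basename(csv_path)})
    uploaded.SetContentFile(csv_path)
    uploaded.Upload()
    print(f"Uploaded {csv_path} to Google Drive")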