# Hugging Face Hub commit-page residue (author / commit message / commit hash),
# kept as comments so the module remains valid Python:
# axel-darmouni's picture
# all modifs
# f584ef2
import requests
from smolagents import tool
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from dotenv import load_dotenv
import pandas as pd
import json
from io import StringIO, BytesIO
load_dotenv()
@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The DOM of the webpage as a string, or an error message if the request fails.
    """
    try:
        # A GET with no timeout can hang the agent indefinitely on a stalled
        # server; 30s is a generous ceiling for a single page fetch.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx/5xx)
        return response.text
    except RequestException as e:
        # Report failures as text rather than raising, so the calling agent
        # can read the error and react.
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    data_extensions = ('.csv', '.json', '.xlsx')
    parsed = BeautifulSoup(html_content, 'html.parser')
    # Resolve every anchor's href against the base URL (handles relative
    # links), deduplicating via a set comprehension.
    absolute_urls = {
        urljoin(base_url, anchor['href'])
        for anchor in parsed.find_all('a', href=True)
    }
    # Keep only URLs that point at a supported data-file extension.
    return [u for u in absolute_urls if u.lower().endswith(data_extensions)]
@tool
def read_csv_file(file_path: str, delimiter: str = ';') -> str:
    """
    Reads a CSV file and returns its content as a string.

    Args:
        file_path: The path to the CSV file.
        delimiter: The field delimiter used in the file (default: ';').

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
        # The delimiter was previously hard-coded to ';' (common in French
        # open-data exports); it is now a parameter with the same default,
        # so the tool also handles comma- or tab-separated files.
        df = pd.read_csv(file_path, delimiter=delimiter)
        return df.to_string()
    except Exception as e:
        # Broad catch is deliberate: tools return readable error text to the
        # agent instead of raising.
        return f"Error reading the CSV file: {str(e)}"
def _read_csv_bytes(content_bytes: bytes) -> pd.DataFrame:
    """Parse raw CSV bytes, trying comma, semicolon, then tab in UTF-8, and
    finally semicolon in latin-1 as a last resort (common for French
    open-data exports). The last attempt's exception propagates to the caller.
    """
    attempts = [
        {'encoding': 'utf-8'},                       # comma (pandas default)
        {'delimiter': ';', 'encoding': 'utf-8'},     # semicolon
        {'delimiter': '\t', 'encoding': 'utf-8'},    # tab
        {'delimiter': ';', 'encoding': 'latin-1'},   # last-resort encoding
    ]
    for kwargs in attempts[:-1]:
        try:
            return pd.read_csv(BytesIO(content_bytes), **kwargs)
        except Exception:
            continue
    return pd.read_csv(BytesIO(content_bytes), **attempts[-1])


def _json_to_dataframe(json_data) -> pd.DataFrame:
    """Convert a decoded JSON payload to a DataFrame.

    Supports a list of records, a dict whose single key wraps a record list,
    and a flat dict (treated as one row). Raises ValueError otherwise.
    """
    if isinstance(json_data, list):
        return pd.DataFrame(json_data)
    if isinstance(json_data, dict):
        if len(json_data) == 1:
            # A single key usually wraps the actual data array.
            (value,) = json_data.values()
            if isinstance(value, list):
                return pd.DataFrame(value)
            return pd.DataFrame([json_data])
        # Multiple keys: treat the whole dict as a single row.
        return pd.DataFrame([json_data])
    raise ValueError("Unsupported JSON structure")


@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file, or raises an exception if the file cannot be read.
    """
    try:
        # Timeout added so a dead host cannot hang the agent forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes
        # Servers often omit or mis-declare the charset; default to UTF-8 so
        # response.text decodes accented characters correctly.
        if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']:
            response.encoding = 'utf-8'
        # Dispatch on the URL's file extension.
        lowered = url.lower()
        if lowered.endswith('.csv'):
            df = _read_csv_bytes(response.content)
        elif lowered.endswith('.json'):
            # json.loads stays here so JSONDecodeError is caught below.
            df = _json_to_dataframe(json.loads(response.text))
        elif lowered.endswith('.xlsx'):
            df = pd.read_excel(BytesIO(response.content), engine='openpyxl')
        else:
            # Caught by the generic handler below and re-wrapped, matching
            # the original control flow.
            raise ValueError("Unsupported file type. Only CSV, JSON, and Excel (.xlsx) files are supported.")
        return df
    except RequestException as e:
        raise Exception(f"Error fetching the file from URL: {str(e)}")
    except json.JSONDecodeError as e:
        raise Exception(f"Error parsing JSON file: {str(e)}")
    except pd.errors.EmptyDataError:
        raise Exception("The file is empty or contains no data")
    except Exception as e:
        raise Exception(f"An unexpected error occurred: {str(e)}")
@tool
def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str:
    """
    Save the current dataset to the generated_data folder for follow-up analysis.

    Args:
        df: The pandas DataFrame to save
        filename: Name of the file to save (default: "analysis_dataset.csv")

    Returns:
        Confirmation message with file path
    """
    try:
        import os

        target_dir = 'generated_data'
        # Create the output folder on first use; no-op when it already exists.
        os.makedirs(target_dir, exist_ok=True)
        destination = os.path.join(target_dir, filename)
        df.to_csv(destination, index=False)
        rows, cols = len(df), len(df.columns)
        return f"Dataset saved for follow-up analysis: {destination} ({rows} rows, {cols} columns)"
    except Exception as e:
        return f"Error saving dataset: {str(e)}"
if __name__ == "__main__":
    # Manual smoke test against live data.gouv.fr pages: list downloadable
    # data files on a dataset page, then load one known CSV export.
    # NOTE(review): the first assignments to `url` and `link` below are dead
    # (immediately overwritten) — they look like alternate test targets kept
    # for convenience; confirm before deleting.
    url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
    url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"
    dom_content = visit_webpage(url)
    # visit_webpage signals failure via an "Error..." string, not an exception.
    if not dom_content.startswith("Error"):
        all_links = get_all_links(dom_content, url)
        for link in all_links:
            print(link)
    link = "https://static.data.gouv.fr/resources/repertoire-national-des-elus-1/20250312-164351/elus-conseillers-darrondissements-ca.csv"
    link = "https://static.data.gouv.fr/resources/catalogue-des-donnees-de-data-gouv-fr/20250608-054904/export-dataset-20250608-054904.csv"
    content = read_file_from_url(link)
    print(content.head())