# Hugging Face Hub commit-page residue (author / commit message / commit hash),
# kept as comments so the module remains valid Python:
# axel-darmouni's picture
# all modifs
# f584ef2
import requests
from smolagents import tool
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from dotenv import load_dotenv
import pandas as pd
import json
from io import StringIO, BytesIO
load_dotenv()
@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The DOM of the webpage as a string, or an error message if the request fails.
    """
    try:
        # A GET with no timeout can hang the agent indefinitely on a stalled
        # server; 30s is a generous ceiling for a single page fetch.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx/5xx)
        return response.text
    except RequestException as e:
        # Report failures as text rather than raising, so the calling agent
        # can read the error and react.
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    data_extensions = ('.csv', '.json', '.xlsx')
    parsed = BeautifulSoup(html_content, 'html.parser')
    # Resolve every anchor's href against the base URL (handles relative
    # links), deduplicating via a set comprehension.
    absolute_urls = {
        urljoin(base_url, anchor['href'])
        for anchor in parsed.find_all('a', href=True)
    }
    # Keep only URLs that point at a supported data-file extension.
    return [u for u in absolute_urls if u.lower().endswith(data_extensions)]
@tool
def read_csv_file(file_path: str, delimiter: str = ';') -> str:
    """
    Reads a CSV file and returns its content as a string.

    Args:
        file_path: The path to the CSV file.
        delimiter: The field delimiter used in the file (default: ';').

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
        # The delimiter was previously hard-coded to ';' (common in French
        # open-data exports); it is now a parameter with the same default,
        # so the tool also handles comma- or tab-separated files.
        df = pd.read_csv(file_path, delimiter=delimiter)
        return df.to_string()
    except Exception as e:
        # Broad catch is deliberate: tools return readable error text to the
        # agent instead of raising.
        return f"Error reading the CSV file: {str(e)}"
def _read_csv_bytes(content_bytes: bytes) -> pd.DataFrame:
    """Parse raw CSV bytes, trying comma, semicolon, then tab in UTF-8, and
    finally semicolon in latin-1 as a last resort (common for French
    open-data exports). The last attempt's exception propagates to the caller.
    """
    attempts = [
        {'encoding': 'utf-8'},                       # comma (pandas default)
        {'delimiter': ';', 'encoding': 'utf-8'},     # semicolon
        {'delimiter': '\t', 'encoding': 'utf-8'},    # tab
        {'delimiter': ';', 'encoding': 'latin-1'},   # last-resort encoding
    ]
    for kwargs in attempts[:-1]:
        try:
            return pd.read_csv(BytesIO(content_bytes), **kwargs)
        except Exception:
            continue
    return pd.read_csv(BytesIO(content_bytes), **attempts[-1])


def _json_to_dataframe(json_data) -> pd.DataFrame:
    """Convert a decoded JSON payload to a DataFrame.

    Supports a list of records, a dict whose single key wraps a record list,
    and a flat dict (treated as one row). Raises ValueError otherwise.
    """
    if isinstance(json_data, list):
        return pd.DataFrame(json_data)
    if isinstance(json_data, dict):
        if len(json_data) == 1:
            # A single key usually wraps the actual data array.
            (value,) = json_data.values()
            if isinstance(value, list):
                return pd.DataFrame(value)
            return pd.DataFrame([json_data])
        # Multiple keys: treat the whole dict as a single row.
        return pd.DataFrame([json_data])
    raise ValueError("Unsupported JSON structure")


@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file, or raises an exception if the file cannot be read.
    """
    try:
        # Timeout added so a dead host cannot hang the agent forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes
        # Servers often omit or mis-declare the charset; default to UTF-8 so
        # response.text decodes accented characters correctly.
        if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']:
            response.encoding = 'utf-8'
        # Dispatch on the URL's file extension.
        lowered = url.lower()
        if lowered.endswith('.csv'):
            df = _read_csv_bytes(response.content)
        elif lowered.endswith('.json'):
            # json.loads stays here so JSONDecodeError is caught below.
            df = _json_to_dataframe(json.loads(response.text))
        elif lowered.endswith('.xlsx'):
            df = pd.read_excel(BytesIO(response.content), engine='openpyxl')
        else:
            # Caught by the generic handler below and re-wrapped, matching
            # the original control flow.
            raise ValueError("Unsupported file type. Only CSV, JSON, and Excel (.xlsx) files are supported.")
        return df
    except RequestException as e:
        raise Exception(f"Error fetching the file from URL: {str(e)}")
    except json.JSONDecodeError as e:
        raise Exception(f"Error parsing JSON file: {str(e)}")
    except pd.errors.EmptyDataError:
        raise Exception("The file is empty or contains no data")
    except Exception as e:
        raise Exception(f"An unexpected error occurred: {str(e)}")
@tool
def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str:
    """
    Save the current dataset to the generated_data folder for follow-up analysis.

    Args:
        df: The pandas DataFrame to save
        filename: Name of the file to save (default: "analysis_dataset.csv")

    Returns:
        Confirmation message with file path
    """
    try:
        import os

        target_dir = 'generated_data'
        # Create the output folder on first use; no-op when it already exists.
        os.makedirs(target_dir, exist_ok=True)
        destination = os.path.join(target_dir, filename)
        df.to_csv(destination, index=False)
        rows, cols = len(df), len(df.columns)
        return f"Dataset saved for follow-up analysis: {destination} ({rows} rows, {cols} columns)"
    except Exception as e:
        return f"Error saving dataset: {str(e)}"
if __name__ == "__main__":
    # Manual smoke test against live data.gouv.fr pages: list downloadable
    # data files on a dataset page, then load one known CSV export.
    # NOTE(review): the first assignments to `url` and `link` below are dead
    # (immediately overwritten) — they look like alternate test targets kept
    # for convenience; confirm before deleting.
    url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
    url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"
    dom_content = visit_webpage(url)
    # visit_webpage signals failure via an "Error..." string, not an exception.
    if not dom_content.startswith("Error"):
        all_links = get_all_links(dom_content, url)
        for link in all_links:
            print(link)
    link = "https://static.data.gouv.fr/resources/repertoire-national-des-elus-1/20250312-164351/elus-conseillers-darrondissements-ca.csv"
    link = "https://static.data.gouv.fr/resources/catalogue-des-donnees-de-data-gouv-fr/20250608-054904/export-dataset-20250608-054904.csv"
    content = read_file_from_url(link)
    print(content.head())