import json
import os
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.exceptions import RequestException
from smolagents import tool

load_dotenv()


@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The DOM of the webpage as a string, or an error message if the request fails.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"


@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Join the base URL with the found href to handle relative links
        absolute_url = urljoin(base_url, href)
        if absolute_url.lower().endswith(('.csv', '.json', '.xlsx')):
            links.add(absolute_url)
    return list(links)


@tool
def read_csv_file(file_path: str) -> str:
    """
    Reads a semicolon-delimited CSV file from disk and returns its content as a string.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
        df = pd.read_csv(file_path, delimiter=';')
        return df.to_string()
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"


@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file, or raises an exception if the file cannot be read.
""" try: # Send a GET request to the URL response = requests.get(url) response.raise_for_status() # Raise an exception for bad status codes # Handle encoding properly if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']: response.encoding = 'utf-8' # Determine file type based on URL extension if url.lower().endswith('.csv'): # Use BytesIO to handle encoding properly content_bytes = response.content # Try different delimiters for CSV files try: # First try comma separator df = pd.read_csv(BytesIO(content_bytes), encoding='utf-8') except Exception: try: # Then try semicolon separator df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='utf-8') except Exception: try: # Finally try tab separator df = pd.read_csv(BytesIO(content_bytes), delimiter='\t', encoding='utf-8') except Exception: # Last resort: try latin-1 encoding df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='latin-1') elif url.lower().endswith('.json'): # Parse JSON and convert to DataFrame - use proper encoding json_data = json.loads(response.text) # Handle different JSON structures if isinstance(json_data, list): df = pd.DataFrame(json_data) elif isinstance(json_data, dict): # If it's a dict, try to find the main data array if len(json_data.keys()) == 1: # If there's only one key, use its value key = list(json_data.keys())[0] if isinstance(json_data[key], list): df = pd.DataFrame(json_data[key]) else: df = pd.DataFrame([json_data]) else: # Multiple keys, treat the whole dict as a single row df = pd.DataFrame([json_data]) else: raise ValueError("Unsupported JSON structure") elif url.lower().endswith('.xlsx'): # Handle Excel files content_bytes = response.content df = pd.read_excel(BytesIO(content_bytes), engine='openpyxl') else: raise ValueError("Unsupported file type. Only CSV, JSON, and Excel (.xlsx) files are supported.") return df except RequestException as e: raise Exception(f"Error fetching the file from URL: {str(e)}") except json.JSONDecodeError as e: raise Exception(f"Error parsing JSON file: {str(e)}") except pd.errors.EmptyDataError: raise Exception("The file is empty or contains no data") except Exception as e: raise Exception(f"An unexpected error occurred: {str(e)}") @tool def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str: """ Save the current dataset to the generated_data folder for follow-up analysis. 
    Args:
        df: The pandas DataFrame to save.
        filename: Name of the file to save (default: "analysis_dataset.csv").

    Returns:
        Confirmation message with the file path.
    """
    try:
        # Ensure the generated_data directory exists
        os.makedirs('generated_data', exist_ok=True)

        # Save the dataset
        filepath = os.path.join('generated_data', filename)
        df.to_csv(filepath, index=False)

        return f"Dataset saved for follow-up analysis: {filepath} ({len(df)} rows, {len(df.columns)} columns)"
    except Exception as e:
        return f"Error saving dataset: {str(e)}"


if __name__ == "__main__":
    # Alternative example dataset:
    # url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
    url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"

    dom_content = visit_webpage(url)
    if not dom_content.startswith("Error"):
        all_links = get_all_links(dom_content, url)
        for link in all_links:
            print(link)

    # Alternative example file:
    # link = "https://static.data.gouv.fr/resources/repertoire-national-des-elus-1/20250312-164351/elus-conseillers-darrondissements-ca.csv"
    link = "https://static.data.gouv.fr/resources/catalogue-des-donnees-de-data-gouv-fr/20250608-054904/export-dataset-20250608-054904.csv"
    content = read_file_from_url(link)
    print(content.head())
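

# --- Example: wiring these tools into an agent ------------------------------
# A minimal sketch, left commented out, assuming smolagents' CodeAgent API;
# the model class name (InferenceClientModel here) differs across smolagents
# versions, so treat this as an illustration rather than the project's setup.
#
# from smolagents import CodeAgent, InferenceClientModel
#
# agent = CodeAgent(
#     tools=[visit_webpage, get_all_links, read_file_from_url, save_dataset_for_followup],
#     model=InferenceClientModel(),
# )
# agent.run(
#     "Visit https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/, "
#     "find a CSV link, load it, and report the column names."
# )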