import requests
from smolagents import tool
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from dotenv import load_dotenv
import pandas as pd
import json
import os
from io import BytesIO
load_dotenv()
@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The DOM of the webpage as a string, or an error message if the request fails.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Prefix with "Error" so callers can detect failures consistently
        return f"Error: an unexpected error occurred: {str(e)}"
@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Join the base URL with the found href to handle relative links
        absolute_url = urljoin(base_url, href)
        if absolute_url.lower().endswith(('.csv', '.json', '.xlsx')):
            links.add(absolute_url)
    return list(links)
@tool
def read_csv_file(file_path: str) -> str:
    """
    Reads a semicolon-delimited CSV file and returns its content as a string.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
        # Assumes a semicolon-delimited file, as is common for French open-data exports
        df = pd.read_csv(file_path, delimiter=';')
        return df.to_string()
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"
@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file, or raises an exception if the file cannot be read.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Fall back to UTF-8 when the server does not declare a usable encoding
        if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']:
            response.encoding = 'utf-8'

        # Determine file type based on URL extension
        if url.lower().endswith('.csv'):
            # Use BytesIO so pandas handles the decoding itself
            content_bytes = response.content
            # Try different delimiters for CSV files
            try:
                # First try comma separator
                df = pd.read_csv(BytesIO(content_bytes), encoding='utf-8')
            except Exception:
                try:
                    # Then try semicolon separator
                    df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='utf-8')
                except Exception:
                    try:
                        # Finally try tab separator
                        df = pd.read_csv(BytesIO(content_bytes), delimiter='\t', encoding='utf-8')
                    except Exception:
                        # Last resort: try latin-1 encoding
                        df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='latin-1')
        elif url.lower().endswith('.json'):
            # Parse JSON and convert to DataFrame, using the corrected text encoding
            json_data = json.loads(response.text)
            # Handle different JSON structures
            if isinstance(json_data, list):
                df = pd.DataFrame(json_data)
            elif isinstance(json_data, dict):
                # If it's a dict, try to find the main data array
                if len(json_data.keys()) == 1:
                    # If there's only one key, use its value
                    key = list(json_data.keys())[0]
                    if isinstance(json_data[key], list):
                        df = pd.DataFrame(json_data[key])
                    else:
                        df = pd.DataFrame([json_data])
                else:
                    # Multiple keys, treat the whole dict as a single row
                    df = pd.DataFrame([json_data])
            else:
                raise ValueError("Unsupported JSON structure")
        elif url.lower().endswith('.xlsx'):
            # Handle Excel files
            content_bytes = response.content
            df = pd.read_excel(BytesIO(content_bytes), engine='openpyxl')
        else:
            raise ValueError("Unsupported file type. Only CSV, JSON, and Excel (.xlsx) files are supported.")
        return df
    except RequestException as e:
        raise Exception(f"Error fetching the file from URL: {str(e)}")
    except json.JSONDecodeError as e:
        raise Exception(f"Error parsing JSON file: {str(e)}")
    except pd.errors.EmptyDataError:
        raise Exception("The file is empty or contains no data")
    except Exception as e:
        raise Exception(f"An unexpected error occurred: {str(e)}")
@tool
def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str:
    """
    Save the current dataset to the generated_data folder for follow-up analysis.

    Args:
        df: The pandas DataFrame to save
        filename: Name of the file to save (default: "analysis_dataset.csv")

    Returns:
        Confirmation message with file path
    """
    try:
        # Ensure the generated_data directory exists
        os.makedirs('generated_data', exist_ok=True)
        # Save the dataset
        filepath = os.path.join('generated_data', filename)
        df.to_csv(filepath, index=False)
        return f"Dataset saved for follow-up analysis: {filepath} ({len(df)} rows, {len(df.columns)} columns)"
    except Exception as e:
        return f"Error saving dataset: {str(e)}"
if __name__ == "__main__":
    # Example: crawl a data.gouv.fr dataset page and list its downloadable files
    # url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
    url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"
    dom_content = visit_webpage(url)
    if not dom_content.startswith("Error"):
        all_links = get_all_links(dom_content, url)
        for link in all_links:
            print(link)

    # Load one of the published CSV resources directly into a DataFrame
    # link = "https://static.data.gouv.fr/resources/repertoire-national-des-elus-1/20250312-164351/elus-conseillers-darrondissements-ca.csv"
    link = "https://static.data.gouv.fr/resources/catalogue-des-donnees-de-data-gouv-fr/20250608-054904/export-dataset-20250608-054904.csv"
    df = read_file_from_url(link)
    print(df.head())
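
    # Illustrative follow-up (not part of the original demo): persist the DataFrame
    # loaded above so a later run can reuse it from generated_data/. The filename
    # here is just an example value.
    confirmation = save_dataset_for_followup(df, "catalogue-datasets.csv")
    print(confirmation)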