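"""Web-data tools for a smolagents agent: fetch a page's HTML, collect links
to CSV/JSON/Excel resources, load them into pandas DataFrames, and save
results for follow-up analysis. The __main__ demo targets data.gouv.fr."""
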
import os
import requests
from smolagents import tool
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from dotenv import load_dotenv
import pandas as pd
import json
from io import BytesIO

load_dotenv()

@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The raw HTML of the webpage as a string (no JavaScript is executed),
        or an error message if the request fails.
    """
    try:
        # Send a GET request to the URL; a timeout keeps a dead or slow
        # server from hanging the agent indefinitely
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        return response.text

    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Prefix with "Error" so callers can detect failure with startswith()
        return f"Error: an unexpected problem occurred: {str(e)}"

@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Join the base URL with the found href to handle relative links
        absolute_url = urljoin(base_url, href)
        if absolute_url.lower().endswith(('.csv', '.json', '.xlsx')):
            links.add(absolute_url)
    return list(links)

@tool
def read_csv_file(file_path: str) -> str:
    """
    Reads a semicolon-delimited CSV file and returns its content as a string.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
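        # Assumes a semicolon separator, the usual convention for French
        # open-data CSV exports (e.g. data.gouv.fr); adjust the delimiter
        # if your file is comma-separated.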
        df = pd.read_csv(file_path, delimiter=';')
        return df.to_string()
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"

@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file, or raises an exception if the file cannot be read.
    """
    try:
        # Send a GET request to the URL (bounded by a timeout)
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # requests falls back to ISO-8859-1 when the server omits a charset;
        # override with UTF-8 so response.text (used for the JSON branch)
        # decodes correctly
        if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']:
            response.encoding = 'utf-8'
        
        # Determine file type based on URL extension
        if url.lower().endswith('.csv'):
            # Work from the raw bytes so pandas controls the decoding
            content_bytes = response.content

            # Let pandas sniff the delimiter (comma, semicolon, tab, ...).
            # A try/except cascade over candidate separators is unreliable:
            # reading a semicolon-separated file with the default comma
            # separator does not raise, it just yields one wide column.
            try:
                df = pd.read_csv(BytesIO(content_bytes), sep=None,
                                 engine='python', encoding='utf-8')
            except UnicodeDecodeError:
                # Last resort: latin-1 for legacy, non-UTF-8 files
                df = pd.read_csv(BytesIO(content_bytes), sep=None,
                                 engine='python', encoding='latin-1')
        
        elif url.lower().endswith('.json'):
            # Parse JSON and convert to DataFrame - use proper encoding
            json_data = json.loads(response.text)
            
            # Handle different JSON structures
            if isinstance(json_data, list):
                df = pd.DataFrame(json_data)
            elif isinstance(json_data, dict):
                # If it's a dict, try to find the main data array
                if len(json_data.keys()) == 1:
                    # If there's only one key, use its value
                    key = list(json_data.keys())[0]
                    if isinstance(json_data[key], list):
                        df = pd.DataFrame(json_data[key])
                    else:
                        df = pd.DataFrame([json_data])
                else:
                    # Multiple keys, treat the whole dict as a single row
                    df = pd.DataFrame([json_data])
            else:
                raise ValueError("Unsupported JSON structure")
        
        elif url.lower().endswith('.xlsx'):
            # Handle Excel files
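            # Note: pd.read_excel with engine='openpyxl' requires the
            # openpyxl package to be installed (pip install openpyxl)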
            content_bytes = response.content
            df = pd.read_excel(BytesIO(content_bytes), engine='openpyxl')
        
        else:
            raise ValueError("Unsupported file type. Only CSV, JSON, and Excel (.xlsx) files are supported.")
        
        return df
        
    except RequestException as e:
        raise Exception(f"Error fetching the file from URL: {str(e)}")
    except json.JSONDecodeError as e:
        raise Exception(f"Error parsing JSON file: {str(e)}")
    except pd.errors.EmptyDataError:
        raise Exception("The file is empty or contains no data")
    except Exception as e:
        raise Exception(f"An unexpected error occurred: {str(e)}")

@tool
def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str:
    """
    Save the current dataset to the generated_data folder for follow-up analysis.
    
    Args:
        df: The pandas DataFrame to save
        filename: Name of the file to save (default: "analysis_dataset.csv")
        
    Returns:
        Confirmation message with file path
    """
    try:
        # Ensure the generated_data directory exists
        os.makedirs('generated_data', exist_ok=True)
        
        # Save the dataset
        filepath = os.path.join('generated_data', filename)
        df.to_csv(filepath, index=False)
        
        return f"Dataset saved for follow-up analysis: {filepath} ({len(df)} rows, {len(df.columns)} columns)"
        
    except Exception as e:
        return f"Error saving dataset: {str(e)}"

if __name__ == "__main__":
    url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
    url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"
    dom_content = visit_webpage(url)
    if not dom_content.startswith("Error"):
        all_links = get_all_links(dom_content, url)
        for link in all_links:
            print(link)

    link = "https://static.data.gouv.fr/resources/repertoire-national-des-elus-1/20250312-164351/elus-conseillers-darrondissements-ca.csv"
    link = "https://static.data.gouv.fr/resources/catalogue-des-donnees-de-data-gouv-fr/20250608-054904/export-dataset-20250608-054904.csv"
    content = read_file_from_url(link)
    print(content.head())
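
    # Optionally persist the DataFrame for follow-up analysis (the filename
    # here is illustrative):
    # save_dataset_for_followup(content, "catalogue_sample.csv")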