"""smolagents tools for an open-data workflow: fetch web pages, discover links to
CSV/JSON/Excel resources, and load them into pandas DataFrames. The __main__ block
exercises the tools against data.gouv.fr datasets."""

import json
import os
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.exceptions import RequestException
from smolagents import tool

# Load environment variables (e.g. API keys) from a local .env file.
load_dotenv()


@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The DOM of the webpage as a string, or an error message if the request fails.
    """
    try:
        # A timeout keeps the caller from hanging on unresponsive servers.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Keep the "Error" prefix so callers can detect failure via startswith("Error").
        return f"Error: an unexpected problem occurred: {str(e)}"


@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        absolute_url = urljoin(base_url, href)
        if absolute_url.lower().endswith(('.csv', '.json', '.xlsx')):
            links.add(absolute_url)
    return list(links)
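
# Illustration of the resolution step in get_all_links (hypothetical URLs, not from
# the datasets used below): urljoin("https://example.org/dataset/", "files/export.csv")
# returns "https://example.org/dataset/files/export.csv", while an href that is
# already absolute passes through unchanged.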


@tool
def read_csv_file(file_path: str) -> str:
    """
    Reads a semicolon-delimited CSV file and returns its content as a string.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
        # Semicolon delimiters are common in French open-data CSV exports.
        df = pd.read_csv(file_path, delimiter=';')
        return df.to_string()
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"


@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file. Note that unlike the
        tools above, this one raises an exception instead of returning an error
        string if the file cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # requests falls back to ISO-8859-1 when the server omits a charset;
        # assume UTF-8 instead, which these exports almost always use.
        if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']:
            response.encoding = 'utf-8'

        if url.lower().endswith('.csv'):
            content_bytes = response.content
            # Try likely delimiter/encoding combinations in turn: comma, then
            # semicolon, then tab (all UTF-8), then semicolon with Latin-1.
            try:
                df = pd.read_csv(BytesIO(content_bytes), encoding='utf-8')
                # A comma parse "succeeds" on semicolon-delimited data by producing
                # a single column; treat that as a failure so the fallbacks run.
                if df.shape[1] == 1 and ';' in str(df.columns[0]):
                    raise ValueError("semicolon-delimited data parsed as one column")
            except Exception:
                try:
                    df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='utf-8')
                except Exception:
                    try:
                        df = pd.read_csv(BytesIO(content_bytes), delimiter='\t', encoding='utf-8')
                    except Exception:
                        df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='latin-1')

        elif url.lower().endswith('.json'):
            json_data = json.loads(response.text)

            # Normalise the common JSON shapes into a DataFrame.
            if isinstance(json_data, list):
                # A list of records maps directly onto rows.
                df = pd.DataFrame(json_data)
            elif isinstance(json_data, dict):
                if len(json_data.keys()) == 1:
                    # A single-key wrapper such as {"records": [...]}: unwrap it.
                    key = list(json_data.keys())[0]
                    if isinstance(json_data[key], list):
                        df = pd.DataFrame(json_data[key])
                    else:
                        df = pd.DataFrame([json_data])
                else:
                    # A flat object becomes a single-row DataFrame.
                    df = pd.DataFrame([json_data])
            else:
                raise ValueError("Unsupported JSON structure")

        elif url.lower().endswith('.xlsx'):
            content_bytes = response.content
            df = pd.read_excel(BytesIO(content_bytes), engine='openpyxl')

        else:
            raise ValueError("Unsupported file type. Only CSV, JSON, and Excel (.xlsx) files are supported.")

        return df

    except RequestException as e:
        raise Exception(f"Error fetching the file from URL: {str(e)}")
    except json.JSONDecodeError as e:
        raise Exception(f"Error parsing JSON file: {str(e)}")
    except pd.errors.EmptyDataError:
        raise Exception("The file is empty or contains no data")
    except Exception as e:
        raise Exception(f"An unexpected error occurred: {str(e)}")


@tool
def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str:
    """
    Save the current dataset to the generated_data folder for follow-up analysis.

    Args:
        df: The pandas DataFrame to save
        filename: Name of the file to save (default: "analysis_dataset.csv")

    Returns:
        Confirmation message with file path
    """
    try:
        os.makedirs('generated_data', exist_ok=True)
        filepath = os.path.join('generated_data', filename)
        df.to_csv(filepath, index=False)
        return f"Dataset saved for follow-up analysis: {filepath} ({len(df)} rows, {len(df.columns)} columns)"
    except Exception as e:
        return f"Error saving dataset: {str(e)}"


if __name__ == "__main__":
    # Earlier test dataset, kept for reference:
    # url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
    url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"
    dom_content = visit_webpage(url)
    if not dom_content.startswith("Error"):
        all_links = get_all_links(dom_content, url)
        for link in all_links:
            print(link)

    # Earlier test file, kept for reference:
    # link = "https://static.data.gouv.fr/resources/repertoire-national-des-elus-1/20250312-164351/elus-conseillers-darrondissements-ca.csv"
    link = "https://static.data.gouv.fr/resources/catalogue-des-donnees-de-data-gouv-fr/20250608-054904/export-dataset-20250608-054904.csv"
    content = read_file_from_url(link)
    print(content.head())
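
    # Optionally persist the loaded DataFrame so a follow-up session can reuse it.
    # The filename below is an illustrative choice, not one from the original script.
    print(save_dataset_for_followup(content, filename="catalogue_datasets.csv"))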