import requests
from smolagents import tool
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from dotenv import load_dotenv
import pandas as pd
import json
import os
from io import BytesIO
load_dotenv()
@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The DOM of the webpage as a string, or an error message if the request fails.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Prefix with "Error" so callers can detect failures consistently
        return f"Error: an unexpected error occurred: {str(e)}"
@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Join the base URL with the found href to handle relative links
        absolute_url = urljoin(base_url, href)
        if absolute_url.lower().endswith(('.csv', '.json', '.xlsx')):
            links.add(absolute_url)
    return list(links)
@tool
def read_csv_file(file_path: str) -> str:
    """
    Reads a semicolon-delimited CSV file and returns its content as a string.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
        # Assumes a semicolon-delimited file, as is common for French open-data exports
        df = pd.read_csv(file_path, delimiter=';')
        return df.to_string()
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"
@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file, or raises an exception if the file cannot be read.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Fall back to UTF-8 when the server does not declare a usable encoding
        if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']:
            response.encoding = 'utf-8'

        # Determine file type based on URL extension
        if url.lower().endswith('.csv'):
            # Use BytesIO so pandas handles the decoding itself
            content_bytes = response.content
            # Try different delimiters for CSV files
            try:
                # First try comma separator
                df = pd.read_csv(BytesIO(content_bytes), encoding='utf-8')
            except Exception:
                try:
                    # Then try semicolon separator
                    df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='utf-8')
                except Exception:
                    try:
                        # Finally try tab separator
                        df = pd.read_csv(BytesIO(content_bytes), delimiter='\t', encoding='utf-8')
                    except Exception:
                        # Last resort: try latin-1 encoding
                        df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='latin-1')
        elif url.lower().endswith('.json'):
            # Parse JSON and convert to DataFrame, using the corrected text encoding
            json_data = json.loads(response.text)
            # Handle different JSON structures
            if isinstance(json_data, list):
                df = pd.DataFrame(json_data)
            elif isinstance(json_data, dict):
                # If it's a dict, try to find the main data array
                if len(json_data.keys()) == 1:
                    # If there's only one key, use its value
                    key = list(json_data.keys())[0]
                    if isinstance(json_data[key], list):
                        df = pd.DataFrame(json_data[key])
                    else:
                        df = pd.DataFrame([json_data])
                else:
                    # Multiple keys, treat the whole dict as a single row
                    df = pd.DataFrame([json_data])
            else:
                raise ValueError("Unsupported JSON structure")
        elif url.lower().endswith('.xlsx'):
            # Handle Excel files
            content_bytes = response.content
            df = pd.read_excel(BytesIO(content_bytes), engine='openpyxl')
        else:
            raise ValueError("Unsupported file type. Only CSV, JSON, and Excel (.xlsx) files are supported.")
        return df
    except RequestException as e:
        raise Exception(f"Error fetching the file from URL: {str(e)}")
    except json.JSONDecodeError as e:
        raise Exception(f"Error parsing JSON file: {str(e)}")
    except pd.errors.EmptyDataError:
        raise Exception("The file is empty or contains no data")
    except Exception as e:
        raise Exception(f"An unexpected error occurred: {str(e)}")
@tool
def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str:
    """
    Save the current dataset to the generated_data folder for follow-up analysis.

    Args:
        df: The pandas DataFrame to save
        filename: Name of the file to save (default: "analysis_dataset.csv")

    Returns:
        Confirmation message with file path
    """
    try:
        # Ensure the generated_data directory exists
        os.makedirs('generated_data', exist_ok=True)
        # Save the dataset
        filepath = os.path.join('generated_data', filename)
        df.to_csv(filepath, index=False)
        return f"Dataset saved for follow-up analysis: {filepath} ({len(df)} rows, {len(df.columns)} columns)"
    except Exception as e:
        return f"Error saving dataset: {str(e)}"
if __name__ == "__main__":
    # Example: crawl a data.gouv.fr dataset page and list its downloadable files
    # url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
    url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"
    dom_content = visit_webpage(url)
    if not dom_content.startswith("Error"):
        all_links = get_all_links(dom_content, url)
        for link in all_links:
            print(link)

    # Load one of the published CSV resources directly into a DataFrame
    # link = "https://static.data.gouv.fr/resources/repertoire-national-des-elus-1/20250312-164351/elus-conseillers-darrondissements-ca.csv"
    link = "https://static.data.gouv.fr/resources/catalogue-des-donnees-de-data-gouv-fr/20250608-054904/export-dataset-20250608-054904.csv"
    df = read_file_from_url(link)
    print(df.head())
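
    # Illustrative follow-up (not part of the original demo): persist the DataFrame
    # loaded above so a later run can reuse it from generated_data/. The filename
    # here is just an example value.
    confirmation = save_dataset_for_followup(df, "catalogue-datasets.csv")
    print(confirmation)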