# Source: Hugging Face Space by axel-darmouni — commit f584ef2 ("all modifs"), 5.84 kB
import os
from tools.webpage_tools import (
visit_webpage,
get_all_links,
read_file_from_url,
save_dataset_for_followup,
)
from tools.exploration_tools import (
get_dataset_description,
)
from tools.drawing_tools import (
plot_departments_data,
)
from tools.libreoffice_tools import (
convert_to_pdf_with_libreoffice,
check_libreoffice_availability,
get_libreoffice_info,
)
from tools.retrieval_tools import (
search_datasets,
get_dataset_info,
get_random_quality_dataset,
)
from smolagents import (
CodeAgent,
DuckDuckGoSearchTool,
LiteLLMModel,
)
def create_web_agent(step_callback):
    """Build the CodeAgent used to search, analyze and report on French public datasets.

    Args:
        step_callback: Callable invoked by smolagents after each agent step
            (wired through the built-in ``step_callbacks`` mechanism).

    Returns:
        A configured ``smolagents.CodeAgent`` backed by a Gemini model via LiteLLM.
    """
    llm = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    # Full toolbox: web search, page/dataset access, plotting, LibreOffice
    # conversion helpers, and data.gouv.fr retrieval tools.
    toolbox = [
        DuckDuckGoSearchTool(),
        visit_webpage,
        get_all_links,
        read_file_from_url,
        save_dataset_for_followup,
        get_dataset_description,
        plot_departments_data,
        convert_to_pdf_with_libreoffice,
        check_libreoffice_availability,
        get_libreoffice_info,
        search_datasets,
        get_dataset_info,
        get_random_quality_dataset,
    ]
    return CodeAgent(
        tools=toolbox,
        model=llm,
        max_steps=30,
        verbosity_level=1,  # Reduced verbosity for cleaner output
        planning_interval=3,
        step_callbacks=[step_callback],  # Use the built-in callback system
        additional_authorized_imports=[
            "subprocess", "docx", "docx.*",
            "os", "bs4", "io", "requests", "json", "pandas",
            "matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy", "seaborn",
        ],
    )
def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis.

    Args:
        user_query: Optional topic string the analysis should focus on.
        initial_search_results: Optional pre-fetched search results to present
            to the agent (only used when ``user_query`` is also provided).

    Returns:
        The full prompt string for one of three modes: query + initial results,
        query only, or fully generic exploration.
    """
    # Step-by-step workflow shared verbatim by every prompt variant.
    base_instructions = """Follow these steps to analyze French public data:
1. **Dataset Selection**:
- You can use the search_datasets tool to find relevant datasets
- You can use get_dataset_info to get detailed information about specific datasets
- You can use get_random_quality_dataset to explore interesting datasets
2. **Dataset Analysis**:
- Examine the selected dataset page using visit_webpage
- Get all available data links using get_all_links
- Download and analyze the dataset using read_file_from_url
- Save the dataset for follow-up analysis using save_dataset_for_followup
- Get dataset description using get_dataset_description
3. **Visualization Creation**:
- If geographic data (departments/regions) is available, create a map of France
- Create 3 additional non-map visualizations
- Save all visualizations as PNG files
4. **Report Generation**:
- Write insightful analysis text for each visualization
- Generate a comprehensive PDF report using python-docx library that includes:
* Title page with dataset name and analysis overview
* All visualizations (PNG files) embedded in the report
* Analysis text for each visualization
* Conclusions and next steps
- Convert the docx file to PDF using convert_to_pdf_with_libreoffice tool
**Important Technical Notes:**
- Save everything in the generated_data folder
- Do NOT use the 'os' module
- Work step by step, don't generate too much code at once
- Before PDF conversion, call check_libreoffice_availability() - it returns True/False
- If check_libreoffice_availability() returns True, use convert_to_pdf_with_libreoffice() tool
- If check_libreoffice_availability() returns False, skip PDF conversion and inform user
- Do NOT use subprocess calls directly for LibreOffice
- If question is in English, report is in English. If in French, report is in French.
"""
    # Closing guidance appended identically to all three prompt variants.
    shared_footer = """If user query is not specific, remain generic with respect to the dataset at hand.
Focus on getting results and analytics; do not go with too much data, we can always improve it later.
"""
    # Guard clause: no query at all -> fully generic exploration prompt.
    if not user_query:
        return f"""I need you to find and analyze an interesting French public dataset.
{base_instructions}
Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
{shared_footer}"""
    # Query plus pre-fetched results -> let the agent pick from them or search again.
    if initial_search_results:
        return f"""I need you to analyze French public datasets related to: "{user_query}"
**INITIAL SEARCH RESULTS:**
{initial_search_results}
You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using get_dataset_info tool
{base_instructions}
Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
{shared_footer}"""
    # Query only -> the agent must run the search itself.
    return f"""I need you to find and analyze French public datasets related to: "{user_query}"
{base_instructions}
Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
{shared_footer}"""