|
import os |
|
from tools.webpage_tools import ( |
|
visit_webpage, |
|
get_all_links, |
|
read_file_from_url, |
|
save_dataset_for_followup, |
|
) |
|
from tools.exploration_tools import ( |
|
get_dataset_description, |
|
) |
|
from tools.drawing_tools import ( |
|
plot_departments_data, |
|
) |
|
from tools.libreoffice_tools import ( |
|
convert_to_pdf_with_libreoffice, |
|
check_libreoffice_availability, |
|
get_libreoffice_info, |
|
) |
|
from tools.retrieval_tools import ( |
|
search_datasets, |
|
get_dataset_info, |
|
get_random_quality_dataset, |
|
) |
|
from smolagents import ( |
|
CodeAgent, |
|
DuckDuckGoSearchTool, |
|
LiteLLMModel, |
|
) |
|
|
|
def create_web_agent(step_callback):
    """Assemble the CodeAgent that searches, analyses and reports on datasets.

    The agent is backed by a Gemini model reached through LiteLLM; the API
    key is read from the ``GEMINI_API_KEY`` environment variable.

    Args:
        step_callback: Registered as the agent's single entry in
            ``step_callbacks``.

    Returns:
        The configured ``CodeAgent``.
    """
    # Order is kept as-is: web search first, then page/file access,
    # exploration, drawing, LibreOffice helpers and dataset retrieval.
    toolbox = [
        DuckDuckGoSearchTool(),
        visit_webpage,
        get_all_links,
        read_file_from_url,
        save_dataset_for_followup,
        get_dataset_description,
        plot_departments_data,
        convert_to_pdf_with_libreoffice,
        check_libreoffice_availability,
        get_libreoffice_info,
        search_datasets,
        get_dataset_info,
        get_random_quality_dataset,
    ]

    llm = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )

    # Modules the agent's generated code is allowed to import.
    import_whitelist = [
        "subprocess",
        "docx",
        "docx.*",
        "os",
        "bs4",
        "io",
        "requests",
        "json",
        "pandas",
        "matplotlib",
        "matplotlib.pyplot",
        "matplotlib.*",
        "numpy",
        "seaborn",
    ]

    return CodeAgent(
        tools=toolbox,
        model=llm,
        max_steps=30,
        verbosity_level=1,
        planning_interval=3,
        step_callbacks=[step_callback],
        additional_authorized_imports=import_whitelist,
    )
|
|
|
def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis.

    One of three prompt variants is returned, depending on the arguments:

    * query and initial search results: the results are embedded and the
      agent may pick one of them, search again, or ask for more details;
    * query only: the agent is told to start with ``search_datasets``;
    * no query: the agent is told to find an interesting dataset itself.

    Args:
        user_query: Topic to analyse. ``None``, an empty string and a
            whitespace-only string all mean "no query". Surrounding
            whitespace is stripped before the query is quoted in the prompt.
        initial_search_results: Text describing datasets already found for
            the query. It is only used when a query is also given.

    Returns:
        The complete prompt as a single string.
    """
    # A blank query is truthy and would otherwise be quoted verbatim as the
    # topic ('related to: "   "'); normalise it so it counts as "no query".
    if isinstance(user_query, str):
        user_query = user_query.strip()

    # The literals below start at column 0 on purpose, so that no source
    # indentation leaks into the text sent to the model.
    base_instructions = """Follow these steps to analyze French public data:

1. **Dataset Selection**:
- You can use the search_datasets tool to find relevant datasets
- You can use get_dataset_info to get detailed information about specific datasets
- You can use get_random_quality_dataset to explore interesting datasets

2. **Dataset Analysis**:
- Examine the selected dataset page using visit_webpage
- Get all available data links using get_all_links
- Download and analyze the dataset using read_file_from_url
- Save the dataset for follow-up analysis using save_dataset_for_followup
- Get dataset description using get_dataset_description

3. **Visualization Creation**:
- If geographic data (departments/regions) is available, create a map of France
- Create 3 additional non-map visualizations
- Save all visualizations as PNG files

4. **Report Generation**:
- Write insightful analysis text for each visualization
- Generate a comprehensive PDF report using python-docx library that includes:
* Title page with dataset name and analysis overview
* All visualizations (PNG files) embedded in the report
* Analysis text for each visualization
* Conclusions and next steps
- Convert the docx file to PDF using convert_to_pdf_with_libreoffice tool

**Important Technical Notes:**
- Save everything in the generated_data folder
- Do NOT use the 'os' module
- Work step by step, don't generate too much code at once
- Before PDF conversion, call check_libreoffice_availability() - it returns True/False
- If check_libreoffice_availability() returns True, use convert_to_pdf_with_libreoffice() tool
- If check_libreoffice_availability() returns False, skip PDF conversion and inform user
- Do NOT use subprocess calls directly for LibreOffice
- If question is in English, report is in English. If in French, report is in French.
"""

    # Shared tail of every variant; kept in one place so the three prompts
    # cannot drift apart.
    closing_guidance = """If user query is not specific, remain generic with respect to the dataset at hand.
Focus on getting results and analytics; do not go with too much data, we can always improve it later.
"""

    if user_query and initial_search_results:
        return f"""I need you to analyze French public datasets related to: "{user_query}"

**INITIAL SEARCH RESULTS:**
{initial_search_results}

You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using get_dataset_info tool

{base_instructions}

Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
{closing_guidance}"""

    if user_query:
        return f"""I need you to find and analyze French public datasets related to: "{user_query}"

{base_instructions}

Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
{closing_guidance}"""

    # No usable query: search results (if any) are ignored, as before.
    return f"""I need you to find and analyze an interesting French public dataset.

{base_instructions}

Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
{closing_guidance}"""