import os

from tools.webpage_tools import (
    visit_webpage,
    get_all_links,
    read_file_from_url,
    save_dataset_for_followup,
)
from tools.exploration_tools import (
    get_dataset_description,
)
from tools.drawing_tools import (
    plot_departments_data,
)
from tools.retrieval_tools import (
    search_datasets,
    get_dataset_info,
    get_random_quality_dataset,
)
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    LiteLLMModel,
)

def create_web_agent(step_callback):
    """Build the CodeAgent that searches, analyzes, and reports on French public datasets."""
    search_tool = DuckDuckGoSearchTool()
    model = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    web_agent = CodeAgent(
        tools=[
            search_tool,
            visit_webpage,
            get_all_links,
            read_file_from_url,
            save_dataset_for_followup,
            get_dataset_description,
            plot_departments_data,
            search_datasets,
            get_dataset_info,
            get_random_quality_dataset,
        ],
        model=model,
        max_steps=30,
        verbosity_level=1,
        planning_interval=3,
        step_callbacks=[step_callback],
        additional_authorized_imports=[
            "docx", "docx.*",
            "os", "bs4", "io", "requests", "json", "pandas",
            "matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy", "seaborn",
        ],
    )
    return web_agent
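
# Illustrative sketch (not part of the original module): a minimal step callback
# that could be passed to create_web_agent. In smolagents, each callable listed in
# `step_callbacks` is invoked after every agent step with the step record; the
# exact signature can vary between versions, so it is kept loose on purpose.
def example_step_logger(step, **kwargs):
    # Print whichever step number the framework exposes on the step object.
    print(f"Agent step finished: {getattr(step, 'step_number', '?')}")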
def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis."""

    base_instructions = """Follow these steps to analyze French public data:

1. **Dataset Selection**:
   - You can use the search_datasets tool to find relevant datasets
   - You can use get_dataset_info to get detailed information about specific datasets
   - You can use get_random_quality_dataset to explore interesting datasets

2. **Dataset Analysis**:
   - Examine the selected dataset page using visit_webpage
   - Get all available data links using get_all_links
   - Download and analyze the dataset using read_file_from_url
   - Save the dataset for follow-up analysis using save_dataset_for_followup
   - Get the dataset description using get_dataset_description

3. **Visualization Creation**:
   - **French Map Creation**: If you have data by French departments or regions, use the plot_departments_data tool:
     * Call: plot_departments_data(data_dict, title, filename, color_scheme='viridis')
     * data_dict format: {"department_name": value, "department_code": value}
     * Supports both department names (e.g., "Paris", "Bouches-du-Rhône") and codes (e.g., "75", "13")
     * Example: plot_departments_data({"Paris": 2161, "Lyon": 513}, "Population by Department", "population_map.png")
     * The tool automatically saves the map as a PNG in the generated_data folder
   - Create 3 additional non-map visualizations using matplotlib/seaborn
   - Save all visualizations as PNG files in the generated_data folder

4. **Report Generation**:
   - Write insightful analysis text for each visualization
   - Generate a comprehensive DOCX report using the python-docx library that includes:
     * A title page with the dataset name and an analysis overview
     * All visualizations (PNG files) embedded in the report
     * Analysis text for each visualization
     * Conclusions and next steps
   - Save the final DOCX report in the generated_data folder

**Important Technical Notes:**
- Save everything in the generated_data folder
- Do NOT use the 'os' module
- Work step by step; do not generate too much code at once
- Generate a complete DOCX report that the user can download
- If the question is in English, write the report in English; if it is in French, write it in French.
"""

    if user_query and initial_search_results:
        return f"""I need you to analyze French public datasets related to: "{user_query}"

**INITIAL SEARCH RESULTS:**
{initial_search_results}

You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using the get_dataset_info tool

{base_instructions}

Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
If the user query is not specific, keep the analysis generic with respect to the dataset at hand.
Focus on getting results and analysis; do not pull in too much data at once, as it can always be refined later.
"""

    elif user_query:
        return f"""I need you to find and analyze French public datasets related to: "{user_query}"

{base_instructions}

Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
If the user query is not specific, keep the analysis generic with respect to the dataset at hand.
Focus on getting results and analysis; do not pull in too much data at once, as it can always be refined later.
"""

    else:
        return f"""I need you to find and analyze an interesting French public dataset.

{base_instructions}

Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
Since there is no specific user query, keep the analysis generic with respect to the dataset at hand.
Focus on getting results and analysis; do not pull in too much data at once, as it can always be refined later.
"""