axel-darmouni's picture
all gemini
244cc53
raw
history blame
5.85 kB
import os
from tools.webpage_tools import (
visit_webpage,
get_all_links,
read_file_from_url,
save_dataset_for_followup,
)
from tools.exploration_tools import (
get_dataset_description,
)
from tools.drawing_tools import (
plot_departments_data,
)
from tools.retrieval_tools import (
search_datasets,
get_dataset_info,
get_random_quality_dataset,
)
from smolagents import (
CodeAgent,
DuckDuckGoSearchTool,
LiteLLMModel,
)
def create_web_agent(step_callback):
    """Build the code-writing agent used to search, analyze and report on datasets.

    Args:
        step_callback: Callable handed to the agent as the single entry of
            its ``step_callbacks`` list.

    Returns:
        A ``CodeAgent`` driven by a Gemini model (through ``LiteLLMModel``)
        whose API key is read from the ``GEMINI_API_KEY`` environment
        variable, equipped with the web, exploration, drawing and retrieval
        tools imported by this module.
    """
    web_search = DuckDuckGoSearchTool()
    llm = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )

    # Tool order is kept stable: web search, page/file access, dataset
    # description, map drawing, then dataset retrieval.
    agent_tools = [
        web_search,
        visit_webpage,
        get_all_links,
        read_file_from_url,
        save_dataset_for_followup,
        get_dataset_description,
        plot_departments_data,
        search_datasets,
        get_dataset_info,
        get_random_quality_dataset,
    ]

    # Modules the agent-generated code is allowed to import.
    # NOTE(review): "os" is authorized here although the prompt produced by
    # generate_prompt tells the agent not to use it — confirm which is intended.
    allowed_imports = [
        "docx",
        "docx.*",
        "os",
        "bs4",
        "io",
        "requests",
        "json",
        "pandas",
        "matplotlib",
        "matplotlib.pyplot",
        "matplotlib.*",
        "numpy",
        "seaborn",
    ]

    return CodeAgent(
        tools=agent_tools,
        model=llm,
        max_steps=30,
        verbosity_level=1,  # low verbosity keeps the run output readable
        planning_interval=3,
        step_callbacks=[step_callback],  # progress is reported via the agent's callback hook
        additional_authorized_imports=allowed_imports,
    )
# Step-by-step workflow shared by every prompt variant. This is deliberately a
# plain (non-f) string: it contains literal braces in the data_dict examples.
# NOTE(review): the example call passes "Lyon", which is a city rather than a
# department name — confirm plot_departments_data accepts it.
_BASE_INSTRUCTIONS = """Follow these steps to analyze French public data:
1. **Dataset Selection**:
- You can use the search_datasets tool to find relevant datasets
- You can use get_dataset_info to get detailed information about specific datasets
- You can use get_random_quality_dataset to explore interesting datasets
2. **Dataset Analysis**:
- Examine the selected dataset page using visit_webpage
- Get all available data links using get_all_links
- Download and analyze the dataset using read_file_from_url
- Save the dataset for follow-up analysis using save_dataset_for_followup
- Get dataset description using get_dataset_description
3. **Visualization Creation**:
- **French Map Creation**: If you have data by French departments or regions, use the plot_departments_data tool:
* Call: plot_departments_data(data_dict, title, filename, color_scheme='viridis')
* data_dict format: {"department_name": value, "department_code": value}
* Supports both department names (e.g., "Paris", "Bouches-du-Rhône") and codes (e.g., "75", "13")
* Example: plot_departments_data({"Paris": 2161, "Lyon": 513}, "Population by Department", "population_map.png")
* The tool automatically saves the map as PNG in generated_data folder
- Create 3 additional non-map visualizations using matplotlib/seaborn
- Save all visualizations as PNG files in generated_data folder
4. **Report Generation**:
- Write insightful analysis text for each visualization
- Generate a comprehensive DOCX report using python-docx library that includes:
* Title page with dataset name and analysis overview
* All visualizations (PNG files) embedded in the report
* Analysis text for each visualization
* Conclusions and next steps
- Save the final DOCX report in the generated_data folder
**Important Technical Notes:**
- Save everything in the generated_data folder
- Do NOT use the 'os' module
- Work step by step, don't generate too much code at once
- Generate a complete DOCX report that can be downloaded by the user
- If question is in English, report is in English. If in French, report is in French.
"""

# Closing guidance appended verbatim to every prompt variant.
_CLOSING_GUIDANCE = """If user query is not specific, remain generic with respect to the dataset at hand.
Focus on getting results and analytics; do not go with too much data, we can always improve it later.
"""


def _has_content(value):
    """Return True when *value* is truthy and, if a string, not only whitespace."""
    if isinstance(value, str):
        return bool(value.strip())
    return bool(value)


def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis.

    Three variants are produced, all embedding the same workflow
    instructions and closing guidance:

    * query and search results given: the results are listed and the agent
      may pick one of them or search again;
    * query only: the agent is told to start with ``search_datasets``;
    * no query: the agent is told to find any interesting dataset.

    Args:
        user_query: Topic to analyze. A string is stripped of surrounding
            whitespace; an empty or whitespace-only query counts as absent,
            so the generic prompt is returned instead of one about ``" "``.
        initial_search_results: Pre-computed search results, interpolated
            unchanged into the prompt. Ignored when there is no query or
            when it is empty / whitespace-only.

    Returns:
        The prompt text as a string ending with a newline.
    """
    # Normalize the query so blank input cannot select a query-specific branch.
    if isinstance(user_query, str):
        user_query = user_query.strip()

    if user_query and _has_content(initial_search_results):
        return f"""I need you to analyze French public datasets related to: "{user_query}"
**INITIAL SEARCH RESULTS:**
{initial_search_results}
You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using get_dataset_info tool
{_BASE_INSTRUCTIONS}
Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
{_CLOSING_GUIDANCE}"""

    if user_query:
        return f"""I need you to find and analyze French public datasets related to: "{user_query}"
{_BASE_INSTRUCTIONS}
Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
{_CLOSING_GUIDANCE}"""

    return f"""I need you to find and analyze an interesting French public dataset.
{_BASE_INSTRUCTIONS}
Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
{_CLOSING_GUIDANCE}"""