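"""Web agent setup for analyzing French public datasets.

Wires the webpage, exploration, drawing, and retrieval tools into a
smolagents CodeAgent and builds the unified analysis prompt that drives it.
"""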
import os

from tools.webpage_tools import (
    visit_webpage,
    get_all_links,
    read_file_from_url,
    save_dataset_for_followup,
)
from tools.exploration_tools import (
    get_dataset_description,
)
from tools.drawing_tools import (
    plot_departments_data,
)
from tools.retrieval_tools import (
    search_datasets,
    get_dataset_info,
    get_random_quality_dataset,
)
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    LiteLLMModel,
)
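
# NOTE: create_web_agent below reads the Gemini API key from the GEMINI_API_KEY
# environment variable; model calls will fail at runtime if it is unset.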

def create_web_agent(step_callback):
    search_tool = DuckDuckGoSearchTool()
    model = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    web_agent = CodeAgent(
        tools=[
            search_tool,
            visit_webpage, get_all_links, read_file_from_url, save_dataset_for_followup,
            get_dataset_description,
            plot_departments_data,
            search_datasets, get_dataset_info, get_random_quality_dataset,
        ],
        model=model,
        max_steps=30,
        verbosity_level=1,  # Reduced verbosity for cleaner output
        planning_interval=3,
        step_callbacks=[step_callback],  # Use the built-in callback system
        additional_authorized_imports=[
            "docx", "docx.*",
            "os", "bs4", "io", "requests", "json", "pandas",
            "matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy", "seaborn",
        ],
    )
    return web_agent
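
# A minimal sketch of the step callback that create_web_agent expects, assuming
# smolagents' single-argument callback convention; the logging format here is
# purely illustrative.
def example_step_callback(step):
    """Hypothetical callback: log each completed agent step."""
    step_number = getattr(step, "step_number", "?")
    print(f"[agent] completed step {step_number}")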

def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis."""
    base_instructions = """Follow these steps to analyze French public data:

1. **Dataset Selection**:
   - You can use the search_datasets tool to find relevant datasets
   - You can use get_dataset_info to get detailed information about specific datasets
   - You can use get_random_quality_dataset to explore interesting datasets

2. **Dataset Analysis**:
   - Examine the selected dataset page using visit_webpage
   - Get all available data links using get_all_links
   - Download and analyze the dataset using read_file_from_url
   - Save the dataset for follow-up analysis using save_dataset_for_followup
   - Get the dataset description using get_dataset_description

3. **Visualization Creation**:
   - **French Map Creation**: If you have data by French departments or regions, use the plot_departments_data tool:
     * Call: plot_departments_data(data_dict, title, filename, color_scheme='viridis')
     * data_dict format: {"department_name": value, "department_code": value}
     * Supports both department names (e.g., "Paris", "Bouches-du-Rhône") and codes (e.g., "75", "13")
     * Example: plot_departments_data({"Paris": 2161, "Rhône": 513}, "Population by Department", "population_map.png")
     * The tool automatically saves the map as a PNG in the generated_data folder
   - Create 3 additional non-map visualizations using matplotlib/seaborn
   - Save all visualizations as PNG files in the generated_data folder

4. **Report Generation**:
   - Write insightful analysis text for each visualization
   - Generate a comprehensive DOCX report using the python-docx library that includes:
     * Title page with dataset name and analysis overview
     * All visualizations (PNG files) embedded in the report
     * Analysis text for each visualization
     * Conclusions and next steps
   - Save the final DOCX report in the generated_data folder

**Important Technical Notes:**
- Save everything in the generated_data folder
- Do NOT use the 'os' module
- Work step by step; don't generate too much code at once
- Generate a complete DOCX report that can be downloaded by the user
- If the question is in English, write the report in English; if it is in French, write the report in French.
"""
    if user_query and initial_search_results:
        return f"""I need you to analyze French public datasets related to: "{user_query}"

**INITIAL SEARCH RESULTS:**
{initial_search_results}

You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using the get_dataset_info tool

{base_instructions}

Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
If the user query is not specific, keep the analysis generic with respect to the dataset at hand.
Focus on getting results and analytics first; do not try to process too much data at once, as we can always refine it later.
"""
    elif user_query:
        return f"""I need you to find and analyze French public datasets related to: "{user_query}"

{base_instructions}

Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
If the user query is not specific, keep the analysis generic with respect to the dataset at hand.
Focus on getting results and analytics first; do not try to process too much data at once, as we can always refine it later.
"""
    else:
        return f"""I need you to find and analyze an interesting French public dataset.

{base_instructions}

Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
If the user query is not specific, keep the analysis generic with respect to the dataset at hand.
Focus on getting results and analytics first; do not try to process too much data at once, as we can always refine it later.
"""