"""Agent factory and prompt builder for analysing French public datasets.

``create_web_agent`` wires a smolagents ``CodeAgent`` to the project's
web, exploration, drawing and retrieval tools. ``generate_prompt`` builds
the task prompt handed to that agent.
"""

import os
from typing import Optional

from tools.webpage_tools import (
    visit_webpage,
    get_all_links,
    read_file_from_url,
    save_dataset_for_followup,
)
from tools.exploration_tools import (
    get_dataset_description,
)
from tools.drawing_tools import (
    plot_departments_data,
)
from tools.retrieval_tools import (
    search_datasets,
    get_dataset_info,
    get_random_quality_dataset,
)
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    LiteLLMModel,
)

# Model used when the caller does not pick one explicitly.
DEFAULT_MODEL_ID = "gemini/gemini-2.5-flash-preview-05-20"

# Step-by-step workflow shared by every prompt variant. This is a plain
# (non-f) string on purpose: it contains literal braces in the data_dict
# example that must not be treated as format fields.
_BASE_INSTRUCTIONS = """Follow these steps to analyze French public data:

1. **Dataset Selection**:
   - You can use the search_datasets tool to find relevant datasets
   - You can use get_dataset_info to get detailed information about specific datasets
   - You can use get_random_quality_dataset to explore interesting datasets

2. **Dataset Analysis**:
   - Examine the selected dataset page using visit_webpage
   - Get all available data links using get_all_links
   - Download and analyze the dataset using read_file_from_url
   - Save the dataset for follow-up analysis using save_dataset_for_followup
   - Get dataset description using get_dataset_description

3. **Visualization Creation**:
   - **French Map Creation**: If you have data by French departments or regions, use the plot_departments_data tool:
     * Call: plot_departments_data(data_dict, title, filename, color_scheme='viridis')
     * data_dict format: {"department_name": value, "department_code": value}
     * Supports both department names (e.g., "Paris", "Bouches-du-Rhône") and codes (e.g., "75", "13")
     * Example: plot_departments_data({"Paris": 2161, "Lyon": 513}, "Population by Department", "population_map.png")
     * The tool automatically saves the map as PNG in generated_data folder
   - Create 3 additional non-map visualizations using matplotlib/seaborn
   - Save all visualizations as PNG files in generated_data folder

4. **Report Generation**:
   - Write insightful analysis text for each visualization
   - Generate a comprehensive DOCX report using python-docx library that includes:
     * Title page with dataset name and analysis overview
     * All visualizations (PNG files) embedded in the report
     * Analysis text for each visualization
     * Conclusions and next steps
   - Save the final DOCX report in the generated_data folder

**Important Technical Notes:**
- Save everything in the generated_data folder
- Do NOT use the 'os' module
- Work step by step, don't generate too much code at once
- Generate a complete DOCX report that can be downloaded by the user
- If question is in English, report is in English. If in French, report is in French."""

# Closing guidance appended to every prompt variant; kept in one place so
# the three variants cannot drift apart.
_CLOSING_GUIDANCE = (
    "If user query is not specific, remain generic with respect to the "
    "dataset at hand.\n"
    "Focus on getting results and analytics; do not go with too much data, "
    "we can always improve it later."
)


def create_web_agent(step_callback, *, model_id=DEFAULT_MODEL_ID):
    """Build the CodeAgent used to search, analyse and report on datasets.

    Args:
        step_callback: Callable registered in the agent's ``step_callbacks``
            list.
        model_id: LiteLLM model identifier. Defaults to ``DEFAULT_MODEL_ID``.

    Returns:
        A configured ``CodeAgent`` instance.
    """
    search_tool = DuckDuckGoSearchTool()
    model = LiteLLMModel(
        model_id=model_id,
        # Read from the environment so the key never lives in the source.
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    web_agent = CodeAgent(
        tools=[
            search_tool,
            visit_webpage,
            get_all_links,
            read_file_from_url,
            save_dataset_for_followup,
            get_dataset_description,
            plot_departments_data,
            search_datasets,
            get_dataset_info,
            get_random_quality_dataset,
        ],
        model=model,
        max_steps=30,
        verbosity_level=1,  # Reduced verbosity for cleaner output
        planning_interval=3,
        step_callbacks=[step_callback],  # Use the built-in callback system
        # NOTE(review): "os" is authorized here although the prompt built by
        # generate_prompt tells the agent "Do NOT use the 'os' module".
        # Confirm which of the two is intended and align them.
        additional_authorized_imports=[
            "docx",
            "docx.*",
            "os",
            "bs4",
            "io",
            "requests",
            "json",
            "pandas",
            "matplotlib",
            "matplotlib.pyplot",
            "matplotlib.*",
            "numpy",
            "seaborn",
        ],
    )
    return web_agent


def generate_prompt(user_query: Optional[str] = None, initial_search_results=None) -> str:
    """Generate a unified prompt for dataset search and analysis.

    Three variants are produced depending on the inputs:

    * query and initial search results: the results are embedded and the
      agent is asked to pick among them or search further;
    * query only: the agent is told to start with ``search_datasets``;
    * no query: the agent is asked to find any interesting dataset.

    Args:
        user_query: Topic the analysis should focus on. ``None``, an empty
            string, or a whitespace-only string all count as "no query".
        initial_search_results: Pre-computed search results interpolated
            verbatim into the prompt. Only used when a query is present.

    Returns:
        The full prompt text.
    """
    # Treat a whitespace-only query as missing, so the prompt never asks for
    # datasets related to "   ".
    query = user_query.strip() if isinstance(user_query, str) else user_query

    if query and initial_search_results:
        intro = (
            f'I need you to analyze French public datasets related to: "{query}"\n'
            "\n"
            "**INITIAL SEARCH RESULTS:**\n"
            f"{initial_search_results}\n"
            "\n"
            "You have these options:\n"
            "1. **Use one of the datasets from the initial search results above** "
            "- select the most relevant one\n"
            "2. **Search for different datasets** using the search_datasets tool "
            "if none of the above seem perfect\n"
            "3. **Get more information** about any dataset using get_dataset_info tool"
        )
        focus = (
            f'Focus your analysis on insights related to "{query}".\n'
            "Choose the most relevant dataset and create meaningful "
            f'visualizations that answer questions about "{query}".'
        )
    elif query:
        intro = (
            "I need you to find and analyze French public datasets "
            f'related to: "{query}"'
        )
        focus = (
            "Start by using the search_datasets tool to find relevant datasets "
            f'related to "{query}".\n'
            f'Focus your analysis on insights related to "{query}".'
        )
    else:
        intro = "I need you to find and analyze an interesting French public dataset."
        focus = (
            "Start by using the search_datasets tool to find interesting "
            "datasets, or use get_random_quality_dataset to explore a "
            "high-quality dataset."
        )

    closing = f"{focus}\n{_CLOSING_GUIDANCE}\n"
    return "\n\n".join((intro, _BASE_INSTRUCTIONS, closing))