File size: 5,838 Bytes
2508004
 
 
 
 
f584ef2
2508004
 
 
 
 
 
 
 
 
 
f584ef2
 
 
 
 
 
2508004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f584ef2
2508004
 
 
f584ef2
 
2508004
 
 
 
 
 
 
 
 
 
 
 
 
 
f584ef2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2508004
f584ef2
2508004
f584ef2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from tools.webpage_tools import (
    visit_webpage,
    get_all_links,
    read_file_from_url,
    save_dataset_for_followup,
)
from tools.exploration_tools import (
    get_dataset_description,
)
from tools.drawing_tools import (
    plot_departments_data,
)
from tools.libreoffice_tools import (
    convert_to_pdf_with_libreoffice,
    check_libreoffice_availability,
    get_libreoffice_info,
)
from tools.retrieval_tools import (
    search_datasets,
    get_dataset_info,
    get_random_quality_dataset,
)
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    LiteLLMModel,
)

def create_web_agent(step_callback):
    """Build the project's web-analysis CodeAgent.

    Args:
        step_callback: Callable invoked by the agent after each step
            (forwarded to CodeAgent's ``step_callbacks``).

    Returns:
        A configured ``smolagents.CodeAgent`` backed by a Gemini model
        accessed through LiteLLM (API key read from ``GEMINI_API_KEY``).
    """
    # Full toolbox: web search/scraping, dataset exploration, plotting,
    # LibreOffice conversion, and dataset retrieval helpers.
    agent_tools = [
        DuckDuckGoSearchTool(),
        visit_webpage,
        get_all_links,
        read_file_from_url,
        save_dataset_for_followup,
        get_dataset_description,
        plot_departments_data,
        convert_to_pdf_with_libreoffice,
        check_libreoffice_availability,
        get_libreoffice_info,
        search_datasets,
        get_dataset_info,
        get_random_quality_dataset,
    ]
    llm = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    # NOTE(review): the generated prompt tells the agent NOT to use the 'os'
    # module, yet "os" is authorized below — confirm this is intentional.
    return CodeAgent(
        tools=agent_tools,
        model=llm,
        max_steps=30,
        verbosity_level=1,  # keep the console output terse
        planning_interval=3,
        step_callbacks=[step_callback],  # smolagents' built-in callback hook
        additional_authorized_imports=[
            "subprocess", "docx", "docx.*",
            "os", "bs4", "io", "requests", "json", "pandas",
            "matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy",  "seaborn"
        ],
    )

def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis.

    Args:
        user_query: Optional topic the user wants analyzed; interpolated
            verbatim into the prompt text.
        initial_search_results: Optional pre-fetched search-results text to
            show the agent (only used when ``user_query`` is also given).

    Returns:
        A complete prompt string tailored to which arguments were provided:
        query + results, query only, or fully open-ended exploration.
    """

    base_instructions = """Follow these steps to analyze French public data:

    1. **Dataset Selection**: 
       - You can use the search_datasets tool to find relevant datasets
       - You can use get_dataset_info to get detailed information about specific datasets
       - You can use get_random_quality_dataset to explore interesting datasets
    
    2. **Dataset Analysis**:
       - Examine the selected dataset page using visit_webpage
       - Get all available data links using get_all_links
       - Download and analyze the dataset using read_file_from_url
       - Save the dataset for follow-up analysis using save_dataset_for_followup
       - Get dataset description using get_dataset_description
    
    3. **Visualization Creation**:
       - If geographic data (departments/regions) is available, create a map of France
       - Create 3 additional non-map visualizations
       - Save all visualizations as PNG files
    
    4. **Report Generation**:
       - Write insightful analysis text for each visualization
       - Generate a comprehensive PDF report using python-docx library that includes:
         * Title page with dataset name and analysis overview
         * All visualizations (PNG files) embedded in the report
         * Analysis text for each visualization
         * Conclusions and next steps
       - Convert the docx file to PDF using convert_to_pdf_with_libreoffice tool

    **Important Technical Notes:**
    - Save everything in the generated_data folder
    - Do NOT use the 'os' module
    - Work step by step, don't generate too much code at once
    - Before PDF conversion, call check_libreoffice_availability() - it returns True/False
    - If check_libreoffice_availability() returns True, use convert_to_pdf_with_libreoffice() tool
    - If check_libreoffice_availability() returns False, skip PDF conversion and inform user
    - Do NOT use subprocess calls directly for LibreOffice
    - If question is in English, report is in English. If in French, report is in French.
    """

    # Shared closing guidance — previously duplicated verbatim in all three
    # prompt variants, which risked the copies drifting apart.
    common_guidance = (
        "If user query is not specific, remain generic with respect to the dataset at hand.\n"
        "Focus on getting results and analytics; do not go with too much data, we can always improve it later.\n"
    )

    # Variant 1: query plus pre-fetched search results.
    if user_query and initial_search_results:
        return f"""I need you to analyze French public datasets related to: "{user_query}"

**INITIAL SEARCH RESULTS:**
{initial_search_results}

You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using get_dataset_info tool

{base_instructions}

Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
{common_guidance}"""

    # Variant 2: query only — the agent must search for datasets itself.
    if user_query:
        return f"""I need you to find and analyze French public datasets related to: "{user_query}"

{base_instructions}

Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
{common_guidance}"""

    # Variant 3: no query — open-ended exploration.
    return f"""I need you to find and analyze an interesting French public dataset.

{base_instructions}

Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
{common_guidance}"""