File size: 5,838 Bytes
2508004
 
 
 
 
f584ef2
2508004
 
 
 
 
 
 
 
 
 
f584ef2
 
 
 
 
 
2508004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f584ef2
2508004
 
 
f584ef2
 
2508004
 
 
 
 
 
 
 
 
 
 
 
 
 
f584ef2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2508004
f584ef2
2508004
f584ef2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from tools.webpage_tools import (
    visit_webpage,
    get_all_links,
    read_file_from_url,
    save_dataset_for_followup,
)
from tools.exploration_tools import (
    get_dataset_description,
)
from tools.drawing_tools import (
    plot_departments_data,
)
from tools.libreoffice_tools import (
    convert_to_pdf_with_libreoffice,
    check_libreoffice_availability,
    get_libreoffice_info,
)
from tools.retrieval_tools import (
    search_datasets,
    get_dataset_info,
    get_random_quality_dataset,
)
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    LiteLLMModel,
)

def create_web_agent(step_callback):
    """Build the project's web-analysis CodeAgent.

    Args:
        step_callback: Callable invoked by the agent after each step
            (forwarded to CodeAgent's ``step_callbacks``).

    Returns:
        A configured ``smolagents.CodeAgent`` backed by a Gemini model
        accessed through LiteLLM (API key read from ``GEMINI_API_KEY``).
    """
    # Full toolbox: web search/scraping, dataset exploration, plotting,
    # LibreOffice conversion, and dataset retrieval helpers.
    agent_tools = [
        DuckDuckGoSearchTool(),
        visit_webpage,
        get_all_links,
        read_file_from_url,
        save_dataset_for_followup,
        get_dataset_description,
        plot_departments_data,
        convert_to_pdf_with_libreoffice,
        check_libreoffice_availability,
        get_libreoffice_info,
        search_datasets,
        get_dataset_info,
        get_random_quality_dataset,
    ]
    llm = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    # NOTE(review): the generated prompt tells the agent NOT to use the 'os'
    # module, yet "os" is authorized below — confirm this is intentional.
    return CodeAgent(
        tools=agent_tools,
        model=llm,
        max_steps=30,
        verbosity_level=1,  # keep the console output terse
        planning_interval=3,
        step_callbacks=[step_callback],  # smolagents' built-in callback hook
        additional_authorized_imports=[
            "subprocess", "docx", "docx.*",
            "os", "bs4", "io", "requests", "json", "pandas",
            "matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy",  "seaborn"
        ],
    )

def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis.

    Args:
        user_query: Optional topic the user wants analyzed; interpolated
            verbatim into the prompt text.
        initial_search_results: Optional pre-fetched search-results text to
            show the agent (only used when ``user_query`` is also given).

    Returns:
        A complete prompt string tailored to which arguments were provided:
        query + results, query only, or fully open-ended exploration.
    """

    base_instructions = """Follow these steps to analyze French public data:

    1. **Dataset Selection**: 
       - You can use the search_datasets tool to find relevant datasets
       - You can use get_dataset_info to get detailed information about specific datasets
       - You can use get_random_quality_dataset to explore interesting datasets
    
    2. **Dataset Analysis**:
       - Examine the selected dataset page using visit_webpage
       - Get all available data links using get_all_links
       - Download and analyze the dataset using read_file_from_url
       - Save the dataset for follow-up analysis using save_dataset_for_followup
       - Get dataset description using get_dataset_description
    
    3. **Visualization Creation**:
       - If geographic data (departments/regions) is available, create a map of France
       - Create 3 additional non-map visualizations
       - Save all visualizations as PNG files
    
    4. **Report Generation**:
       - Write insightful analysis text for each visualization
       - Generate a comprehensive PDF report using python-docx library that includes:
         * Title page with dataset name and analysis overview
         * All visualizations (PNG files) embedded in the report
         * Analysis text for each visualization
         * Conclusions and next steps
       - Convert the docx file to PDF using convert_to_pdf_with_libreoffice tool

    **Important Technical Notes:**
    - Save everything in the generated_data folder
    - Do NOT use the 'os' module
    - Work step by step, don't generate too much code at once
    - Before PDF conversion, call check_libreoffice_availability() - it returns True/False
    - If check_libreoffice_availability() returns True, use convert_to_pdf_with_libreoffice() tool
    - If check_libreoffice_availability() returns False, skip PDF conversion and inform user
    - Do NOT use subprocess calls directly for LibreOffice
    - If question is in English, report is in English. If in French, report is in French.
    """

    # Shared closing guidance — previously duplicated verbatim in all three
    # prompt variants, which risked the copies drifting apart.
    common_guidance = (
        "If user query is not specific, remain generic with respect to the dataset at hand.\n"
        "Focus on getting results and analytics; do not go with too much data, we can always improve it later.\n"
    )

    # Variant 1: query plus pre-fetched search results.
    if user_query and initial_search_results:
        return f"""I need you to analyze French public datasets related to: "{user_query}"

**INITIAL SEARCH RESULTS:**
{initial_search_results}

You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using get_dataset_info tool

{base_instructions}

Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
{common_guidance}"""

    # Variant 2: query only — the agent must search for datasets itself.
    if user_query:
        return f"""I need you to find and analyze French public datasets related to: "{user_query}"

{base_instructions}

Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
{common_guidance}"""

    # Variant 3: no query — open-ended exploration.
    return f"""I need you to find and analyze an interesting French public dataset.

{base_instructions}

Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
{common_guidance}"""