Commit f584ef2
Parent(s): 2508004

all modifs

Files changed:
- README.md                   +124 -21
- agent.py                    +87 -37
- app.py                      +251 -66
- followup_agent.py           +119 -0
- tools/followup_tools.py     +515 -0
- tools/libreoffice_tools.py  +14 -3
- tools/retrieval_tools.py    +277 -0
- tools/webpage_tools.py      +26 -0
README.md
CHANGED
@@ -8,44 +8,58 @@ sdk_version: 5.33.0
 app_file: app.py
 pinned: false
 license: mit
-short_description: Agents for data analysis of French public data.
+short_description: AI-powered agents for comprehensive analysis of French public data with follow-up capabilities.
 ---
 
 # π€ French Public Data Analysis Agent
 
-**AI-powered intelligent analysis of French
+**AI-powered intelligent analysis of French public datasets** with automated visualization generation, comprehensive PDF reports, and **interactive follow-up analysis capabilities**.
 
 ## β¨ Features
 
 ### π **Intelligent Dataset Discovery**
 - **BM25 Keyword Search**: Advanced keyword matching with pre-computed search indices
 - **Bilingual Query Translation**: Search in French or English - queries are automatically translated using LLM
 - **Quality-Weighted Random Selection**: Leave query empty to randomly select high-quality datasets
 - **Real-time Dataset Matching**: Instant matching against 5,000+ French government datasets
+- **Dynamic Dataset Search**: Agent can search for alternative datasets if initial results aren't suitable
 
 ### π€ **Automated AI Analysis**
 - **SmolAgents Integration**: Advanced AI agent with 30+ step planning capability
 - **Custom Tool Suite**: Specialized tools for web scraping, data analysis, and visualization
 - **Multi-step Processing**: Complete pipeline from data discovery to report generation
 - **Error Recovery**: Smart error handling and alternative data source selection
+- **Autonomous Decision Making**: Agent can choose from provided results or find better alternatives
+
+### π― **Interactive Follow-up Analysis** β NEW
+- **Dedicated Follow-up Agent**: Specialized AI for answering questions about generated reports
+- **Dataset Continuity**: Automatically loads and analyzes the same dataset from previous report
+- **Advanced Analytics**: Correlation analysis, statistical summaries, custom filtering
+- **Interactive Visualizations**: Create new charts and graphs based on follow-up questions
+- **Multiple Analysis Types**: Support for bar charts, scatter plots, histograms, box plots, and more
+- **Example-Driven Interface**: Quick-start examples for common follow-up questions
 
 ### π **Advanced Visualizations**
 - **France Geographic Maps**: Department and region-level choropleth maps
-- **Multiple Chart Types**: Bar charts, line plots, scatter plots, heatmaps
+- **Multiple Chart Types**: Bar charts, line plots, scatter plots, heatmaps, histograms, box plots
 - **Smart Visualization Selection**: AI automatically chooses appropriate chart types
 - **High-Quality PNG Output**: Publication-ready visualizations
+- **Follow-up Visualizations**: Generate additional charts based on user questions
 
 ### π **Comprehensive Reports**
 - **Professional PDF Reports**: Complete analysis with embedded visualizations
 - **Bilingual Support**: Reports generated in the same language as your query
 - **Structured Analysis**: Title page, methodology, findings, and next steps
 - **LibreOffice Integration**: Cross-platform PDF generation
+- **Report Continuity**: Follow-up analysis references previous report context
 
 ### π¨ **Modern Web Interface**
 - **Real-time Progress Tracking**: Detailed step-by-step progress updates
 - **Responsive Design**: Beautiful, modern Gradio interface
 - **Quick Start Examples**: Pre-built queries for common use cases
 - **Accordion Tips**: Collapsible help section with usage instructions
+- **Follow-up Interface**: Dedicated section for asking follow-up questions
+- **Visual Feedback**: Progress bars and status indicators
 
 ## π Quick Start
 
@@ -60,7 +74,7 @@ short_description: Agents for data analysis of French public data.
 ```bash
 # Clone the repository
 git clone <repository-url>
-cd
+cd datagouv-french-data-analyst
 
 # Install dependencies
 pip install -r requirements.txt
@@ -76,8 +90,14 @@ GEMINI_API_KEY=your_gemini_api_key_here
 
 ### 4. Launch the Application
 
+**Option 1: Using the launch script (Recommended)**
+```bash
+python launch_gradio.py
+```
+
+**Option 2: Direct launch**
 ```bash
-python
+python app.py
 ```
 
 The interface will be available at:
@@ -86,26 +106,43 @@ The interface will be available at:
 
 ## π‘ How to Use
 
-### Basic
+### Basic Analysis Workflow
 
 1. **Enter Your Query**: Type any search term related to French public data
    - Examples: "road traffic accidents", "education directory", "housing data"
    - Supports both French and English queries
 
 2. **Or Use Quick Examples**: Click any of the pre-built example queries:
-   - π Road Traffic Accidents
+   - π Road Traffic Accidents 2023
    - π Education Directory
    - π French Vacant Housing Private Park
 
 3. **Or Go Random**: Leave the query empty to randomly select a high-quality dataset
 
-4. **Click "π Analyze Dataset"**: The AI agent begins processing
+4. **Click "π Analyze Dataset"**: The AI agent begins processing (7-15 minutes)
+
+### Follow-up Analysis Workflow
+
+After the initial analysis is complete:
+
+1. **Follow-up Section Appears**: Located below the generated visualizations
+2. **Ask Follow-up Questions**: Use the dedicated input field to ask questions about the report
+3. **Use Example Questions**: Click pre-built examples like:
+   - π Correlation Analysis
+   - π Statistical Summary
+   - π― Filter & Analyze
+   - π Dataset Overview
+   - π Trend Analysis
+   - π Custom Visualization
+
+4. **Get Detailed Answers**: Receive both text explanations and new visualizations
 
 ### Results
 
 - **Download PDF Report**: Complete analysis with all visualizations
 - **View Individual Charts**: Up to 4 visualizations displayed in the interface
 - **Dataset Reference**: Direct link to the original data.gouv.fr page
+- **Follow-up Visualizations**: Additional charts generated from follow-up questions
 
 ## π οΈ Technical Architecture
 
@@ -113,34 +150,55 @@ The interface will be available at:
 
 ```
 π Project Structure
-├── app.py
-├──
-├──
-
-
-│   ├──
-│
-└──
-
+├── app.py                    # Main Gradio interface with progress tracking
+├── launch_gradio.py          # Simplified launch script
+├── agent.py                  # SmolAgents configuration and prompt generation
+├── followup_agent.py         # Follow-up analysis agent
+├── tools/                    # Custom agent tools
+│   ├── webpage_tools.py      # Web scraping and data extraction
+│   ├── exploration_tools.py  # Dataset analysis and description
+│   ├── drawing_tools.py      # France map generation and visualization
+│   ├── libreoffice_tools.py  # PDF conversion utilities
+│   ├── followup_tools.py     # Follow-up analysis tools
+│   └── retrieval_tools.py    # Dataset search and retrieval
+├── filtered_dataset.csv      # Pre-processed dataset index (5,000+ datasets)
+├── france_data/              # Geographic data for France maps
+└── generated_data/           # Output folder for reports and visualizations
 ```
 
 ### Key Technologies
 
 - **Frontend**: Gradio with custom CSS and real-time progress
-- **AI
+- **AI Agents**:
+  - Primary SmolAgents powered by Gemini 2.5 Flash
+  - Specialized follow-up agent for interactive analysis β
 - **Search**: BM25 keyword matching with TF-IDF preprocessing
 - **Translation**: LLM-powered bilingual query translation
 - **Visualization**: Matplotlib, Geopandas, Seaborn
 - **PDF Generation**: python-docx + LibreOffice conversion
-- **Data Processing**: Pandas, NumPy, Shapely
+- **Data Processing**: Pandas, NumPy, Shapely, Scipy
+- **Follow-up Analytics**: Statistical analysis, correlation studies, custom filtering β
 
 ### Smart Features
 
-#### BM25 Search
+#### Enhanced BM25 Search
 - Pre-computed search indices for 5,000+ datasets
 - Accent-insensitive keyword matching
 - Plural form normalization
 - Quality-score weighted ranking
+- Dynamic dataset retrieval during analysis β
+
+#### Follow-up Analysis System
+- **Dataset Continuity**: Automatically loads previous analysis dataset
+- **Context Awareness**: References previous report findings
+- **Multi-modal Analysis**: Combines statistical analysis with visualizations
+- **Tool Integration**: 8+ specialized follow-up tools including:
+  - `load_previous_dataset()` - Load analysis dataset
+  - `get_dataset_summary()` - Comprehensive dataset overview
+  - `create_followup_visualization()` - Generate custom charts
+  - `analyze_column_correlation()` - Statistical correlation analysis
+  - `create_statistical_summary()` - Advanced statistical reports
+  - `filter_and_visualize_data()` - Targeted data filtering and visualization
 
 #### LLM Translation
 - Automatic French β English translation
@@ -161,6 +219,7 @@ The interface will be available at:
 1. **"No CSV/JSON files found"**
    - The selected dataset doesn't contain processable files
    - Try a different query or use the random selection
+   - Agent will automatically search for alternative datasets
 
 2. **LibreOffice PDF conversion fails**
    - Ensure LibreOffice is installed and accessible
@@ -174,11 +233,17 @@ The interface will be available at:
    - BM25 index computation may take time on first run
    - Pre-computed indices are cached for faster subsequent searches
 
+5. **Follow-up analysis errors**
+   - Ensure the initial analysis completed successfully
+   - Check that dataset files exist in `generated_data/` folder
+   - Verify follow-up question is clear and specific
+
 ### Performance Optimization
 
 - **Pre-compute BM25**: Run the search once to generate `bm25_data.pkl`
 - **Use SSD storage**: Faster file I/O for large datasets
 - **Monitor API usage**: API calls for translation and agent execution
+- **Clean generated_data**: Remove old files to improve follow-up performance
 
 ## π Dataset Coverage
 
@@ -187,9 +252,32 @@ The interface will be available at:
 - **File Formats**: CSV, JSON, Excel, XML
 - **Topics**: All major sectors of French public administration
 - **Quality Scores**: Datasets ranked by completeness and usability
+- **Real-time Search**: Agent can discover additional datasets during analysis
 
 ## π Advanced Usage
 
+### Follow-up Analysis Examples
+
+**Correlation Analysis:**
+```
+Show me the correlation between two numerical columns with a scatter plot
+```
+
+**Statistical Summary:**
+```
+Create a comprehensive statistical summary with visualization for unemployment rates
+```
+
+**Custom Filtering:**
+```
+Filter accidents data by night time conditions and create a visualization
+```
+
+**Trend Analysis:**
+```
+Create a line chart showing accident trends over the months
+```
+
 ### Custom Tool Development
 Add new tools to the `tools/` directory following the SmolAgents tool pattern.
 
@@ -203,6 +291,19 @@ python -c "from app import initialize_models; initialize_models()"
 ### Batch Processing
 Process multiple datasets programmatically using the agent directly.
 
+## π Dependencies
+
+The project requires the following Python packages (see `requirements.txt`):
+
+```
+pandas, shapely, geopandas, numpy, rtree, pyproj
+matplotlib, requests, duckduckgo-search
+smolagents[toolkit], smolagents[litellm]
+dotenv, beautifulsoup4, reportlab>=3.6.0
+scikit-learn, gradio, pypdf2, python-docx
+scipy, openpyxl, unidecode, rank_bm25
+```
+
 ## π License
 
 This project is developed for the Gradio MCP x Agents Hackathon. See individual tool licenses for third-party components.
@@ -217,3 +318,5 @@ This project is developed for the Gradio MCP x Agents Hackathon. See individual
 ---
 
 **π Ready to explore French public data with AI? Launch the interface and start analyzing!**
+
+**π₯ NEW: Try the follow-up analysis feature to dive deeper into your reports!**
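The "Enhanced BM25 Search" bullets above (pre-computed indices, accent-insensitive matching, quality-weighted ranking) map onto a small amount of `rank_bm25` code. A minimal sketch, assuming columns named `title` and `quality_score` in `filtered_dataset.csv`; the tokenizer and weighting scheme here are illustrative, not this repo's exact index code:

```python
# Sketch of quality-weighted BM25 search over the dataset index.
# Assumes columns 'title' and 'quality_score'; tokenization is illustrative.
import pandas as pd
from rank_bm25 import BM25Okapi
from unidecode import unidecode

def build_bm25_index(df: pd.DataFrame) -> BM25Okapi:
    # Accent-insensitive, lowercased whitespace tokens
    corpus = [unidecode(str(t)).lower().split() for t in df["title"].fillna("")]
    return BM25Okapi(corpus)

def search_datasets_sketch(df: pd.DataFrame, bm25: BM25Okapi,
                           query: str, top_k: int = 5) -> pd.DataFrame:
    scores = bm25.get_scores(unidecode(query).lower().split())
    # Blend keyword relevance with the dataset quality score
    weighted = scores * (1.0 + df["quality_score"].fillna(0.0).to_numpy())
    return df.assign(score=weighted).nlargest(top_k, "score")
```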
agent.py
CHANGED
@@ -3,6 +3,7 @@ from tools.webpage_tools import (
     visit_webpage,
     get_all_links,
     read_file_from_url,
+    save_dataset_for_followup,
 )
 from tools.exploration_tools import (
     get_dataset_description,
@@ -13,6 +14,12 @@ from tools.drawing_tools import (
 from tools.libreoffice_tools import (
     convert_to_pdf_with_libreoffice,
     check_libreoffice_availability,
+    get_libreoffice_info,
+)
+from tools.retrieval_tools import (
+    search_datasets,
+    get_dataset_info,
+    get_random_quality_dataset,
 )
 from smolagents import (
     CodeAgent,
@@ -29,11 +36,12 @@ def create_web_agent(step_callback):
     web_agent = CodeAgent(
         tools=[
             search_tool,
-            visit_webpage, get_all_links, read_file_from_url,
+            visit_webpage, get_all_links, read_file_from_url, save_dataset_for_followup,
             get_dataset_description,
             plot_departments_data,
             convert_to_pdf_with_libreoffice,
-            check_libreoffice_availability
+            check_libreoffice_availability, get_libreoffice_info,
+            search_datasets, get_dataset_info, get_random_quality_dataset
         ],
         model=model,
         max_steps=30,
@@ -48,40 +56,82 @@ def create_web_agent(step_callback):
     )
     return web_agent
 
-def generate_prompt(
-
-
-
-
-    1. Examine the page
-    2. Get all links
-    3. Get the dataset from the link
-    4. Get information about the dataset using the get_dataset_description tool
-    5. Decide on what you can draw based on either department or region data
-    5.1 if no data department or region level, look for another file!
-    6. Draw a map of France using your idea
-    7. Save the map in png file
-    8. Make as well 3 additional visualizations, not maps, that you can save in png files
-    9. Write an interesting analysis text for each of your visualizations. Be smart and think cleverly about the data and what it can state
-    10. Think of next step analysis to look at the data
-    11. Generate a comprehensive PDF report using the python-docx library that includes:
-        - A title page with the dataset name and analysis overview
-        - All your visualizations (PNG files) embedded in the report
-        - Your analysis text for each visualization
-        - Conclusions and next steps
-    Make the visualizations appropriately sized so they fit well in the PDF report.
-    Convert then that docx file to pdf using the convert_to_pdf_with_libreoffice tool.
-
-
-
-
-
-    IMPORTANT LIBREOFFICE NOTES:
-    - If you need to use LibreOffice, first call check_libreoffice_availability() to verify it's available
-    - If LibreOffice is available, "LibreOffice found" is returned by "check_libreoffice_availability()"
-    - Use convert_to_pdf_with_libreoffice() tool instead of subprocess calls
-    - Do NOT use subprocess.run(['libreoffice', ...]) or subprocess.run(['soffice', ...]) directly
-    - The LibreOffice tools handle macOS, Linux, and Windows path differences automatically
-    """
+def generate_prompt(user_query=None, initial_search_results=None):
+    """Generate a unified prompt for dataset search and analysis"""
+
+    base_instructions = """Follow these steps to analyze French public data:
+
+    1. **Dataset Selection**:
+       - You can use the search_datasets tool to find relevant datasets
+       - You can use get_dataset_info to get detailed information about specific datasets
+       - You can use get_random_quality_dataset to explore interesting datasets
+
+    2. **Dataset Analysis**:
+       - Examine the selected dataset page using visit_webpage
+       - Get all available data links using get_all_links
+       - Download and analyze the dataset using read_file_from_url
+       - Save the dataset for follow-up analysis using save_dataset_for_followup
+       - Get dataset description using get_dataset_description
+
+    3. **Visualization Creation**:
+       - If geographic data (departments/regions) is available, create a map of France
+       - Create 3 additional non-map visualizations
+       - Save all visualizations as PNG files
+
+    4. **Report Generation**:
+       - Write insightful analysis text for each visualization
+       - Generate a comprehensive PDF report using python-docx library that includes:
+         * Title page with dataset name and analysis overview
+         * All visualizations (PNG files) embedded in the report
+         * Analysis text for each visualization
+         * Conclusions and next steps
+       - Convert the docx file to PDF using convert_to_pdf_with_libreoffice tool
+
+    **Important Technical Notes:**
+    - Save everything in the generated_data folder
+    - Do NOT use the 'os' module
+    - Work step by step, don't generate too much code at once
+    - Before PDF conversion, call check_libreoffice_availability() - it returns True/False
+    - If check_libreoffice_availability() returns True, use convert_to_pdf_with_libreoffice() tool
+    - If check_libreoffice_availability() returns False, skip PDF conversion and inform user
+    - Do NOT use subprocess calls directly for LibreOffice
+    - If question is in English, report is in English. If in French, report is in French.
+    """
+
+    if user_query and initial_search_results:
+        return f"""I need you to analyze French public datasets related to: "{user_query}"
+
+**INITIAL SEARCH RESULTS:**
+{initial_search_results}
+
+You have these options:
+1. **Use one of the datasets from the initial search results above** - select the most relevant one
+2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
+3. **Get more information** about any dataset using get_dataset_info tool
+
+{base_instructions}
+
+Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
+If user query is not specific, remain generic with respect to the dataset at hand.
+Focus on getting results and analytics; do not go with too much data, we can always improve it later.
+"""
+
+    elif user_query:
+        return f"""I need you to find and analyze French public datasets related to: "{user_query}"
+
+{base_instructions}
+
+Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
+If user query is not specific, remain generic with respect to the dataset at hand.
+Focus on getting results and analytics; do not go with too much data, we can always improve it later.
+"""
+
+    else:
+        return f"""I need you to find and analyze an interesting French public dataset.
+
+{base_instructions}
+
+Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
+If user query is not specific, remain generic with respect to the dataset at hand.
+Focus on getting results and analytics; do not go with too much data, we can always improve it later.
+"""
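The README's "Batch Processing" pointer ("Process multiple datasets programmatically using the agent directly") follows directly from the two functions above. A hedged sketch of a standalone driver; the no-op step callback and the query list are assumptions, and each run can take several minutes and consume Gemini API quota:

```python
# Hypothetical batch driver built on agent.py (sketch only; the callback
# and queries are illustrative, not part of this commit).
from agent import create_web_agent, generate_prompt

def quiet_step_callback(step):
    pass  # the Gradio app uses this hook to push progress updates

queries = ["road traffic accidents 2023", "education directory"]
web_agent = create_web_agent(quiet_step_callback)
for q in queries:
    # Without initial_search_results, the prompt tells the agent to call
    # search_datasets itself (the elif branch of generate_prompt above).
    answer = web_agent.run(generate_prompt(user_query=q))
    print(f"{q}: {answer}")
```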
app.py
CHANGED
@@ -7,10 +7,10 @@ import time
 import queue
 import numpy as np
 from rank_bm25 import BM25Okapi
-import re
 from dotenv import load_dotenv
 from smolagents import CodeAgent, LiteLLMModel
 from agent import create_web_agent, generate_prompt
+from followup_agent import run_followup_analysis
 from unidecode import unidecode
 
 load_dotenv()
@@ -302,30 +302,9 @@ def run_agent_analysis_with_progress(query, progress_callback, df=None, page_url
 
 def search_and_analyze(query, progress=gr.Progress()):
     """
-
+    Unified function that does initial search then lets agent analyze with full autonomy.
     Uses Gradio's progress bar for visual feedback.
     """
-    # Read the filtered dataset first
-    df = pd.read_csv('filtered_dataset.csv')
-
-    # If no query provided, randomly select one weighted by quality score
-    if not query.strip():
-        progress(0, desc="π² No query provided - selecting random high-quality dataset...")
-
-        # Use quality_score as weights for random selection
-        if 'quality_score' in df.columns:
-            # Ensure quality scores are positive for weighting
-            weights = df['quality_score'].fillna(0)
-            weights = weights - weights.min() + 0.1  # Shift to make all positive
-        else:
-            weights = None
-
-        # Randomly sample one dataset weighted by quality
-        selected_row = df.sample(n=1, weights=weights).iloc[0]
-        query = selected_row['title']
-
-        progress(0.02, f"π― Random selection: {query[:60]}...")
-
     # Clear the progress queue
     while not progress_queue.empty():
         try:
@@ -336,10 +315,10 @@ def search_and_analyze(query, progress=gr.Progress()):
     # Initialize outputs
     pdf_file = None
     images_output = [gr.Image(visible=False)] * 4
-    status = "π Starting analysis..."
+    status = "π Starting agent-driven analysis..."
 
     # Initial progress
-    progress(0.05, desc="π Initializing...")
+    progress(0.05, desc="π Initializing agent...")
 
     def progress_callback(progress_val, description):
         """Callback function to update progress - puts updates in queue"""
@@ -351,40 +330,76 @@ def search_and_analyze(query, progress=gr.Progress()):
     # Run analysis in a separate thread
     result_queue = queue.Queue()
 
-    # Store the page URL to show immediately (kept for compatibility)
-    page_url_to_show = None
-
-    def page_url_callback(url):
-        nonlocal page_url_to_show
-        page_url_to_show = url
-
-    # Find and show the page URL immediately FIRST
-    initialize_models()
-    progress(0.06, desc="π Finding relevant dataset...")
-    most_similar_idx, similarity_score, translated_query, original_lang = find_similar_dataset_bm25(query, df)
-    data_gouv_page = df.iloc[most_similar_idx]['url']
-    dataset_title = df.iloc[most_similar_idx]['title']
-
-    progress(0.07, desc=f"π Found dataset: {dataset_title[:50]}...")
-
-    # Now start the analysis thread with the found dataset info
     def run_analysis():
         try:
-            #
-
+            # Clean up previous results
+            if os.path.exists('generated_data'):
+                for file in glob.glob('generated_data/*'):
+                    try:
+                        os.remove(file)
+                    except:
+                        pass
+            else:
+                os.makedirs('generated_data', exist_ok=True)
+
+            # Do initial search if query provided
+            initial_search_results = None
+            if query.strip():
+                progress_callback(0.06, f"π Initial search for: {query[:50]}...")
+                try:
+                    # Import search function from tools
+                    from tools.retrieval_tools import search_datasets
+                    initial_search_results = search_datasets(query, top_k=5)
+                    progress_callback(0.08, "π€ Starting agent with search results...")
+                except Exception as e:
+                    print(f"Initial search failed: {e}")
+                    progress_callback(0.08, "π€ Starting agent without initial results...")
+            else:
+                progress_callback(0.08, "π€ Starting agent for random selection...")
+
+            step_callback = create_progress_callback()
+
+            # Create the agent with progress callback
+            web_agent = create_web_agent(step_callback)
+
+            # Generate unified prompt with initial search results
+            prompt = generate_prompt(user_query=query, initial_search_results=initial_search_results)
+            progress_callback(0.1, "π€ Agent analyzing datasets...")
+
+            # Run the agent - the step_callbacks will automatically update progress
+            answer = web_agent.run(prompt)
+
+            # Check if the agent found no processable data
+            answer_lower = str(answer).lower() if answer else ""
+            if ("no processable data" in answer_lower or
+                "no csv nor json" in answer_lower or
+                "cannot find csv" in answer_lower or
+                "cannot find json" in answer_lower or
+                "no data to process" in answer_lower):
+                progress_callback(1.0, "β No CSV/JSON files found in the dataset")
+                result_queue.put(("β No CSV/JSON files found in the selected dataset. This dataset cannot be processed automatically.", [], None))
+                return
+
+            # Check if files were generated
+            generated_files = glob.glob('generated_data/*')
+
+            if generated_files:
+                progress_callback(1.0, "β Analysis completed successfully!")
+                result_queue.put(("Analysis completed successfully!", generated_files, "Agent-selected dataset"))
+            else:
+                progress_callback(1.0, "β οΈ Analysis completed but no files were generated.")
+                result_queue.put(("Analysis completed but no files were generated.", [], None))
+
         except Exception as e:
-
+            progress_callback(1.0, f"β Error: {str(e)}")
+            result_queue.put((f"Error during analysis: {str(e)}", [], None))
 
     analysis_thread = threading.Thread(target=run_analysis)
     analysis_thread.start()
 
-    # Show
-
-    # Initial update to show the page URL immediately
-    progress(0.08, desc="π Page found - starting analysis...")
+    # Show initial status
+    current_status = "π€ Agent is finding relevant datasets..."
+    progress(0.08, desc=current_status)
 
     # Monitor progress while analysis runs
     last_progress = 0.08
@@ -408,11 +423,18 @@ def search_and_analyze(query, progress=gr.Progress()):
             # Check if this is a "no data" case
             if "β No CSV/JSON files found" in final_status:
                 progress(1.0, desc="β No processable data found")
-                return (gr.Textbox(value=
+                return (gr.Textbox(value="Agent-selected dataset", visible=True),
                         final_status,
                         gr.File(visible=False),
                         gr.Image(visible=False), gr.Image(visible=False),
-                        gr.Image(visible=False), gr.Image(visible=False)
+                        gr.Image(visible=False), gr.Image(visible=False),
+                        gr.Markdown(visible=False),  # keep follow-up hidden
+                        gr.HTML(visible=False),
+                        gr.Row(visible=False),
+                        gr.Row(visible=False),
+                        gr.Row(visible=False),
+                        gr.Row(visible=False),
+                        gr.Row(visible=False))
 
             # Final progress update
             progress(1.0, desc="β Processing results...")
@@ -441,7 +463,16 @@ def search_and_analyze(query, progress=gr.Progress()):
             # final progress completion
             progress(1.0, desc="π Complete!")
 
-
+            # Show follow-up section after successful completion
+            return (gr.Textbox(value=page_url if page_url else "Agent-selected dataset", visible=True),
+                    final_status, download_button, *images,
+                    gr.Markdown(visible=True),  # followup_section_divider
+                    gr.HTML(visible=True),      # followup_section_header
+                    gr.Row(visible=True),       # followup_input_row
+                    gr.Row(visible=True),       # followup_result_row
+                    gr.Row(visible=True),       # followup_image_row
+                    gr.Row(visible=True),       # followup_examples_header_row
+                    gr.Row(visible=True))       # followup_examples_row
 
         except queue.Empty:
             pass
@@ -450,14 +481,86 @@ def search_and_analyze(query, progress=gr.Progress()):
 
     except Exception as e:
         progress(1.0, desc=f"β Error: {str(e)}")
-        return gr.Textbox(value=
+        return (gr.Textbox(value="Error", visible=True), f"β Error: {str(e)}", None, *images_output,
+                gr.Markdown(visible=False),  # keep follow-up hidden on error
+                gr.HTML(visible=False),
+                gr.Row(visible=False),
+                gr.Row(visible=False),
+                gr.Row(visible=False),
+                gr.Row(visible=False),
+                gr.Row(visible=False))
 
     # Ensure thread completes
    analysis_thread.join(timeout=1)
 
     # Fallback return
     progress(1.0, desc="π Finished")
-    return gr.Textbox(value=
+    return (gr.Textbox(value="Completed", visible=True), current_status, pdf_file, *images_output,
+            gr.Markdown(visible=False),  # keep follow-up hidden
+            gr.HTML(visible=False),
+            gr.Row(visible=False),
+            gr.Row(visible=False),
+            gr.Row(visible=False),
+            gr.Row(visible=False),
+            gr.Row(visible=False))
+
+def run_followup_question(question, progress=gr.Progress()):
+    """
+    Run a follow-up analysis based on user's question about the previous report.
+    """
+    if not question.strip():
+        return "Please enter a follow-up question.", gr.Image(visible=False)
+
+    progress(0.1, desc="π€ Starting follow-up analysis...")
+
+    try:
+        # Check if there are previous results
+        if not os.path.exists('generated_data') or not os.listdir('generated_data'):
+            return "No previous analysis found. Please run an analysis first.", gr.Image(visible=False)
+
+        progress(0.3, desc="π Analyzing previous report and dataset...")
+
+        # Run the follow-up analysis
+        result = run_followup_analysis(question)
+
+        progress(0.9, desc="π Processing results...")
+
+        # Look for new visualizations created by the follow-up analysis
+        import glob
+
+        # Get all images that were created after the analysis started
+        all_images = glob.glob('generated_data/*.png')
+
+        # Get recent images (created in the last few seconds)
+        import time
+        current_time = time.time()
+        recent_images = []
+
+        for img_path in all_images:
+            img_time = os.path.getctime(img_path)
+            if current_time - img_time < 120:  # Images created in last 2 minutes
+                recent_images.append(img_path)
+
+        # Get the most recent image if any
+        latest_image = None
+        if recent_images:
+            latest_image = max(recent_images, key=os.path.getctime)
+
+        progress(1.0, desc="β Follow-up analysis complete!")
+
+        # Enhanced result formatting
+        final_result = result
+        if latest_image:
+            final_result += f"\n\nπ **Visualization Created:** {os.path.basename(latest_image)}"
+            if len(recent_images) > 1:
+                final_result += f"\nπ **Total new visualizations:** {len(recent_images)}"
+            return final_result, gr.Image(value=latest_image, visible=True)
+        else:
+            return final_result, gr.Image(visible=False)
+
+    except Exception as e:
+        progress(1.0, desc="β Error in follow-up analysis")
+        return f"Error: {str(e)}", gr.Image(visible=False)
 
 # Create the Gradio interface
 with gr.Blocks(title="π€ French Public Data Analysis Agent", theme=gr.themes.Soft(), css="""
@@ -516,7 +619,10 @@ with gr.Blocks(title="π€ French Public Data Analysis Agent", theme=gr.themes.S
     gr.HTML("""
     <div style="text-align: center; background: #f8fafc; padding: 1.5rem; border-radius: 10px; margin: 1rem 0;">
         <p style="font-size: 1.1rem; color: #374151; margin: 0;">
-            π <strong>Search in French or English</strong> β’ πΊοΈ <strong>Generate Reports with visualizations
+            π <strong>Search in French or English</strong> β’ π€ <strong>AI Agent finds & analyzes datasets</strong> β’ πΊοΈ <strong>Generate Reports with visualizations</strong>
+        </p>
+        <p style="font-size: 0.9rem; color: #6b7280; margin-top: 0.5rem;">
+            Initial search results guide the agent, but it can search for different datasets if needed
         </p>
     </div>
    """)
@@ -527,18 +633,21 @@ with gr.Blocks(title="π€ French Public Data Analysis Agent", theme=gr.themes.S
         with gr.Column():
             gr.Markdown("""
             π― **How to Use:**
-            - Enter
-            - Leave empty
+            - Enter search terms related to French public data
+            - Leave empty for random high-quality dataset selection
+            - System provides initial search results to guide the agent
+            - Agent can use provided results or search for different datasets
             - Results include visualizations and downloadable reports
 
             β±οΈ **Processing Time:**
-            -
-            -
+            - Analysis takes 7-15 minutes depending on dataset complexity
+            - Agent has full autonomy to find the best datasets
             """)
         with gr.Column():
             gr.Markdown("""
             β οΈ **Important Notes:**
-            -
+            - Agent gets initial search results but has full autonomy to make decisions
+            - Agent can choose from initial results or search for different datasets
             - Some datasets may not contain processable CSV/JSON files
             - All visualizations are automatically generated
             - Maps focus on France when geographic data is available
@@ -571,7 +680,7 @@ with gr.Blocks(title="π€ French Public Data Analysis Agent", theme=gr.themes.S
 
     with gr.Row():
         examples = [
-            ("π Road Traffic Accidents
+            ("π Road Traffic Accidents 2023", "road traffic accidents 2023"),
             ("π Education Directory", "education directory"),
             ("π French Vacant Housing Private Park", "French vacant housing private park"),
         ]
@@ -615,14 +724,90 @@ with gr.Blocks(title="π€ French Public Data Analysis Agent", theme=gr.themes.S
     image3 = gr.Image(label="πΊοΈ Map/Chart 3", visible=False, height=400)
     image4 = gr.Image(label="π Chart 4", visible=False, height=400)
 
+    # Follow-up Analysis Section (initially hidden)
+    followup_section_divider = gr.Markdown("---", visible=False)
+    followup_section_header = gr.HTML("""
+    <div style="text-align: center; margin: 2rem 0;">
+        <h2 style="color: #374151; margin-bottom: 0.5rem;">π€ Follow-up Analysis</h2>
+        <p style="color: #6b7280; margin: 0;">Ask questions about the generated report and dataset</p>
+    </div>
+    """, visible=False)
+
+    with gr.Row(visible=False) as followup_input_row:
+        followup_input = gr.Textbox(
+            label="Follow-up Question",
+            placeholder="e.g., Show me correlation between two columns, Create a chart for specific regions, What are the trends over time?",
+            scale=4
+        )
+        followup_button = gr.Button(
+            "π Analyze",
+            variant="secondary",
+            scale=1,
+            size="lg"
+        )
+
+    with gr.Row(visible=False) as followup_result_row:
+        followup_result = gr.Textbox(
+            label="π Follow-up Analysis Results",
+            interactive=False,
+            lines=10,
+            visible=True
+        )
+
+    with gr.Row(visible=False) as followup_image_row:
+        followup_image = gr.Image(
+            label="π Follow-up Visualization",
+            visible=False,
+            height=500
+        )
+
+    # Follow-up Examples (initially hidden)
+    with gr.Row(visible=False) as followup_examples_header_row:
+        gr.HTML("""
+        <div>
+            <h4 style="color: #374151">π‘ Example Follow-up Questions</h4>
+            <p style="color: #6b7280">Click any example below to try it out</p>
+        </div>
+        """)
+
+    with gr.Row(visible=False) as followup_examples_row:
+        followup_examples = [
+            ("π Correlation Analysis", "Show me the correlation between two numerical columns with a scatter plot"),
+            ("π Statistical Summary", "Create a comprehensive statistical summary with visualization for a specific column"),
+            ("π― Filter & Analyze", "Filter the data by specific criteria and create a visualization"),
+            ("π Dataset Overview", "Give me a detailed summary of the dataset structure and contents"),
+            ("π Trend Analysis", "Create a line chart showing trends over time for specific data"),
+            ("π Custom Visualization", "Create a custom bar/pie/histogram chart for specific columns"),
+        ]
+
+        for emoji_text, query_text in followup_examples:
+            gr.Button(
+                emoji_text,
+                variant="secondary",
+                size="sm"
+            ).click(
+                lambda x=query_text: x,
+                outputs=followup_input
+            )
+
     # Set up the search button click event with progress bar
     search_button.click(
         fn=search_and_analyze,
         inputs=[query_input],
-        outputs=[page_url_display, status_output, download_button, image1, image2, image3, image4
+        outputs=[page_url_display, status_output, download_button, image1, image2, image3, image4,
+                 followup_section_divider, followup_section_header, followup_input_row,
+                 followup_result_row, followup_image_row, followup_examples_header_row, followup_examples_row],
         show_progress="full"  # Show the built-in progress bar
     )
 
+    # Set up the follow-up button click event
+    followup_button.click(
+        fn=run_followup_question,
+        inputs=[followup_input],
+        outputs=[followup_result, followup_image],
+        show_progress="full"
+    )
+
 
 if __name__ == "__main__":
@@ -631,5 +816,5 @@ if __name__ == "__main__":
         share=True,
         server_name="0.0.0.0",
         server_port=7860,
-        show_error=True
+        show_error=True
     )
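One subtlety in the example-button loop above is the `lambda x=query_text: x` callback: the default argument freezes each button's example text at definition time. A plain `lambda: query_text` would close over the loop variable, so every button would paste the last example into the input. A standalone illustration of the difference:

```python
# Late binding vs. default-argument capture in a loop (why x=query_text is needed).
texts = ["a", "b", "c"]
late_bound = [lambda: t for t in texts]      # all closures share the same t
captured = [lambda t=t: t for t in texts]    # each default freezes t's current value

print([f() for f in late_bound])  # ['c', 'c', 'c']
print([f() for f in captured])    # ['a', 'b', 'c']
```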
followup_agent.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from tools.followup_tools import (
|
3 |
+
load_previous_dataset,
|
4 |
+
get_dataset_summary,
|
5 |
+
create_followup_visualization,
|
6 |
+
get_previous_report_content,
|
7 |
+
analyze_column_correlation,
|
8 |
+
create_statistical_summary,
|
9 |
+
filter_and_visualize_data,
|
10 |
+
)
|
11 |
+
from tools.retrieval_tools import (
|
12 |
+
search_datasets,
|
13 |
+
get_dataset_info,
|
14 |
+
)
|
15 |
+
from smolagents import (
|
16 |
+
CodeAgent,
|
17 |
+
DuckDuckGoSearchTool,
|
18 |
+
LiteLLMModel,
|
19 |
+
)
|
20 |
+
|
21 |
+
def create_followup_agent():
|
22 |
+
"""Create a specialized agent for follow-up analysis"""
|
23 |
+
search_tool = DuckDuckGoSearchTool()
|
24 |
+
model = LiteLLMModel(
|
25 |
+
model_id="gemini/gemini-2.5-flash-preview-05-20",
|
26 |
+
api_key=os.getenv("GEMINI_API_KEY"),
|
27 |
+
)
|
28 |
+
|
29 |
+
followup_agent = CodeAgent(
|
30 |
+
tools=[
|
31 |
+
search_tool,
|
32 |
+
load_previous_dataset,
|
33 |
+
get_dataset_summary,
|
34 |
+
create_followup_visualization,
|
35 |
+
get_previous_report_content,
|
36 |
+
analyze_column_correlation,
|
37 |
+
create_statistical_summary,
|
38 |
+
filter_and_visualize_data,
|
39 |
+
search_datasets,
|
40 |
+
get_dataset_info,
|
41 |
+
],
|
42 |
+
model=model,
|
43 |
+
max_steps=20,
|
44 |
+
verbosity_level=1,
|
45 |
+
planning_interval=2,
|
46 |
+
additional_authorized_imports=[
|
47 |
+
"pandas", "numpy", "matplotlib", "matplotlib.pyplot", "seaborn",
|
48 |
+
"os", "json", "datetime", "math", "statistics"
|
49 |
+
],
|
50 |
+
)
|
51 |
+
return followup_agent
|
52 |
+
|
53 |
+
def generate_followup_prompt(user_question, report_context=None):
|
54 |
+
"""Generate a prompt for follow-up analysis"""
|
55 |
+
|
56 |
+
base_prompt = f"""You are a data analysis assistant helping with follow-up questions about a previously generated report.
|
57 |
+
|
58 |
+
USER'S FOLLOW-UP QUESTION: "{user_question}"
|
59 |
+
|
60 |
+
AVAILABLE TOOLS:
|
61 |
+
1. **load_previous_dataset()** - Load the dataset used in the previous analysis
|
62 |
+
2. **get_dataset_summary(df)** - Get detailed info about the dataset structure
|
63 |
+
3. **get_previous_report_content()** - Get context about the previous report
|
64 |
+
4. **create_followup_visualization()** - Create new charts and graphs (bar, line, scatter, histogram, box, pie)
|
65 |
+
5. **analyze_column_correlation()** - Analyze relationships between columns with scatter plots
|
66 |
+
6. **create_statistical_summary()** - Generate comprehensive stats + visualizations for any column
|
67 |
+
7. **filter_and_visualize_data()** - Filter data by criteria and create targeted visualizations
|
68 |
+
8. **search_datasets()** - Search for additional datasets if needed
|
69 |
+
9. **get_dataset_info()** - Get info about specific datasets
|
70 |
+
|
71 |
+
ANALYSIS APPROACH:
|
72 |
+
1. First, get context by calling get_previous_report_content()
|
73 |
+
2. Load the previous dataset using load_previous_dataset()
|
74 |
+
3. Get a summary of the dataset structure with get_dataset_summary()
|
75 |
+
4. Based on the user's question, perform the appropriate analysis:
|
76 |
+
- Create new visualizations if they want different charts
|
77 |
+
- Analyze correlations if they ask about relationships
|
78 |
+
- Filter or group data if they want specific subsets
|
79 |
+
- Calculate statistics if they want numerical insights
|
80 |
+
5. **ALWAYS create visualizations when relevant** - save to generated_data folder
|
81 |
+
6. Provide a comprehensive text answer AND create supporting visualizations
|
82 |
+
|
83 |
+
IMPORTANT GUIDELINES:
|
84 |
+
- Always start by understanding the previous report context
|
85 |
+
- Use the same dataset that was used in the original analysis
|
86 |
+
- **CREATE VISUALIZATIONS whenever possible** - charts help answer questions better
|
87 |
+
- Provide clear, actionable insights in TEXT format
|
88 |
+
- Save all new visualization files to the generated_data folder with descriptive filenames
|
89 |
+
- Be concise but thorough in your explanations
|
90 |
+
- Combine text analysis with visual evidence
|
91 |
+
|
92 |
+
Answer the user's question: "{user_question}"
|
93 |
+
"""
|
94 |
+
|
95 |
+
if report_context:
|
96 |
+
base_prompt += f"""
|
97 |
+
|
98 |
+
ADDITIONAL CONTEXT ABOUT PREVIOUS REPORT:
|
99 |
+
{report_context}
|
100 |
+
"""
|
101 |
+
|
102 |
+
return base_prompt
|
103 |
+
|
104 |
+
def run_followup_analysis(user_question, report_context=None):
|
105 |
+
"""Run a follow-up analysis based on user question"""
|
106 |
+
try:
|
107 |
+
# Create the follow-up agent
|
108 |
+
agent = create_followup_agent()
|
109 |
+
|
110 |
+
# Generate the prompt
|
111 |
+
prompt = generate_followup_prompt(user_question, report_context)
|
112 |
+
|
113 |
+
# Run the analysis
|
114 |
+
result = agent.run(prompt)
|
115 |
+
|
116 |
+
return str(result)
|
117 |
+
|
118 |
+
except Exception as e:
|
119 |
+
return f"Error in follow-up analysis: {str(e)}"
|
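For orientation, here is a minimal, hypothetical call site for the entry point above (not part of the commit; it assumes `GEMINI_API_KEY` is set and that a previous run already left a report and dataset under `generated_data/`):

```python
# Hypothetical usage sketch -- assumes a prior analysis populated generated_data/.
from followup_agent import run_followup_analysis

answer = run_followup_analysis(
    "Which department has the highest accident count?",
    report_context="Original report: 2022 road traffic accidents by department.",
)
print(answer)  # text answer; any new charts are written to generated_data/
```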
tools/followup_tools.py
ADDED
@@ -0,0 +1,515 @@
```python
import os
import pandas as pd
import json
import glob
from smolagents import tool
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np

@tool
def load_previous_dataset() -> pd.DataFrame:
    """
    Load the dataset that was used in the previous analysis.

    Returns:
        The pandas DataFrame that was used in the previous report generation
    """
    try:
        # Look for saved dataset in generated_data folder
        dataset_files = glob.glob('generated_data/*dataset*.csv') + glob.glob('generated_data/*data*.csv')

        if not dataset_files:
            # Try to find any CSV file in generated_data
            csv_files = glob.glob('generated_data/*.csv')
            if csv_files:
                dataset_files = csv_files

        if not dataset_files:
            raise Exception("No dataset found in generated_data folder")

        # Use the most recent dataset file
        latest_file = max(dataset_files, key=os.path.getctime)
        df = pd.read_csv(latest_file)

        print(f"✅ Loaded dataset from {latest_file} with {len(df)} rows and {len(df.columns)} columns")
        return df

    except Exception as e:
        raise Exception(f"Error loading previous dataset: {str(e)}")

@tool
def get_dataset_summary(df: pd.DataFrame) -> str:
    """
    Get a comprehensive summary of the dataset structure and content.

    Args:
        df: The pandas DataFrame to analyze

    Returns:
        A formatted string with dataset summary information
    """
    try:
        summary_lines = []
        summary_lines.append("=== DATASET SUMMARY ===")
        summary_lines.append(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
        summary_lines.append("")

        summary_lines.append("Column Information:")
        for col in df.columns:
            dtype = str(df[col].dtype)
            non_null = df[col].count()
            null_count = df[col].isnull().sum()
            unique_count = df[col].nunique()

            summary_lines.append(f"  • {col}: {dtype}, {non_null} non-null, {null_count} null, {unique_count} unique")

            # Show sample values for categorical columns
            if df[col].dtype == 'object' and unique_count <= 10:
                sample_values = df[col].value_counts().head(5).index.tolist()
                summary_lines.append(f"    Sample values: {sample_values}")

        summary_lines.append("")
        summary_lines.append("First 3 rows:")
        summary_lines.append(df.head(3).to_string())

        return "\n".join(summary_lines)

    except Exception as e:
        return f"Error analyzing dataset: {str(e)}"

@tool
def create_followup_visualization(df: pd.DataFrame, chart_type: str, x_column: str, y_column: str = None, title: str = "Follow-up Analysis", filename: str = "followup_chart.png") -> str:
    """
    Create a visualization for follow-up analysis.

    Args:
        df: The pandas DataFrame to visualize
        chart_type: Type of chart ('bar', 'line', 'scatter', 'histogram', 'box', 'pie')
        x_column: Column name for x-axis
        y_column: Column name for y-axis (optional for some chart types)
        title: Title for the chart
        filename: Name of the file to save (should end with .png)

    Returns:
        Path to the saved visualization file
    """
    try:
        plt.figure(figsize=(12, 8))

        if chart_type == 'bar':
            if y_column:
                df_grouped = df.groupby(x_column)[y_column].sum().sort_values(ascending=False)
                plt.bar(range(len(df_grouped)), df_grouped.values)
                plt.xticks(range(len(df_grouped)), df_grouped.index, rotation=45)
                plt.ylabel(y_column)
            else:
                value_counts = df[x_column].value_counts().head(10)
                plt.bar(range(len(value_counts)), value_counts.values)
                plt.xticks(range(len(value_counts)), value_counts.index, rotation=45)
                plt.ylabel('Count')

        elif chart_type == 'line':
            if y_column:
                df_sorted = df.sort_values(x_column)
                plt.plot(df_sorted[x_column], df_sorted[y_column])
                plt.ylabel(y_column)
            else:
                value_counts = df[x_column].value_counts().sort_index()
                plt.plot(value_counts.index, value_counts.values)
                plt.ylabel('Count')

        elif chart_type == 'scatter':
            if y_column:
                plt.scatter(df[x_column], df[y_column], alpha=0.6)
                plt.ylabel(y_column)
            else:
                raise ValueError("Scatter plot requires both x_column and y_column")

        elif chart_type == 'histogram':
            plt.hist(df[x_column], bins=30, alpha=0.7)
            plt.ylabel('Frequency')

        elif chart_type == 'box':
            if y_column:
                df.boxplot(column=y_column, by=x_column)
            else:
                plt.boxplot(df[x_column])
                plt.ylabel(x_column)

        elif chart_type == 'pie':
            value_counts = df[x_column].value_counts().head(10)
            plt.pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%')

        else:
            raise ValueError(f"Unsupported chart type: {chart_type}")

        plt.xlabel(x_column)
        plt.title(title)
        plt.tight_layout()

        # Save to generated_data folder
        if not filename.endswith('.png'):
            filename += '.png'

        filepath = os.path.join('generated_data', filename)
        plt.savefig(filepath, dpi=300, bbox_inches='tight')
        plt.close()

        return f"Visualization saved to: {filepath}"

    except Exception as e:
        plt.close()  # Ensure plot is closed even on error
        return f"Error creating visualization: {str(e)}"

@tool
def get_previous_report_content() -> str:
    """
    Get the content of the previously generated report.

    Returns:
        The text content of the previous report for context
    """
    try:
        # Look for PDF or DOCX files in generated_data
        report_files = glob.glob('generated_data/*.pdf') + glob.glob('generated_data/*.docx')

        if not report_files:
            return "No previous report found in generated_data folder"

        # Use the most recent report file
        latest_report = max(report_files, key=os.path.getctime)

        # For now, return basic info about the report
        # In a full implementation, you'd extract text from PDF/DOCX
        file_size = os.path.getsize(latest_report)

        # Also look for any text files that might contain analysis
        text_files = glob.glob('generated_data/*.txt')
        text_content = ""

        if text_files:
            latest_text = max(text_files, key=os.path.getctime)
            with open(latest_text, 'r', encoding='utf-8') as f:
                text_content = f.read()

        summary = f"""=== PREVIOUS REPORT CONTEXT ===
Report file: {latest_report}
File size: {file_size} bytes
Created: {os.path.getctime(latest_report)}

Additional analysis content:
{text_content if text_content else 'No additional text content found'}

The report was generated from the dataset in the previous analysis.
You can use load_previous_dataset() to access the same data.
"""

        return summary

    except Exception as e:
        return f"Error accessing previous report: {str(e)}"

@tool
def analyze_column_correlation(df: pd.DataFrame, column1: str, column2: str) -> str:
    """
    Analyze correlation between two columns in the dataset.

    Args:
        df: The pandas DataFrame
        column1: First column name
        column2: Second column name

    Returns:
        Correlation analysis results
    """
    try:
        # Check if columns exist
        if column1 not in df.columns or column2 not in df.columns:
            return f"Error: One or both columns not found. Available columns: {list(df.columns)}"

        # Convert to numeric if possible
        try:
            col1_numeric = pd.to_numeric(df[column1], errors='coerce')
            col2_numeric = pd.to_numeric(df[column2], errors='coerce')
        except Exception:
            return "Error: Cannot convert columns to numeric for correlation analysis"

        # Calculate correlation
        correlation = col1_numeric.corr(col2_numeric)

        # Create scatter plot
        plt.figure(figsize=(10, 6))
        plt.scatter(col1_numeric, col2_numeric, alpha=0.6)
        plt.xlabel(column1)
        plt.ylabel(column2)
        plt.title(f'Correlation between {column1} and {column2}\nCorrelation coefficient: {correlation:.3f}')

        # Add trend line, fitting only on rows where both values are present
        # so the two series passed to polyfit stay aligned
        mask = col1_numeric.notna() & col2_numeric.notna()
        if mask.any():
            z = np.polyfit(col1_numeric[mask], col2_numeric[mask], 1)
            p = np.poly1d(z)
            plt.plot(col1_numeric[mask], p(col1_numeric[mask]), "r--", alpha=0.8)

        plt.tight_layout()

        # Save plot
        filename = f'correlation_{column1}_{column2}.png'
        filepath = os.path.join('generated_data', filename)
        plt.savefig(filepath, dpi=300, bbox_inches='tight')
        plt.close()

        # Interpret correlation
        if abs(correlation) > 0.7:
            strength = "strong"
        elif abs(correlation) > 0.4:
            strength = "moderate"
        elif abs(correlation) > 0.2:
            strength = "weak"
        else:
            strength = "very weak"

        direction = "positive" if correlation > 0 else "negative"

        result = f"""=== CORRELATION ANALYSIS ===
Columns: {column1} vs {column2}
Correlation coefficient: {correlation:.3f}
Strength: {strength} {direction} correlation

Interpretation:
- The correlation is {strength} and {direction}
- Values closer to 1 or -1 indicate stronger linear relationships
- Values closer to 0 indicate weaker linear relationships

Visualization saved to: {filepath}
"""

        return result

    except Exception as e:
        return f"Error in correlation analysis: {str(e)}"

@tool
def create_statistical_summary(df: pd.DataFrame, column_name: str) -> str:
    """
    Create a comprehensive statistical summary with visualization for a specific column.

    Args:
        df: The pandas DataFrame
        column_name: Name of the column to analyze

    Returns:
        Statistical summary and saves a visualization
    """
    try:
        if column_name not in df.columns:
            return f"Error: Column '{column_name}' not found. Available columns: {list(df.columns)}"

        column_data = df[column_name]

        # Generate statistical summary
        summary_lines = [f"=== STATISTICAL SUMMARY: {column_name} ==="]

        if pd.api.types.is_numeric_dtype(column_data):
            # Numeric column analysis
            stats = column_data.describe()
            summary_lines.extend([
                f"Count: {stats['count']:.0f}",
                f"Mean: {stats['mean']:.2f}",
                f"Median: {stats['50%']:.2f}",
                f"Standard Deviation: {stats['std']:.2f}",
                f"Min: {stats['min']:.2f}",
                f"Max: {stats['max']:.2f}",
                f"25th Percentile: {stats['25%']:.2f}",
                f"75th Percentile: {stats['75%']:.2f}",
            ])

            # Create histogram and box plot
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

            # Histogram
            ax1.hist(column_data.dropna(), bins=30, alpha=0.7, color='skyblue', edgecolor='black')
            ax1.set_title(f'Distribution of {column_name}')
            ax1.set_xlabel(column_name)
            ax1.set_ylabel('Frequency')
            ax1.grid(True, alpha=0.3)

            # Box plot
            ax2.boxplot(column_data.dropna())
            ax2.set_title(f'Box Plot of {column_name}')
            ax2.set_ylabel(column_name)
            ax2.grid(True, alpha=0.3)

        else:
            # Categorical column analysis
            value_counts = column_data.value_counts()
            summary_lines.extend([
                f"Total unique values: {column_data.nunique()}",
                f"Most frequent value: {value_counts.index[0]} ({value_counts.iloc[0]} times)",
                f"Least frequent value: {value_counts.index[-1]} ({value_counts.iloc[-1]} times)",
                "",
                "Top 10 values:"
            ])

            for value, count in value_counts.head(10).items():
                percentage = (count / len(column_data)) * 100
                summary_lines.append(f"  {value}: {count} ({percentage:.1f}%)")

            # Create bar chart and pie chart
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

            # Bar chart
            top_values = value_counts.head(10)
            ax1.bar(range(len(top_values)), top_values.values, color='lightcoral')
            ax1.set_title(f'Top 10 Values in {column_name}')
            ax1.set_xlabel('Categories')
            ax1.set_ylabel('Count')
            ax1.set_xticks(range(len(top_values)))
            ax1.set_xticklabels(top_values.index, rotation=45, ha='right')
            ax1.grid(True, alpha=0.3)

            # Pie chart (top 8 values + others)
            top_8 = value_counts.head(8)
            others_count = value_counts.iloc[8:].sum() if len(value_counts) > 8 else 0

            if others_count > 0:
                pie_data = list(top_8.values) + [others_count]
                pie_labels = list(top_8.index) + ['Others']
            else:
                pie_data = top_8.values
                pie_labels = top_8.index

            ax2.pie(pie_data, labels=pie_labels, autopct='%1.1f%%', startangle=90)
            ax2.set_title(f'Distribution of {column_name}')

        plt.tight_layout()

        # Save the plot
        filename = f'statistical_summary_{column_name}.png'
        filepath = os.path.join('generated_data', filename)
        plt.savefig(filepath, dpi=300, bbox_inches='tight')
        plt.close()

        summary_lines.append(f"\nVisualization saved to: {filepath}")

        return "\n".join(summary_lines)

    except Exception as e:
        return f"Error in statistical analysis: {str(e)}"

@tool
def filter_and_visualize_data(df: pd.DataFrame, filter_column: str, filter_value: str, analysis_column: str, chart_type: str = "bar") -> str:
    """
    Filter the dataset and create a visualization of the filtered data.

    Args:
        df: The pandas DataFrame
        filter_column: Column to filter by
        filter_value: Value to filter for (can be partial match for string columns)
        analysis_column: Column to analyze in the filtered data
        chart_type: Type of chart to create ('bar', 'line', 'histogram', 'pie')

    Returns:
        Analysis results and saves a visualization
    """
    try:
        if filter_column not in df.columns:
            return f"Error: Filter column '{filter_column}' not found. Available columns: {list(df.columns)}"

        if analysis_column not in df.columns:
            return f"Error: Analysis column '{analysis_column}' not found. Available columns: {list(df.columns)}"

        # Filter the data
        if df[filter_column].dtype == 'object':
            # String filtering - partial match
            filtered_df = df[df[filter_column].str.contains(filter_value, case=False, na=False)]
        else:
            # Numeric filtering - exact match
            try:
                filter_value_numeric = float(filter_value)
                filtered_df = df[df[filter_column] == filter_value_numeric]
            except ValueError:
                return f"Error: Cannot convert '{filter_value}' to numeric for filtering"

        if filtered_df.empty:
            return f"No data found matching filter: {filter_column} = '{filter_value}'"

        result_lines = [
            "=== FILTERED DATA ANALYSIS ===",
            f"Filter: {filter_column} contains/equals '{filter_value}'",
            f"Filtered dataset size: {len(filtered_df)} rows (from {len(df)} total)",
            f"Analysis column: {analysis_column}",
            ""
        ]

        # Analyze the filtered data
        analysis_data = filtered_df[analysis_column]

        plt.figure(figsize=(12, 8))

        if chart_type == "bar":
            if pd.api.types.is_numeric_dtype(analysis_data):
                # For numeric data, create bins
                analysis_data.hist(bins=20, alpha=0.7, color='lightblue', edgecolor='black')
                plt.ylabel('Frequency')
            else:
                # For categorical data, show value counts
                value_counts = analysis_data.value_counts().head(15)
                plt.bar(range(len(value_counts)), value_counts.values, color='lightcoral')
                plt.xticks(range(len(value_counts)), value_counts.index, rotation=45, ha='right')
                plt.ylabel('Count')

                # Add statistics to result
                result_lines.extend([
                    f"Top value: {value_counts.index[0]} ({value_counts.iloc[0]} occurrences)",
                    f"Total unique values: {analysis_data.nunique()}"
                ])

        elif chart_type == "line":
            if pd.api.types.is_numeric_dtype(analysis_data):
                sorted_data = analysis_data.sort_values()
                plt.plot(range(len(sorted_data)), sorted_data.values, marker='o', alpha=0.7)
                plt.ylabel(analysis_column)
                plt.xlabel('Sorted Index')
            else:
                return "Line chart requires numeric data for analysis column"

        elif chart_type == "histogram":
            if pd.api.types.is_numeric_dtype(analysis_data):
                plt.hist(analysis_data.dropna(), bins=30, alpha=0.7, color='green', edgecolor='black')
                plt.ylabel('Frequency')

                # Add statistics
                mean_val = analysis_data.mean()
                median_val = analysis_data.median()
                result_lines.extend([
                    f"Mean: {mean_val:.2f}",
                    f"Median: {median_val:.2f}",
                    f"Standard Deviation: {analysis_data.std():.2f}"
                ])
            else:
                return "Histogram requires numeric data for analysis column"

        elif chart_type == "pie":
            value_counts = analysis_data.value_counts().head(10)
            plt.pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%', startangle=90)

        plt.title(f'{chart_type.title()} Chart: {analysis_column}\nFiltered by {filter_column} = "{filter_value}"')
        plt.xlabel(analysis_column)
        plt.tight_layout()

        # Save the plot
        filename = f'filtered_{filter_column}_{filter_value}_{analysis_column}_{chart_type}.png'
        # Clean filename
        filename = "".join(c for c in filename if c.isalnum() or c in ('_', '-', '.')).rstrip()
        filepath = os.path.join('generated_data', filename)
        plt.savefig(filepath, dpi=300, bbox_inches='tight')
        plt.close()

        result_lines.append(f"\nVisualization saved to: {filepath}")

        return "\n".join(result_lines)

    except Exception as e:
        return f"Error in filtered analysis: {str(e)}"
```
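Together these tools form a small pipeline: recover the saved dataset, inspect its structure, then chart it. A rough sketch of that flow, calling the `@tool`-wrapped functions directly (smolagents tool objects are callable; this is illustrative only, and the column name is a placeholder):

```python
# Illustrative sketch: exercising the follow-up tools outside the agent loop.
from tools.followup_tools import (
    load_previous_dataset,
    get_dataset_summary,
    create_followup_visualization,
)

df = load_previous_dataset()       # newest CSV under generated_data/
print(get_dataset_summary(df))     # dtypes, null counts, sample values

# "departement" is a hypothetical column; substitute one from the summary above.
print(create_followup_visualization(
    df, chart_type="bar", x_column="departement",
    title="Rows per departement", filename="sketch_bar.png",
))
```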
tools/libreoffice_tools.py
CHANGED
@@ -122,12 +122,23 @@ def convert_to_pdf_with_libreoffice(input_file: str, output_dir: str = None) ->
```diff
         return f"Error during LibreOffice conversion: {str(e)}"
 
 @tool
-def check_libreoffice_availability() ->
+def check_libreoffice_availability() -> bool:
     """
-    Check if LibreOffice is available
+    Check if LibreOffice is available on the system.
 
     Returns:
-
+        bool: True if LibreOffice is available, False otherwise
+    """
+    libreoffice_path = get_libreoffice_path()
+    return libreoffice_path is not None
+
+@tool
+def get_libreoffice_info() -> str:
+    """
+    Get detailed information about LibreOffice installation for troubleshooting.
+
+    Returns:
+        str: Detailed information about LibreOffice availability and installation
     """
     libreoffice_path = get_libreoffice_path()
 
```
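With the return type fixed to `bool`, the availability check can gate PDF export directly; a minimal sketch (assuming `convert_to_pdf_with_libreoffice` from the same module and a hypothetical `report.docx`):

```python
# Sketch: only attempt conversion when LibreOffice is actually installed.
from tools.libreoffice_tools import (
    check_libreoffice_availability,
    convert_to_pdf_with_libreoffice,
    get_libreoffice_info,
)

if check_libreoffice_availability():
    print(convert_to_pdf_with_libreoffice("generated_data/report.docx"))
else:
    print(get_libreoffice_info())  # installation details for troubleshooting
```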
tools/retrieval_tools.py
ADDED
@@ -0,0 +1,277 @@
```python
import os
import pandas as pd
import pickle
import numpy as np
from smolagents import tool
from rank_bm25 import BM25Okapi
from dotenv import load_dotenv
from smolagents import CodeAgent, LiteLLMModel
from unidecode import unidecode

load_dotenv()

# Global variables for BM25 model
_bm25_model = None
_precomputed_titles = None
_dataset_df = None
_llm_translator = None

def _initialize_retrieval_system():
    """Initialize the retrieval system with BM25 model and dataset"""
    global _bm25_model, _precomputed_titles, _dataset_df, _llm_translator

    # Load dataset if not already loaded
    if _dataset_df is None:
        try:
            _dataset_df = pd.read_csv('filtered_dataset.csv')
            print(f"✅ Loaded dataset with {len(_dataset_df)} entries")
        except FileNotFoundError:
            raise Exception("filtered_dataset.csv not found. Please ensure the dataset file exists.")

    # Initialize LLM translator if not already initialized
    if _llm_translator is None:
        try:
            model = LiteLLMModel(
                model_id="gemini/gemini-2.5-flash-preview-05-20",
                api_key=os.getenv("GEMINI_API_KEY")
            )
            _llm_translator = CodeAgent(tools=[], model=model, max_steps=1)
            print("✅ LLM translator initialized")
        except Exception as e:
            print(f"⚠️ Error initializing LLM translator: {e}")

    # Load pre-computed BM25 model if available
    if _bm25_model is None:
        try:
            with open('bm25_data.pkl', 'rb') as f:
                bm25_data = pickle.load(f)
            _bm25_model = bm25_data['bm25_model']
            _precomputed_titles = bm25_data['titles']
            print(f"✅ Loaded pre-computed BM25 model for {len(_precomputed_titles)} datasets")
        except FileNotFoundError:
            print("⚠️ Pre-computed BM25 model not found. Will compute at runtime.")
        except Exception as e:
            print(f"⚠️ Error loading pre-computed BM25 model: {e}")

def _translate_query_llm(query, target_lang='fr'):
    """Translate query using LLM"""
    global _llm_translator

    if _llm_translator is None:
        return query, 'unknown'

    try:
        if target_lang == 'fr':
            target_language = "French"
        elif target_lang == 'en':
            target_language = "English"
        else:
            target_language = target_lang

        translation_prompt = f"""
Translate the following text to {target_language}.
If the text is already in {target_language}, return it as is.
Only return the translated text, nothing else.

Text to translate: "{query}"
"""

        response = _llm_translator.run(translation_prompt)
        translated_text = str(response).strip().strip('"').strip("'")

        # Simple language detection
        if query.lower() == translated_text.lower():
            source_lang = target_lang
        else:
            source_lang = 'en' if target_lang == 'fr' else 'fr'

        return translated_text, source_lang

    except Exception as e:
        print(f"LLM translation error: {e}")
        return query, 'unknown'

def _simple_keyword_preprocessing(text):
    """Simple preprocessing for keyword matching - handles case, accents and basic plurals"""
    text = unidecode(str(text).lower())

    words = text.split()
    processed_words = []

    for word in words:
        if word.endswith('s') and len(word) > 3 and not word.endswith('ss'):
            word = word[:-1]
        elif word.endswith('x') and len(word) > 3:
            word = word[:-1]
        processed_words.append(word)

    return processed_words

@tool
def search_datasets(query: str, top_k: int = 5) -> str:
    """
    Search for relevant datasets in the French public data catalog using BM25-based keyword matching.

    Args:
        query: The search query describing what kind of dataset you're looking for
        top_k: Number of top results to return (default: 5)

    Returns:
        A formatted string containing the top matching datasets with their titles, URLs, and relevance scores
    """
    try:
        # Initialize the retrieval system
        _initialize_retrieval_system()

        global _bm25_model, _precomputed_titles, _dataset_df

        # Translate query to French for better matching
        translated_query, original_lang = _translate_query_llm(query, target_lang='fr')

        # Combine original and translated queries for search
        search_queries = [query, translated_query] if query != translated_query else [query]

        # Get dataset titles
        dataset_titles = _dataset_df['title'].fillna('').tolist()

        # Use pre-computed BM25 model if available and matches current dataset
        if (_bm25_model is not None and _precomputed_titles is not None and
                len(dataset_titles) == len(_precomputed_titles) and dataset_titles == _precomputed_titles):
            bm25 = _bm25_model
        else:
            # Build BM25 model at runtime
            processed_titles = [_simple_keyword_preprocessing(title) for title in dataset_titles]
            bm25 = BM25Okapi(processed_titles)

        # Get scores for all search queries and find best matches
        all_scores = []
        for search_query in search_queries:
            try:
                processed_query = _simple_keyword_preprocessing(search_query)
                scores = bm25.get_scores(processed_query)
                all_scores.append(scores)
            except Exception as e:
                print(f"Error processing query '{search_query}': {e}")
                continue

        if not all_scores:
            return "Error: Could not process any search queries"

        # Combine scores (take maximum across all queries)
        combined_scores = all_scores[0]
        for scores in all_scores[1:]:
            combined_scores = np.maximum(combined_scores, scores)

        # Get top-k results
        top_indices = combined_scores.argsort()[-top_k:][::-1]

        # Format results
        results = []
        results.append(f"Top {top_k} datasets for query: '{query}'")
        if query != translated_query:
            results.append(f"(Translated to French: '{translated_query}')")
        results.append("")

        for i, idx in enumerate(top_indices, 1):
            score = combined_scores[idx]
            title = _dataset_df.iloc[idx]['title']
            url = _dataset_df.iloc[idx]['url']
            organization = _dataset_df.iloc[idx].get('organization', 'N/A')

            results.append(f"{i}. Score: {score:.2f}")
            results.append(f"   Title: {title}")
            results.append(f"   URL: {url}")
            results.append(f"   Organization: {organization}")
            results.append("")

        return "\n".join(results)

    except Exception as e:
        return f"Error during dataset search: {str(e)}"

@tool
def get_dataset_info(dataset_url: str) -> str:
    """
    Get detailed information about a specific dataset from its data.gouv.fr URL.

    Args:
        dataset_url: The URL of the dataset page on data.gouv.fr

    Returns:
        Detailed information about the dataset including title, description, organization, and metadata
    """
    try:
        _initialize_retrieval_system()

        global _dataset_df

        # Find the dataset in our catalog
        matching_rows = _dataset_df[_dataset_df['url'] == dataset_url]

        if matching_rows.empty:
            return f"Dataset not found in catalog for URL: {dataset_url}"

        dataset = matching_rows.iloc[0]

        # Format the dataset information
        info_lines = []
        info_lines.append("=== DATASET INFORMATION ===")
        info_lines.append(f"Title: {dataset.get('title', 'N/A')}")
        info_lines.append(f"URL: {dataset.get('url', 'N/A')}")
        info_lines.append(f"Organization: {dataset.get('organization', 'N/A')}")

        if 'description' in dataset and pd.notna(dataset['description']):
            description = str(dataset['description'])
            if len(description) > 500:
                description = description[:500] + "..."
            info_lines.append(f"Description: {description}")

        if 'tags' in dataset and pd.notna(dataset['tags']):
            info_lines.append(f"Tags: {dataset['tags']}")

        if 'license' in dataset and pd.notna(dataset['license']):
            info_lines.append(f"License: {dataset['license']}")

        if 'temporal_coverage' in dataset and pd.notna(dataset['temporal_coverage']):
            info_lines.append(f"Temporal Coverage: {dataset['temporal_coverage']}")

        if 'spatial_coverage' in dataset and pd.notna(dataset['spatial_coverage']):
            info_lines.append(f"Spatial Coverage: {dataset['spatial_coverage']}")

        if 'quality_score' in dataset and pd.notna(dataset['quality_score']):
            info_lines.append(f"Quality Score: {dataset['quality_score']}")

        return "\n".join(info_lines)

    except Exception as e:
        return f"Error getting dataset info: {str(e)}"

@tool
def get_random_quality_dataset() -> str:
    """
    Get a random high-quality dataset from the catalog, weighted by quality score.

    Returns:
        Information about a randomly selected high-quality dataset
    """
    try:
        _initialize_retrieval_system()

        global _dataset_df

        # Use quality_score as weights for random selection
        if 'quality_score' in _dataset_df.columns:
            weights = _dataset_df['quality_score'].fillna(0)
            weights = weights - weights.min() + 0.1  # Shift to make all positive
        else:
            weights = None

        # Randomly sample one dataset weighted by quality
        selected_row = _dataset_df.sample(n=1, weights=weights).iloc[0]

        # Return dataset info
        return get_dataset_info(selected_row['url'])

    except Exception as e:
        return f"Error getting random dataset: {str(e)}"
```
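Note that `search_datasets` only reuses the pickled index when the stored titles match the current catalog exactly, so `bm25_data.pkl` has to be rebuilt whenever `filtered_dataset.csv` changes. A sketch of an offline build script producing the keys the loader expects (`'bm25_model'` and `'titles'`):

```python
# Offline sketch: precompute the BM25 index that _initialize_retrieval_system loads.
import pickle

import pandas as pd
from rank_bm25 import BM25Okapi

from tools.retrieval_tools import _simple_keyword_preprocessing

df = pd.read_csv("filtered_dataset.csv")
titles = df["title"].fillna("").tolist()
bm25 = BM25Okapi([_simple_keyword_preprocessing(t) for t in titles])

with open("bm25_data.pkl", "wb") as f:
    pickle.dump({"bm25_model": bm25, "titles": titles}, f)
```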
tools/webpage_tools.py
CHANGED
@@ -153,6 +153,32 @@ def read_file_from_url(url: str) -> pd.DataFrame:
```diff
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
 
+@tool
+def save_dataset_for_followup(df: pd.DataFrame, filename: str = "analysis_dataset.csv") -> str:
+    """
+    Save the current dataset to the generated_data folder for follow-up analysis.
+
+    Args:
+        df: The pandas DataFrame to save
+        filename: Name of the file to save (default: "analysis_dataset.csv")
+
+    Returns:
+        Confirmation message with file path
+    """
+    try:
+        # Ensure generated_data directory exists
+        import os
+        os.makedirs('generated_data', exist_ok=True)
+
+        # Save the dataset
+        filepath = os.path.join('generated_data', filename)
+        df.to_csv(filepath, index=False)
+
+        return f"Dataset saved for follow-up analysis: {filepath} ({len(df)} rows, {len(df.columns)} columns)"
+
+    except Exception as e:
+        return f"Error saving dataset: {str(e)}"
+
 if __name__ == "__main__":
     url = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/"
     url = "https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/"
```