Commit 2508004
Parent(s): 97eafcb
update

Files changed:
- .gitignore +6 -0
- README.md +206 -1
- agent.py +87 -0
- app.py +635 -0
- filtered_dataset.csv +0 -0
- france_data/departements.geojson +0 -0
- france_data/regions.geojson +0 -0
- launch_gradio.py +30 -0
- reexport_data.py +97 -0
- requirements.txt +22 -0
- tools/drawing_tools.py +206 -0
- tools/exploration_tools.py +84 -0
- tools/libreoffice_tools.py +155 -0
- tools/webpage_tools.py +168 -0
.gitignore
ADDED
@@ -0,0 +1,6 @@
.env
__pycache__/
OLD/
dataset_metadata/
generated_data/
.gradio/
README.md
CHANGED
@@ -11,4 +11,209 @@ license: mit
short_description: Agents for data analysis of French public data.
---

# 🤖 French Public Data Analysis Agent

**AI-powered intelligent analysis of French government datasets** with automated visualization generation and comprehensive PDF reports.

## ✨ Features

### 🔍 **Intelligent Dataset Discovery**
- **BM25 Keyword Search**: Advanced keyword matching with pre-computed search indices
- **Bilingual Query Translation**: Search in French or English - queries are automatically translated using an LLM
- **Quality-Weighted Random Selection**: Leave the query empty to randomly select high-quality datasets
- **Real-time Dataset Matching**: Instant matching against 5,000+ French government datasets

### 🤖 **Automated AI Analysis**
- **SmolAgents Integration**: Advanced AI agent with up to 30 planning steps
- **Custom Tool Suite**: Specialized tools for web scraping, data analysis, and visualization
- **Multi-step Processing**: Complete pipeline from data discovery to report generation
- **Error Recovery**: Smart error handling and alternative data source selection

### 📊 **Advanced Visualizations**
- **France Geographic Maps**: Department- and region-level choropleth maps
- **Multiple Chart Types**: Bar charts, line plots, scatter plots, heatmaps
- **Smart Visualization Selection**: The AI automatically chooses appropriate chart types
- **High-Quality PNG Output**: Publication-ready visualizations

### 📄 **Comprehensive Reports**
- **Professional PDF Reports**: Complete analysis with embedded visualizations
- **Bilingual Support**: Reports generated in the same language as your query
- **Structured Analysis**: Title page, methodology, findings, and next steps
- **LibreOffice Integration**: Cross-platform PDF generation

### 🎨 **Modern Web Interface**
- **Real-time Progress Tracking**: Detailed step-by-step progress updates
- **Responsive Design**: Beautiful, modern Gradio interface
- **Quick Start Examples**: Pre-built queries for common use cases
- **Accordion Tips**: Collapsible help section with usage instructions

## 🚀 Quick Start

### 1. Prerequisites

- Python 3.8+
- LibreOffice (for PDF generation)
- Google Gemini API key

### 2. Installation

```bash
# Clone the repository
git clone <repository-url>
cd gradio_hackathon_agent

# Install dependencies
pip install -r requirements.txt
```

### 3. Environment Setup

Create a `.env` file in the project root:

```bash
GEMINI_API_KEY=your_gemini_api_key_here
```

### 4. Launch the Application

```bash
python app.py
```

The interface will be available at:
- **Local**: http://localhost:7860
- **Public**: Shareable URL provided automatically

## 💡 How to Use

### Basic Usage

1. **Enter Your Query**: Type any search term related to French public data
   - Examples: "road traffic accidents", "education directory", "housing data"
   - Supports both French and English queries

2. **Or Use Quick Examples**: Click any of the pre-built example queries:
   - 🚗 Road Traffic Accidents 2005-2023
   - 🎓 Education Directory
   - 🏠 French Vacant Housing Private Park

3. **Or Go Random**: Leave the query empty to randomly select a high-quality dataset

4. **Click "🚀 Analyze Dataset"**: The AI agent begins processing

### Results

- **Download PDF Report**: Complete analysis with all visualizations
- **View Individual Charts**: Up to 4 visualizations displayed in the interface
- **Dataset Reference**: Direct link to the original data.gouv.fr page

## 🛠️ Technical Architecture

### Core Components

```
📁 Project Structure
├── app.py                   # Main Gradio web interface with progress tracking
├── agent.py                 # SmolAgents configuration and prompt generation
├── tools/                   # Custom agent tools
│   ├── webpage_tools.py     # Web scraping and data extraction
│   ├── exploration_tools.py # Dataset analysis and description
│   ├── drawing_tools.py     # France map generation and visualization
│   └── libreoffice_tools.py # PDF conversion utilities
├── filtered_dataset.csv     # Pre-processed dataset index (5,000+ datasets)
└── france_data/             # Geographic data for France maps
```

### Key Technologies

- **Frontend**: Gradio with custom CSS and real-time progress
- **AI Agent**: SmolAgents powered by an MLLM
- **Search**: BM25 keyword matching with accent- and plural-normalized preprocessing
- **Translation**: LLM-powered bilingual query translation
- **Visualization**: Matplotlib, Geopandas, Seaborn
- **PDF Generation**: python-docx + LibreOffice conversion
- **Data Processing**: Pandas, NumPy, Shapely

### Smart Features

#### BM25 Search Enhancement
- Pre-computed search indices for 5,000+ datasets
- Accent-insensitive keyword matching
- Plural form normalization
- Quality-score weighted ranking

#### LLM Translation
- Automatic French ↔ English translation
- Query language detection
- Bilingual result matching
- Context-aware translations

#### Progress System
- Thread-safe progress tracking
- Queue-based status updates
- Step-by-step visual feedback
- Non-blocking UI execution

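The preprocessing and matching described above are implemented in `app.py` (added in this commit). As an illustration, a minimal sketch of the same idea using `rank_bm25` and `unidecode` follows; the dataset titles and query here are made-up placeholders, not entries from `filtered_dataset.csv`.

```python
# Minimal sketch of the BM25 matching described above; see app.py for the
# full implementation. Titles and query are placeholders for illustration.
from rank_bm25 import BM25Okapi
from unidecode import unidecode

def preprocess(text):
    # Lowercase, strip accents, and crudely normalize plurals (trailing 's'/'x').
    words = unidecode(str(text).lower()).split()
    return [w[:-1] if len(w) > 3 and w[-1] in "sx" and not w.endswith("ss") else w
            for w in words]

titles = [
    "Accidents corporels de la circulation routière",
    "Annuaire de l'éducation",
    "Logements vacants du parc privé",
]
bm25 = BM25Okapi([preprocess(t) for t in titles])

query = "accidents de la route"  # queries are first translated to French
scores = bm25.get_scores(preprocess(query))
print(titles[scores.argmax()], scores.max())
```
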
## 🔧 Troubleshooting

### Common Issues

1. **"No CSV/JSON files found"**
   - The selected dataset doesn't contain processable files
   - Try a different query or use the random selection

2. **LibreOffice PDF conversion fails**
   - Ensure LibreOffice is installed and accessible
   - Check the console for specific error messages

3. **Translation errors**
   - Verify your API key is valid
   - Check API quota and rate limits

4. **Slow performance**
   - BM25 index computation may take time on first run
   - Pre-computed indices are cached for faster subsequent searches

### Performance Optimization

- **Pre-compute BM25**: Generate `bm25_data.pkl` once so searches can load the pre-built index
- **Use SSD storage**: Faster file I/O for large datasets
- **Monitor API usage**: API calls are made for both translation and agent execution

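Note that this commit does not ship the script that builds `bm25_data.pkl`; `app.py` only loads it and expects a pickled dict with the keys `bm25_model` and `titles`. A hypothetical sketch of a generation step consistent with that loading code:

```python
# Hypothetical generation script (not part of this commit): builds the pickle
# that app.py's initialize_models() loads, with keys 'bm25_model' and 'titles'.
import pickle
import pandas as pd
from rank_bm25 import BM25Okapi
from app import simple_keyword_preprocessing

titles = pd.read_csv("filtered_dataset.csv")["title"].fillna("").tolist()
bm25 = BM25Okapi([simple_keyword_preprocessing(t) for t in titles])

with open("bm25_data.pkl", "wb") as f:
    pickle.dump({"bm25_model": bm25, "titles": titles}, f)
```

Importing `app` also builds the Gradio interface as a side effect; a standalone copy of the preprocessing function would avoid that in a real script.
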
## 📊 Dataset Coverage

- **5,000+ Datasets**: Pre-filtered French government datasets
- **Data Sources**: data.gouv.fr, INSEE, regional authorities
- **File Formats**: CSV, JSON, Excel, XML
- **Topics**: All major sectors of French public administration
- **Quality Scores**: Datasets ranked by completeness and usability

## 🚀 Advanced Usage

### Custom Tool Development
Add new tools to the `tools/` directory following the SmolAgents tool pattern.

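The existing tools (e.g. `tools/drawing_tools.py` in this commit) use the smolagents `@tool` decorator with typed parameters and an `Args:`/`Returns:` docstring, which is what the agent relies on to understand each tool. A hypothetical new tool following that pattern might look like this (the function itself is illustrative, not part of the repo):

```python
# Hypothetical example tool following the pattern used in tools/drawing_tools.py.
import pandas as pd
from smolagents import tool

@tool
def count_rows(file_path: str) -> int:
    """Counts the number of data rows in a semicolon-separated CSV file.

    Args:
        file_path (str): Path to the CSV file to inspect.

    Returns:
        int: The number of rows, excluding the header.
    """
    df = pd.read_csv(file_path, delimiter=';')
    return len(df)
```

To make such a tool available to the agent, add it to the `tools=[...]` list in `create_web_agent` (agent.py).
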
### BM25 Index Optimization
Regenerate search indices with:
```bash
# Run once to create optimized search index
python -c "from app import initialize_models; initialize_models()"
```

### Batch Processing
Process multiple datasets programmatically using the agent directly.

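One way to do this is to reuse `create_web_agent` and `generate_prompt` from `agent.py` without the Gradio UI. A rough sketch, where the dataset URLs and the no-op step callback are placeholders:

```python
# Sketch of batch processing with the agent from agent.py. The URLs below are
# placeholders; outputs land in generated_data/ as in the normal pipeline.
from agent import create_web_agent, generate_prompt

pages = [
    "https://www.data.gouv.fr/fr/datasets/example-dataset-1/",
    "https://www.data.gouv.fr/fr/datasets/example-dataset-2/",
]

web_agent = create_web_agent(step_callback=lambda memory_step, agent=None: None)
for page in pages:
    answer = web_agent.run(generate_prompt(page))
    print(page, "->", answer)
```
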
## 📄 License

This project is developed for the Gradio MCP x Agents Hackathon. See individual tool licenses for third-party components.

## 🤝 Contributing

1. Fork the repository
2. Create a feature branch
3. Add your improvements
4. Submit a pull request

---

**🎉 Ready to explore French public data with AI? Launch the interface and start analyzing!**
agent.py
ADDED
@@ -0,0 +1,87 @@
import os
from tools.webpage_tools import (
    visit_webpage,
    get_all_links,
    read_file_from_url,
)
from tools.exploration_tools import (
    get_dataset_description,
)
from tools.drawing_tools import (
    plot_departments_data,
)
from tools.libreoffice_tools import (
    convert_to_pdf_with_libreoffice,
    check_libreoffice_availability,
)
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    LiteLLMModel,
)

def create_web_agent(step_callback):
    search_tool = DuckDuckGoSearchTool()
    model = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    web_agent = CodeAgent(
        tools=[
            search_tool,
            visit_webpage, get_all_links, read_file_from_url,
            get_dataset_description,
            plot_departments_data,
            convert_to_pdf_with_libreoffice,
            check_libreoffice_availability
        ],
        model=model,
        max_steps=30,
        verbosity_level=1,  # Reduced verbosity for cleaner output
        planning_interval=3,
        step_callbacks=[step_callback],  # Use the built-in callback system
        additional_authorized_imports=[
            "subprocess", "docx", "docx.*",
            "os", "bs4", "io", "requests", "json", "pandas",
            "matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy", "seaborn"
        ],
    )
    return web_agent

def generate_prompt(data_gouv_page):
    return f"""Fetch me a dataset that can be read directly using the read_file_from_url tool
from {data_gouv_page}
Follow the steps below to generate a pdf report from the dataset.

The steps should be as follows:
1. Examine the page
2. Get all links
3. Get the dataset from the link
4. Get information about the dataset using the get_dataset_description tool
5. Decide on what you can draw based on either department or region data
5.1 If there is no department- or region-level data, look for another file!
6. Draw a map of France using your idea
7. Save the map in a png file
8. Also make 3 additional visualizations, not maps, that you can save in png files
9. Write an interesting analysis text for each of your visualizations. Be smart and think cleverly about the data and what it can state
10. Think of next-step analyses to look at the data
11. Generate a comprehensive PDF report using the python-docx library that includes:
    - A title page with the dataset name and analysis overview
    - All your visualizations (PNG files) embedded in the report
    - Your analysis text for each visualization
    - Conclusions and next steps
Make the visualizations appropriately sized so they fit well in the PDF report.
Then convert that docx file to pdf using the convert_to_pdf_with_libreoffice tool.

Do not overcommit, just do the steps one by one and it should go fine! Do not, under any circumstance, use the 'os' module!
Do not generate a lot of code every step, go slowly but surely and it will work out. Save everything within the generated_data folder.
If the question is in English, the report is in English.
If the question is in French, the report is in French.

IMPORTANT LIBREOFFICE NOTES:
- If you need to use LibreOffice, first call check_libreoffice_availability() to verify it's available
- If LibreOffice is available, "LibreOffice found" is returned by check_libreoffice_availability()
- Use the convert_to_pdf_with_libreoffice() tool instead of subprocess calls
- Do NOT use subprocess.run(['libreoffice', ...]) or subprocess.run(['soffice', ...]) directly
- The LibreOffice tools handle macOS, Linux, and Windows path differences automatically
"""
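For reference, the step_callback argument above can be any callable that smolagents invokes after each agent step with that step's memory record; app.py (below) builds a richer one in create_progress_callback. A minimal stand-in, shown as an assumption about the expected shape rather than project code:

def print_step(memory_step, agent=None):
    # Called after each step; memory_step.step_number is used the same way
    # in app.py's progress callback.
    print(f"Agent finished step {memory_step.step_number}")

web_agent = create_web_agent(print_step)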
app.py
ADDED
@@ -0,0 +1,635 @@
import os
import pandas as pd
import gradio as gr
import glob
import threading
import time
import queue
import numpy as np
from rank_bm25 import BM25Okapi
import re
from dotenv import load_dotenv
from smolagents import CodeAgent, LiteLLMModel
from agent import create_web_agent, generate_prompt
from unidecode import unidecode

load_dotenv()

# Global variables for progress tracking
progress_queue = queue.Queue()
current_status = ""

# Initialize LLM translator and BM25
llm_translator = None
bm25_model = None
precomputed_titles = None

def initialize_models():
    """Initialize the LLM translator and BM25 model"""
    global llm_translator, bm25_model, precomputed_titles

    if llm_translator is None:
        # Initialize LLM for translation
        try:
            model = LiteLLMModel(
                model_id="gemini/gemini-2.5-flash-preview-05-20",
                api_key=os.getenv("GEMINI_API_KEY")
            )
            llm_translator = CodeAgent(tools=[], model=model, max_steps=1)
            print("✅ LLM translator initialized")
        except Exception as e:
            print(f"⚠️ Error initializing LLM translator: {e}")

    # Load pre-computed BM25 model if available
    if bm25_model is None:
        try:
            import pickle
            with open('bm25_data.pkl', 'rb') as f:
                bm25_data = pickle.load(f)
            bm25_model = bm25_data['bm25_model']
            precomputed_titles = bm25_data['titles']
            print(f"✅ Loaded pre-computed BM25 model for {len(precomputed_titles)} datasets")
        except FileNotFoundError:
            print("⚠️ Pre-computed BM25 model not found. Will compute at runtime.")
        except Exception as e:
            print(f"⚠️ Error loading pre-computed BM25 model: {e}")
            print("Will compute BM25 at runtime.")

def translate_query_llm(query, target_lang='fr'):
    """Translate query using LLM"""
    global llm_translator

    if llm_translator is None:
        initialize_models()

    if llm_translator is None:
        print("⚠️ LLM translator not available, returning original query")
        return query, 'unknown'

    try:
        # Create translation prompt
        if target_lang == 'fr':
            target_language = "French"
        elif target_lang == 'en':
            target_language = "English"
        else:
            target_language = target_lang

        translation_prompt = f"""
        Translate the following text to {target_language}.
        If the text is already in {target_language}, return it as is.
        Only return the translated text, nothing else.

        Text to translate: "{query}"
        """

        # Get translation from LLM
        response = llm_translator.run(translation_prompt)
        translated_text = str(response).strip().strip('"').strip("'")

        # Simple language detection
        if query.lower() == translated_text.lower():
            source_lang = target_lang
        else:
            source_lang = 'en' if target_lang == 'fr' else 'fr'

        return translated_text, source_lang

    except Exception as e:
        print(f"LLM translation error: {e}")
        return query, 'unknown'

def simple_keyword_preprocessing(text):
    """Simple preprocessing for keyword matching - handles case, accents and basic plurals"""
    # Convert to lowercase and remove accents
    text = unidecode(str(text).lower())

    # Basic plural handling - just remove trailing 's' and 'x'
    words = text.split()
    processed_words = []

    for word in words:
        # Remove common plural endings
        if word.endswith('s') and len(word) > 3 and not word.endswith('ss'):
            word = word[:-1]
        elif word.endswith('x') and len(word) > 3:
            word = word[:-1]
        processed_words.append(word)

    return processed_words

def find_similar_dataset_bm25(query, df):
    """Find the most similar dataset using BM25 keyword matching"""
    global bm25_model, precomputed_titles

    # Translate query to French for better matching with French datasets
    translated_query, original_lang = translate_query_llm(query, target_lang='fr')

    # Combine original and translated queries for search
    search_queries = [query, translated_query] if query != translated_query else [query]

    # Get dataset titles
    dataset_titles = df['title'].fillna('').tolist()

    # Use pre-computed BM25 model if available and matches current dataset
    if (bm25_model is not None and precomputed_titles is not None and
            len(dataset_titles) == len(precomputed_titles) and dataset_titles == precomputed_titles):
        print("🚀 Using pre-computed BM25 model for fast matching")
        bm25 = bm25_model
    else:
        # Build BM25 model at runtime
        print("⚠️ Computing BM25 model at runtime...")
        # Preprocess all dataset titles into tokenized form
        processed_titles = [simple_keyword_preprocessing(title) for title in dataset_titles]
        bm25 = BM25Okapi(processed_titles)

    best_score = -1
    best_idx = 0

    for search_query in search_queries:
        try:
            # Preprocess the search query
            processed_query = simple_keyword_preprocessing(search_query)

            # Get BM25 scores for all documents
            scores = bm25.get_scores(processed_query)

            max_score = scores.max()
            max_idx = scores.argmax()
            if max_score > best_score:
                best_score = max_score
                best_idx = max_idx
        except Exception as e:
            print(f"Error processing query '{search_query}': {e}")
            continue

    # Show top 5 matches for comparison
    if len(search_queries) > 0:
        processed_query = simple_keyword_preprocessing(search_queries[0])
        scores = bm25.get_scores(processed_query)
    return best_idx, best_score, translated_query, original_lang

def create_progress_callback():
    """Create a callback function for tracking agent progress"""

    def progress_callback(memory_step, agent=None):
        """Callback function called at each agent step"""
        step_number = memory_step.step_number

        # Extract information about the current step
        if hasattr(memory_step, 'action_input') and memory_step.action_input:
            action_content = memory_step.action_input
        elif hasattr(memory_step, 'action_output') and memory_step.action_output:
            action_content = str(memory_step.action_output)
        else:
            action_content = ""

        # Define progress based on step content and number
        progress_val = min(0.1 + (step_number * 0.03), 0.95)  # Progressive increase

        # Analyze the step content to provide meaningful status
        action_lower = action_content.lower() if action_content else ""

        if "visit_webpage" in action_lower or "examining" in action_lower:
            description = f"🔍 Step {step_number}: Examining webpage..."
        elif "get_all_links" in action_lower or "links" in action_lower:
            description = f"🔗 Step {step_number}: Extracting data links..."
        elif "read_file_from_url" in action_lower or "reading" in action_lower:
            description = f"📊 Step {step_number}: Loading dataset..."
        elif "get_dataset_description" in action_lower or "description" in action_lower:
            description = f"📋 Step {step_number}: Analyzing dataset structure..."
        elif "department" in action_lower or "region" in action_lower:
            description = f"🗺️ Step {step_number}: Processing geographic data..."
        elif "plot" in action_lower or "map" in action_lower or "france" in action_lower:
            description = f"🗺️ Step {step_number}: Creating France map..."
        elif "visualization" in action_lower or "chart" in action_lower:
            description = f"📈 Step {step_number}: Generating visualizations..."
        elif "save" in action_lower or "png" in action_lower:
            description = f"💾 Step {step_number}: Saving visualizations..."
        elif "pdf" in action_lower or "report" in action_lower:
            description = f"📄 Step {step_number}: Creating PDF report..."
        elif hasattr(memory_step, 'error') and memory_step.error:
            description = f"⚠️ Step {step_number}: Handling error..."
        else:
            description = f"🤖 Step {step_number}: Processing..."

        # Check if this is the final step
        if hasattr(memory_step, 'action_output') and memory_step.action_output and "final" in action_lower:
            progress_val = 1.0
            description = "✅ Analysis complete!"

        # Put the progress update in the queue
        try:
            progress_queue.put((progress_val, description))
        except:
            pass

    return progress_callback

def run_agent_analysis_with_progress(query, progress_callback, df=None, page_url_callback=None, data_gouv_page=None, most_similar_idx=None):
    """
    Run the agent analysis with progress tracking using smolagents callbacks.
    """
    try:
        # Clean up previous results
        if os.path.exists('generated_data'):
            for file in glob.glob('generated_data/*'):
                try:
                    os.remove(file)
                except:
                    pass
        else:
            os.makedirs('generated_data', exist_ok=True)

        # If dataset info not provided, find it (fallback)
        if data_gouv_page is None or most_similar_idx is None:
            progress_callback(0.02, "🤖 Initializing LLM translator and BM25...")
            initialize_models()

            progress_callback(0.05, "🔍 Searching for relevant datasets (using BM25 keyword matching)...")

            # Read the filtered dataset if not provided
            if df is None:
                df = pd.read_csv('filtered_dataset.csv')

            # Find the most similar dataset using BM25 keyword matching
            most_similar_idx, similarity_score, translated_query, original_lang = find_similar_dataset_bm25(query, df)
            data_gouv_page = df.iloc[most_similar_idx]['url']

            # Immediately show the page URL via callback
            if page_url_callback:
                page_url_callback(data_gouv_page)

            progress_callback(0.08, "🤖 Initializing agent...")
        else:
            # Dataset already found, continue from where we left off
            progress_callback(0.09, "🤖 Initializing agent...")

        step_callback = create_progress_callback()

        progress_callback(0.1, "🤖 Starting agent analysis...")

        # Create the agent with progress callback
        web_agent = create_web_agent(step_callback)
        prompt = generate_prompt(data_gouv_page)

        # Run the agent - the step_callbacks will automatically update progress
        answer = web_agent.run(prompt)

        # Check if the agent found no processable data
        answer_lower = str(answer).lower() if answer else ""
        if ("no processable data" in answer_lower or
                "no csv nor json" in answer_lower or
                "cannot find csv" in answer_lower or
                "cannot find json" in answer_lower or
                "no data to process" in answer_lower):
            progress_callback(1.0, "❌ No CSV/JSON files found in the dataset")
            return "❌ No CSV/JSON files found in the selected dataset. This dataset cannot be processed automatically.", [], data_gouv_page

        # Check if files were generated
        generated_files = glob.glob('generated_data/*')

        if generated_files:
            progress_callback(1.0, "✅ Analysis completed successfully!")
            return "Analysis completed successfully!", generated_files, data_gouv_page
        else:
            progress_callback(1.0, "⚠️ Analysis completed but no files were generated.")
            return "Analysis completed but no files were generated.", [], data_gouv_page

    except Exception as e:
        progress_callback(1.0, f"❌ Error: {str(e)}")
        return f"Error during analysis: {str(e)}", [], None

def search_and_analyze(query, progress=gr.Progress()):
    """
    Main function called when the search button is clicked.
    Uses Gradio's progress bar for visual feedback.
    """
    # Read the filtered dataset first
    df = pd.read_csv('filtered_dataset.csv')

    # If no query provided, randomly select one weighted by quality score
    if not query.strip():
        progress(0, desc="🎲 No query provided - selecting random high-quality dataset...")

        # Use quality_score as weights for random selection
        if 'quality_score' in df.columns:
            # Ensure quality scores are positive for weighting
            weights = df['quality_score'].fillna(0)
            weights = weights - weights.min() + 0.1  # Shift to make all positive
        else:
            weights = None

        # Randomly sample one dataset weighted by quality
        selected_row = df.sample(n=1, weights=weights).iloc[0]
        query = selected_row['title']

        progress(0.02, f"🎯 Random selection: {query[:60]}...")

    # Clear the progress queue
    while not progress_queue.empty():
        try:
            progress_queue.get_nowait()
        except queue.Empty:
            break

    # Initialize outputs
    pdf_file = None
    images_output = [gr.Image(visible=False)] * 4
    status = "🚀 Starting analysis..."

    # Initial progress
    progress(0.05, desc="🚀 Initializing...")

    def progress_callback(progress_val, description):
        """Callback function to update progress - puts updates in queue"""
        try:
            progress_queue.put((progress_val, description))
        except:
            pass

    # Run analysis in a separate thread
    result_queue = queue.Queue()

    # Store the page URL to show immediately (kept for compatibility)
    page_url_to_show = None

    def page_url_callback(url):
        nonlocal page_url_to_show
        page_url_to_show = url

    # Find and show the page URL immediately FIRST
    initialize_models()
    progress(0.06, desc="🔍 Finding relevant dataset...")
    most_similar_idx, similarity_score, translated_query, original_lang = find_similar_dataset_bm25(query, df)
    data_gouv_page = df.iloc[most_similar_idx]['url']
    dataset_title = df.iloc[most_similar_idx]['title']

    progress(0.07, desc=f"📋 Found dataset: {dataset_title[:50]}...")

    # Now start the analysis thread with the found dataset info
    def run_analysis():
        try:
            # Pass the already found dataset info to the analysis function
            result = run_agent_analysis_with_progress(query, progress_callback, df, page_url_callback, data_gouv_page, most_similar_idx)
            result_queue.put(result)
        except Exception as e:
            result_queue.put((f"Error: {str(e)}", [], data_gouv_page))

    analysis_thread = threading.Thread(target=run_analysis)
    analysis_thread.start()

    # Show page URL immediately by returning current state
    current_page_display = gr.Textbox(value=data_gouv_page, visible=True)
    current_status = "🔗 Page found - starting analysis..."

    # Initial update to show the page URL immediately
    progress(0.08, desc="🔗 Page found - starting analysis...")

    # Monitor progress while analysis runs
    last_progress = 0.08

    while analysis_thread.is_alive() or not result_queue.empty():
        try:
            # Check for progress updates from queue
            try:
                progress_val, description = progress_queue.get(timeout=0.1)
                if progress_val > last_progress:
                    last_progress = progress_val
                    current_status = description
                    progress(progress_val, desc=description)
            except queue.Empty:
                pass

            # Check if analysis is complete
            try:
                final_status, files, page_url = result_queue.get(timeout=0.1)

                # Check if this is a "no data" case
                if "❌ No CSV/JSON files found" in final_status:
                    progress(1.0, desc="❌ No processable data found")
                    return (gr.Textbox(value=page_url if page_url else data_gouv_page, visible=True),
                            final_status,
                            gr.File(visible=False),
                            gr.Image(visible=False), gr.Image(visible=False),
                            gr.Image(visible=False), gr.Image(visible=False))

                # Final progress update
                progress(1.0, desc="✅ Processing results...")

                # Process results
                pdf_file = None
                png_files = []

                for file in files:
                    if file.endswith('.pdf'):
                        pdf_file = file
                    elif file.endswith('.png'):
                        png_files.append(file)

                # Prepare final outputs
                download_button = gr.File(value=pdf_file, visible=True) if pdf_file else None

                # Prepare images for display (up to 4 images)
                images = []
                for i in range(4):
                    if i < len(png_files):
                        images.append(gr.Image(value=png_files[i], visible=True))
                    else:
                        images.append(gr.Image(visible=False))

                # Final progress completion
                progress(1.0, desc="🎉 Complete!")

                return gr.Textbox(value=page_url if page_url else data_gouv_page, visible=True), final_status, download_button, *images

            except queue.Empty:
                pass

            time.sleep(0.5)  # Small delay to prevent excessive updates

        except Exception as e:
            progress(1.0, desc=f"❌ Error: {str(e)}")
            return gr.Textbox(value=data_gouv_page, visible=True), f"❌ Error: {str(e)}", None, *images_output

    # Ensure thread completes
    analysis_thread.join(timeout=1)

    # Fallback return
    progress(1.0, desc="🏁 Finished")
    return gr.Textbox(value=data_gouv_page, visible=True), current_status, pdf_file, *images_output

# Create the Gradio interface
with gr.Blocks(title="🤖 French Public Data Analysis Agent", theme=gr.themes.Soft(), css="""
.gradio-container {
    max-width: 1200px !important;
    margin: auto;
    width: 100% !important;
}
.main-header {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
    box-shadow: 0 8px 32px rgba(0,0,0,0.1);
}
.accordion-content {
    overflow: hidden !important;
    width: 100% !important;
}
.gr-accordion {
    width: 100% !important;
    max-width: 100% !important;
}
.gr-accordion .gr-row {
    width: 100% !important;
    max-width: 100% !important;
    margin: 0 !important;
}
.gr-accordion .gr-column {
    min-width: 0 !important;
    flex: 1 !important;
    max-width: 50% !important;
    padding-right: 1rem !important;
}
.gr-accordion .gr-column:last-child {
    padding-right: 0 !important;
    padding-left: 1rem !important;
}
""") as demo:

    # Main header with better styling
    gr.HTML("""
    <div class="main-header">
        <h1 style="margin: 0; font-size: 2.5rem; font-weight: bold;">
            🤖 French Public Data Analysis Agent
        </h1>
        <p style="font-size: 1.2rem; opacity: 0.9;">
            Intelligent analysis of French public datasets with AI-powered insights
        </p>
    </div>
    """)

    # What this agent does
    gr.HTML("""
    <div style="text-align: center; background: #f8fafc; padding: 1.5rem; border-radius: 10px; margin: 1rem 0;">
        <p style="font-size: 1.1rem; color: #374151; margin: 0;">
            🌐 <strong>Search in French or English</strong> • 🗺️ <strong>Generate Reports with visualizations from the data</strong>
        </p>
    </div>
    """)

    # Tips & Information accordion - moved to the top
    with gr.Accordion("💡 Tips & Information", open=False):
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                🎯 **How to Use:**
                - Enter any search term related to French public data
                - Leave empty to randomly select a high-quality dataset
                - Results include visualizations and downloadable reports

                ⏱️ **Processing Time:**
                - Report generation takes 5-10 minutes depending on dataset complexity
                - Larger datasets may require additional processing time
                """)
            with gr.Column():
                gr.Markdown("""
                ⚠️ **Important Notes:**
                - Still a work in progress, might be better to start with the example queries
                - Some datasets may not contain processable CSV/JSON files
                - All visualizations are automatically generated
                - Maps focus on France when geographic data is available

                🌐 **Language Support:**
                - Search in French or English - queries are automatically translated
                """)

    with gr.Row():
        query_input = gr.Textbox(
            label="Search Query",
            placeholder="e.g., road traffic accidents, education, housing (or leave empty for random selection)",
            scale=4
        )
        search_button = gr.Button(
            "🚀 Analyze Dataset",
            variant="primary",
            scale=1,
            size="lg"
        )

    # Quick Start Examples row
    with gr.Row():
        gr.HTML("""
        <div>
            <h3 style="color: #374151">🚀 Quick Start Examples</h3>
            <p style="color: #6b7280">Click any example below to get started</p>
        </div>
        """)

    with gr.Row():
        examples = [
            ("🚗 Road Traffic Accidents 2005 - 2023", "road traffic accidents 2005 - 2023"),
            ("🎓 Education Directory", "education directory"),
            ("🏠 French Vacant Housing Private Park", "French vacant housing private park"),
        ]

        for emoji_text, query_text in examples:
            gr.Button(
                emoji_text,
                variant="secondary",
                size="sm"
            ).click(
                lambda x=query_text: x,
                outputs=query_input
            )

    # Page info and analysis status with progress bar
    with gr.Group():
        page_url_display = gr.Textbox(label="🔗 Page Started On", interactive=False, visible=False)
        with gr.Row():
            status_output = gr.Textbox(label="📊 Analysis Status", interactive=False, scale=1)

    # Download section
    with gr.Row():
        download_button = gr.File(
            label="📄 Download PDF Report",
            visible=False
        )

    gr.Markdown("---")
    gr.HTML("""
    <div style="text-align: center; margin: 2rem 0;">
        <h2 style="color: #374151; margin-bottom: 0.5rem;">📊 Generated Visualizations</h2>
        <p style="color: #6b7280; margin: 0;">Automatically generated charts and maps will appear below</p>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            image1 = gr.Image(label="📈 Chart 1", visible=False, height=400)
            image2 = gr.Image(label="📊 Chart 2", visible=False, height=400)
        with gr.Column():
            image3 = gr.Image(label="🗺️ Map/Chart 3", visible=False, height=400)
            image4 = gr.Image(label="📉 Chart 4", visible=False, height=400)

    # Set up the search button click event with progress bar
    search_button.click(
        fn=search_and_analyze,
        inputs=[query_input],
        outputs=[page_url_display, status_output, download_button, image1, image2, image3, image4],
        show_progress="full"  # Show the built-in progress bar
    )


if __name__ == "__main__":
    demo.queue()  # Enable queuing for real-time updates
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
filtered_dataset.csv
ADDED
The diff for this file is too large to render.
france_data/departements.geojson
ADDED
The diff for this file is too large to render.
france_data/regions.geojson
ADDED
The diff for this file is too large to render.
launch_gradio.py
ADDED
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
"""
Launch script for the Data Analysis Agent Gradio Interface
"""

import sys
import os

# Add the current directory to Python path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

try:
    from app import demo

    print("🚀 Starting Data Analysis Agent...")
    print("📊 The interface will be available at: http://localhost:7860")
    print("🌐 A shareable link will also be provided")
    print("\n" + "="*50)

    # Launch the interface
    demo.launch()

except ImportError as e:
    print(f"❌ Import Error: {e}")
    print("\n💡 Make sure you have installed all dependencies:")
    print("   pip install -r requirements.txt")
    sys.exit(1)
except Exception as e:
    print(f"❌ Error launching interface: {e}")
    sys.exit(1)
reexport_data.py
ADDED
@@ -0,0 +1,97 @@
import pandas as pd

# Read the CSV file
df = pd.read_csv('dataset_metadata/full_dts_list.csv', sep=";")

# Print all columns
print("Columns in the dataset:")
print(df.columns.tolist())
print("\n" + "="*50 + "\n")

# Print unique values for license column
print("Unique values in 'license' column:")
if 'license' in df.columns:
    unique_licences = df['license'].unique()
    for i, licence in enumerate(unique_licences, 1):
        print(f"{i}. {licence}")

    print(f"\nTotal unique license values: {len(unique_licences)}")

    # Also show value counts for license column
    print("\nLicense value counts:")
    print(df['license'].value_counts())
else:
    print("Column 'license' not found in the dataset.")
    print("Available columns are:", df.columns.tolist())

print("\n" + "="*50 + "\n")

# Select only the required columns
required_columns = ["title", "url", "license", "quality_score"]

# Check which columns exist
existing_columns = [col for col in required_columns if col in df.columns]
missing_columns = [col for col in required_columns if col not in df.columns]

print(f"Found columns: {existing_columns}")
if missing_columns:
    print(f"Missing columns: {missing_columns}")

# Select only existing columns
df_filtered = df[existing_columns].copy()

print(f"\nOriginal dataset shape: {df.shape}")
print(f"After selecting columns: {df_filtered.shape}")

# Filter out rows where license is NaN
df_filtered = df_filtered.dropna(subset=['license'])

print(f"After removing NaN license values: {df_filtered.shape}")
# # Filter for datasets that include France in spatial granularity
# if 'spatial.zones' in df_filtered.columns:
#     # Check unique values in spatial.granularity before filtering
#     print(f"\nUnique values in 'spatial.zones' column (first 10):")
#     unique_spatial = df_filtered['spatial.zones'].dropna().unique()
#     for i, value in enumerate(unique_spatial[:10], 1):
#         print(f"{i}. {value}")
#     if len(unique_spatial) > 10:
#         print(f"... and {len(unique_spatial) - 10} more values")
#
#     # Filter for France (case-insensitive search)
#     france_filter = df_filtered['spatial.zones'].str.contains('France', case=False, na=False)
#     df_filtered = df_filtered[france_filter]
#
#     print(f"After filtering for France in spatial zones: {df_filtered.shape}")
# else:
#     print("Warning: 'spatial.zones' column not found, skipping France filter")

# Filter by quality score (keep only >= 0.8)
if 'quality_score' in df_filtered.columns:
    print(f"Before quality filtering: {df_filtered.shape}")
    df_filtered = df_filtered[df_filtered['quality_score'] >= 0.8]
    print(f"After filtering quality_score >= 0.8: {df_filtered.shape}")

    # Sort by quality score (descending order - highest quality first)
    df_filtered = df_filtered.sort_values('quality_score', ascending=False)
    print(f"Dataset sorted by quality_score (highest first)")

    # Show quality score distribution
    if not df_filtered.empty:
        print(f"Quality score range: {df_filtered['quality_score'].min():.2f} - {df_filtered['quality_score'].max():.2f}")
else:
    print("Warning: 'quality_score' column not found, skipping quality filtering and sorting")

# Save to CSV
# Drop license column before saving
df_filtered = df_filtered.drop('license', axis=1)
output_file = 'filtered_dataset.csv'
df_filtered.to_csv(output_file, index=False)

print(f"\nFiltered dataset saved to: {output_file}")
print(f"Final dataset contains {len(df_filtered)} rows and {len(df_filtered.columns)} columns")

print("\n" + "="*50)
print("✅ Pre-processing complete!")
print("Files created:")
print(f"  - {output_file}: Filtered dataset")
print("="*50)
requirements.txt
ADDED
@@ -0,0 +1,22 @@
pandas
shapely
geopandas
numpy
rtree
pyproj
matplotlib
requests
duckduckgo-search
smolagents[toolkit]
smolagents[litellm]
python-dotenv
beautifulsoup4
reportlab>=3.6.0
scikit-learn
gradio
pypdf2
python-docx
scipy
openpyxl
unidecode
rank_bm25
tools/drawing_tools.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import geopandas as gpd
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import requests
|
4 |
+
import os
|
5 |
+
import pandas as pd
|
6 |
+
from smolagents import tool
|
7 |
+
from typing import Dict, Tuple, Optional
|
8 |
+
from matplotlib.figure import Figure
|
9 |
+
from matplotlib.axes import Axes
|
10 |
+
from shapely.geometry.base import BaseGeometry
|
11 |
+
|
12 |
+
def _download_geojson(url: str, file_name: str) -> str:
|
13 |
+
"""Downloads a GeoJSON file if it doesn't exist.
|
14 |
+
|
15 |
+
Args:
|
16 |
+
url (str): The URL of the GeoJSON file.
|
17 |
+
file_name (str): The name of the file to save the data in.
|
18 |
+
|
19 |
+
Returns:
|
20 |
+
str: The path to the downloaded file.
|
21 |
+
"""
|
22 |
+
data_dir = "france_data"
|
23 |
+
if not os.path.exists(data_dir):
|
24 |
+
os.makedirs(data_dir)
|
25 |
+
|
26 |
+
file_path = os.path.join(data_dir, file_name)
|
27 |
+
|
28 |
+
if not os.path.exists(file_path):
|
29 |
+
print(f"Downloading {file_name} from {url}...")
|
30 |
+
response = requests.get(url)
|
31 |
+
response.raise_for_status() # Raise an exception for bad status codes
|
32 |
+
|
33 |
+
with open(file_path, 'w') as f:
|
34 |
+
f.write(response.text)
|
35 |
+
print("Download complete.")
|
36 |
+
|
37 |
+
return file_path
|
38 |
+
|
39 |
+
def get_france_geodata(level: str = 'regions') -> gpd.GeoDataFrame:
|
40 |
+
"""Gets a GeoDataFrame for Metropolitan France with its regions or departments.
|
41 |
+
|
42 |
+
Args:
|
43 |
+
level (str): The administrative level to draw ('regions' or 'departments').
|
44 |
+
|
45 |
+
Returns:
|
46 |
+
gpd.GeoDataFrame: A GeoDataFrame with the requested administrative level.
|
47 |
+
"""
|
48 |
+
if level == 'regions':
|
49 |
+
url = "https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/regions.geojson"
|
50 |
+
file_name = "regions.geojson"
|
51 |
+
elif level == 'departments':
|
52 |
+
url = "https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements.geojson"
|
53 |
+
file_name = "departements.geojson"
|
54 |
+
else:
|
55 |
+
raise ValueError("level must be 'regions' or 'departments'")
|
56 |
+
|
57 |
+
geojson_path = _download_geojson(url, file_name)
|
58 |
+
gdf = gpd.read_file(geojson_path)
|
59 |
+
|
60 |
+
# Although the geojson files are for metropolitan France, we can filter to be safe.
|
61 |
+
if level == 'regions':
|
62 |
+
# Metropolitan region codes are between 11 and 94.
|
63 |
+
gdf['code'] = gdf['code'].astype(int)
|
64 |
+
france_metropolitan = gdf[gdf['code'].between(11, 94)]
|
65 |
+
    else: # departments
        # Metropolitan department codes are 01-19, 21-95, 2A, 2B. Corsica (20) is split.
        metro_codes = [f'{i:02d}' for i in range(1, 20)] + [f'{i:02d}' for i in range(21, 96)] + ['2A', '2B']
        france_metropolitan = gdf[gdf['code'].isin(metro_codes)]

    france_metropolitan = france_metropolitan.to_crs(epsg=2154)
    return france_metropolitan

@tool
def draw_france_map(level: str = 'regions') -> Tuple[Figure, Axes]:
    """Draws a map of Metropolitan France with its regions or departments.

    Args:
        level (str): The administrative level to draw ('regions' or 'departments').

    Returns:
        Tuple[Figure, Axes]: A tuple containing the Matplotlib figure and axes objects.
    """
    france_metropolitan = get_france_geodata(level)

    fig, ax = plt.subplots(1, 1, figsize=(15, 12))

    # Plot with a single color
    france_metropolitan.plot(ax=ax, color='lightgray', edgecolor='black')

    minx, miny, maxx, maxy = france_metropolitan.total_bounds

    padding = 0.1
    ax.set_xlim(minx - padding * (maxx - minx), maxx + padding * (maxx - minx))
    ax.set_ylim(miny - padding * (maxy - miny), maxy + padding * (maxy - miny))

    ax.set_aspect('equal', adjustable='box')
    ax.set_axis_off()
    ax.set_title(f'Metropolitan France with {level.capitalize()}', fontsize=20)

    return fig, ax

@tool
def get_geodata_mapping(level: str = 'regions') -> Dict[str, BaseGeometry]:
    """Returns a mapping from region/department name to its polygon.

    Args:
        level (str): The administrative level to get the mapping for ('regions' or 'departments').

    Returns:
        Dict[str, BaseGeometry]: A dictionary mapping the name to the polygon.
    """
    france_metropolitan = get_france_geodata(level)

    mapping = {row['nom']: row['geometry'] for _, row in france_metropolitan.iterrows()}

    return mapping

@tool
def plot_geodata(geodata: gpd.GeoDataFrame, ax: Axes, color: str = None, edgecolor: str = 'black', alpha: float = 1.0, output_path: Optional[str] = None) -> Optional[str]:
    """Plots geodata on a given map axes and optionally saves the map as an image file.

    Args:
        geodata (gpd.GeoDataFrame): The geodata to plot.
        ax (Axes): The axes to plot on.
        color (str, optional): The color for the geometries. Defaults to None.
        edgecolor (str, optional): The color for the geometry edges. Defaults to 'black'.
        alpha (float, optional): The alpha blending value, between 0 and 1. Defaults to 1.0.
        output_path (Optional[str], optional): Path to save the map image file (e.g., 'map.png'). Defaults to None.

    Returns:
        Optional[str]: The path to the saved file if output_path is provided, otherwise None.
    """
    # Ensure the geodata is in the same CRS as the base map (Lambert-93)
    geodata = geodata.to_crs(epsg=2154)
    geodata.plot(ax=ax, color=color, edgecolor=edgecolor, alpha=alpha)

    if output_path:
        fig = ax.get_figure()
        fig.savefig(output_path, bbox_inches='tight')

    return output_path

@tool
def plot_departments_data(
    data: pd.DataFrame,
    dep_col: str = 'dep',
    value_col: str = 'value',
    map_title: str = 'French Departments Data',
    output_path: Optional[str] = 'france_data.png'
) -> Optional[str]:
    """
    Plots data for French departments on a map of France.

    Args:
        data (pd.DataFrame): DataFrame with department data. Must contain at least two columns:
            one for department codes and one for the values to plot.
        dep_col (str): The name of the column in `data` that contains the department codes.
        value_col (str): The name of the column in `data` that contains the values to plot.
        map_title (str): The title of the map.
        output_path (Optional[str]): Path to save the map image file. If None, the plot is not saved.
            Defaults to 'france_data.png'.

    Returns:
        Optional[str]: The path to the saved file if output_path is provided, otherwise None.
    """
    # Get the geodata for departments
    departments_gdf = get_france_geodata('departments')

    # Ensure department codes in user data are strings for merging
    data[dep_col] = data[dep_col].astype(str).str.zfill(2)

    # Merge the geodata with the user's data
    merged_gdf = departments_gdf.merge(data, left_on='code', right_on=dep_col)

    # Create the plot
    fig, ax = plt.subplots(1, 1, figsize=(15, 12))
    ax.set_aspect('equal')
    ax.set_axis_off()

    # Plot the base map of all departments
    departments_gdf.plot(ax=ax, color='lightgray', edgecolor='black')

    # Plot the data on top
    if not merged_gdf.empty:
        merged_gdf.plot(column=value_col, ax=ax, legend=True, cmap='viridis')

    ax.set_title(map_title, fontsize=20)

    if output_path:
        fig.savefig(output_path, bbox_inches='tight')
        print(f"Map saved to {output_path}")
        return output_path

    return None

if __name__ == '__main__':
    # Create sample data for five departments (codes 5, 92, 63, 45, 32)
    sample_data = {
        'dep': [5, 92, 63, 45, 32],
        'value': [10, 50, 20, 30, 45]  # Some arbitrary values
    }
    data_df = pd.DataFrame(sample_data)

    print("Generating map with department data...")
    plot_departments_data(data_df, output_path='france_departments_data.png')
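
# Illustrative composition of the drawing tools above (a commented sketch; it assumes the
# region name 'Bretagne' is present in the 'nom' field of france_data/regions.geojson):
#
#   fig, ax = draw_france_map('regions')
#   regions = get_geodata_mapping('regions')
#   highlight = gpd.GeoDataFrame(geometry=[regions['Bretagne']], crs='EPSG:2154')
#   plot_geodata(highlight, ax, color='steelblue', alpha=0.7, output_path='bretagne.png')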
tools/exploration_tools.py
ADDED
@@ -0,0 +1,84 @@
import io
import pandas as pd
from smolagents import tool

@tool
def read_data(file_path: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file into a pandas DataFrame.

    Args:
        file_path: The path to the CSV, JSON, or Excel file.

    Returns:
        A pandas DataFrame with the loaded data, or an error message if the file cannot be read.
    """
    try:
        if file_path.lower().endswith('.csv'):
            df = pd.read_csv(file_path, delimiter=';')
        elif file_path.lower().endswith('.json'):
            df = pd.read_json(file_path)
        elif file_path.lower().endswith('.xlsx'):
            df = pd.read_excel(file_path, engine='openpyxl')
        else:
            return "Unsupported file format. Please use a CSV, JSON, or Excel (.xlsx) file."
        return df
    except Exception as e:
        return f"Error reading the data file: {str(e)}"

@tool
def get_dataset_description(df: pd.DataFrame) -> str:
    """
    Provides a description of the dataset, including info, description, and head.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        A string containing the description of the DataFrame.
    """
    try:
        # df.info() writes to a buffer and returns None, so capture its output explicitly
        buffer = io.StringIO()
        df.info(verbose=False, buf=buffer)
        info = buffer.getvalue()
        description = df.describe()
        head = df.head()
        return f"Info:\n{info}\n\nDescription:\n{description}\n\nHead:\n{head}"
    except Exception as e:
        return f"Error describing the DataFrame: {str(e)}"

@tool
def get_value_counts(df: pd.DataFrame, column_name: str) -> str:
    """
    Gets the value counts for a specified column in the DataFrame.

    Args:
        df: The pandas DataFrame.
        column_name: The name of the column to get the value counts for.

    Returns:
        A string containing the value counts for the column, or an error message.
    """
    try:
        value_counts = df[column_name].value_counts().to_string()
        return f"Value counts for column '{column_name}':\n{value_counts}"
    except KeyError:
        return f"Error: Column '{column_name}' not found in the DataFrame."
    except Exception as e:
        return f"Error getting value counts: {str(e)}"

@tool
def get_correlation_matrix(df: pd.DataFrame) -> str:
    """
    Computes and returns the correlation matrix for the numerical columns in the DataFrame.

    Args:
        df: The pandas DataFrame.

    Returns:
        A string containing the correlation matrix, or an error message.
    """
    try:
        # Select only numeric columns for correlation matrix
        numeric_df = df.select_dtypes(include=['number'])
        correlation_matrix = numeric_df.corr().to_string()
        return f"Correlation Matrix:\n{correlation_matrix}"
    except Exception as e:
        return f"Error computing correlation matrix: {str(e)}"
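
if __name__ == "__main__":
    # Illustrative smoke test (a minimal sketch): exercise the exploration tools on a local
    # CSV. It assumes a semicolon-delimited file such as filtered_dataset.csv is present,
    # matching the delimiter that read_data expects.
    df = read_data("filtered_dataset.csv")
    if isinstance(df, pd.DataFrame):
        print(get_dataset_description(df))
        print(get_correlation_matrix(df))
    else:
        print(df)  # read_data returned an error message instead of a DataFrame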
tools/libreoffice_tools.py
ADDED
@@ -0,0 +1,155 @@
import os
import subprocess
import platform
from smolagents import tool

def get_libreoffice_path():
    """
    Get the correct LibreOffice path based on the operating system.

    Returns:
        str: Path to LibreOffice executable or None if not found
    """
    system = platform.system()

    if system == "Darwin":  # macOS
        # Common LibreOffice installation paths on macOS
        possible_paths = [
            "/Applications/LibreOffice.app/Contents/MacOS/soffice",
            "/Applications/LibreOffice Developer Edition.app/Contents/MacOS/soffice",
            "/opt/homebrew/bin/soffice",  # Homebrew installation
            "/usr/local/bin/soffice"
        ]

        for path in possible_paths:
            if os.path.exists(path):
                return path

    elif system == "Linux":
        # Common LibreOffice paths on Linux
        possible_paths = [
            "/usr/bin/libreoffice",
            "/usr/bin/soffice",
            "/snap/bin/libreoffice",
            "/usr/local/bin/libreoffice"
        ]

        for path in possible_paths:
            if os.path.exists(path):
                return path

    elif system == "Windows":
        # Common LibreOffice paths on Windows
        possible_paths = [
            r"C:\Program Files\LibreOffice\program\soffice.exe",
            r"C:\Program Files (x86)\LibreOffice\program\soffice.exe"
        ]

        for path in possible_paths:
            if os.path.exists(path):
                return path

    # Try to find it in PATH as a fallback
    try:
        result = subprocess.run(['which', 'soffice'], capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout.strip()
    except Exception:
        pass

    try:
        result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout.strip()
    except Exception:
        pass

    return None

@tool
def convert_to_pdf_with_libreoffice(input_file: str, output_dir: str = None) -> str:
    """
    Convert a document to PDF using LibreOffice.

    Args:
        input_file: Path to the input document
        output_dir: Directory to save the PDF (optional, defaults to same directory as input)

    Returns:
        str: Path to the generated PDF file or error message
    """
    libreoffice_path = get_libreoffice_path()

    if not libreoffice_path:
        return "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/"

    if not os.path.exists(input_file):
        return f"Input file not found: {input_file}"

    if output_dir is None:
        output_dir = os.path.dirname(input_file)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    try:
        # Use LibreOffice headless mode to convert to PDF
        cmd = [
            libreoffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            input_file
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        if result.returncode == 0:
            # Generate expected output filename
            base_name = os.path.splitext(os.path.basename(input_file))[0]
            pdf_path = os.path.join(output_dir, f"{base_name}.pdf")

            if os.path.exists(pdf_path):
                return pdf_path
            else:
                return f"PDF conversion completed but file not found at expected location: {pdf_path}"
        else:
            return f"LibreOffice conversion failed: {result.stderr}"

    except subprocess.TimeoutExpired:
        return "LibreOffice conversion timed out after 60 seconds"
    except Exception as e:
        return f"Error during LibreOffice conversion: {str(e)}"

@tool
def check_libreoffice_availability() -> str:
    """
    Check if LibreOffice is available and return its path and version.

    Returns:
        str: Information about LibreOffice availability
    """
    libreoffice_path = get_libreoffice_path()

    if not libreoffice_path:
        system = platform.system()
        install_info = {
            "Darwin": "Install with: brew install libreoffice OR download from https://www.libreoffice.org/",
            "Linux": "Install with: sudo apt install libreoffice OR sudo yum install libreoffice",
            "Windows": "Download from https://www.libreoffice.org/"
        }

        return f"LibreOffice not found on {system}. {install_info.get(system, 'Install from https://www.libreoffice.org/')}"

    try:
        # Get version info
        result = subprocess.run([libreoffice_path, '--version'], capture_output=True, text=True, timeout=10)
        version_info = result.stdout.strip() if result.returncode == 0 else "Version unknown"

        return f"LibreOffice found at: {libreoffice_path}\nVersion: {version_info}"
    except Exception:
        return f"LibreOffice found at: {libreoffice_path}\nVersion: Unable to determine"

if __name__ == "__main__":
    # Test the LibreOffice detection
    print(check_libreoffice_availability())
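    # Illustrative follow-up (a commented sketch; the input path below is hypothetical).
    # The tool returns either a PDF path or a human-readable error string, so callers can
    # branch on the result instead of catching exceptions:
    #
    #   result = convert_to_pdf_with_libreoffice("generated_data/report.odt")
    #   if result.lower().endswith(".pdf"):
    #       print(f"PDF written to {result}")
    #   else:
    #       print(f"Conversion failed: {result}")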
tools/webpage_tools.py
ADDED
@@ -0,0 +1,168 @@
import requests
from smolagents import tool
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from dotenv import load_dotenv
import pandas as pd
import json
from io import StringIO, BytesIO

load_dotenv()

@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its full DOM content.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The DOM of the webpage as a string, or an error message if the request fails.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes

        return response.text

    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"

@tool
def get_all_links(html_content: str, base_url: str) -> list[str]:
    """
    Finds all links to CSV, JSON, and Excel (.xlsx) files in the given HTML content.

    Args:
        html_content: The HTML content of a webpage.
        base_url: The base URL of the webpage to resolve relative links.

    Returns:
        A list of all unique absolute URLs to CSV, JSON, or Excel files found on the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Join the base URL with the found href to handle relative links
        absolute_url = urljoin(base_url, href)
        if absolute_url.lower().endswith(('.csv', '.json', '.xlsx')):
            links.add(absolute_url)
    return list(links)

@tool
def read_csv_file(file_path: str) -> str:
    """
    Reads a CSV file and returns its content as a string.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The content of the CSV file as a string, or an error message if the file cannot be read.
    """
    try:
        df = pd.read_csv(file_path, delimiter=';')
        return df.to_string()
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"

@tool
def read_file_from_url(url: str) -> pd.DataFrame:
    """
    Reads a CSV, JSON, or Excel (.xlsx) file from a static URL and loads it into a pandas DataFrame.

    Args:
        url: The URL of the CSV, JSON, or Excel file to read.

    Returns:
        A pandas DataFrame containing the data from the file, or raises an exception if the file cannot be read.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Handle encoding properly
        if response.encoding is None or response.encoding.lower() in ['iso-8859-1', 'ascii']:
            response.encoding = 'utf-8'

        # Determine file type based on URL extension
        if url.lower().endswith('.csv'):
            # Use BytesIO to handle encoding properly
            content_bytes = response.content

            # Try different delimiters for CSV files
            try:
                # First try comma separator
                df = pd.read_csv(BytesIO(content_bytes), encoding='utf-8')
            except Exception:
                try:
                    # Then try semicolon separator
                    df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='utf-8')
                except Exception:
                    try:
                        # Finally try tab separator
                        df = pd.read_csv(BytesIO(content_bytes), delimiter='\t', encoding='utf-8')
                    except Exception:
                        # Last resort: try latin-1 encoding
                        df = pd.read_csv(BytesIO(content_bytes), delimiter=';', encoding='latin-1')

        elif url.lower().endswith('.json'):
            # Parse JSON and convert to DataFrame - use proper encoding
            json_data = json.loads(response.text)

            # Handle different JSON structures
            if isinstance(json_data, list):
                df = pd.DataFrame(json_data)
            elif isinstance(json_data, dict):
                # If it's a dict, try to find the main data array
            if False: pass  # (placeholder removed)
        else:
            raise ValueError("Unsupported JSON structure")

    except RequestException as e:
        raise Exception(f"Error fetching the file from URL: {str(e)}")
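    # Illustrative follow-up (a commented sketch): the DataFrame returned by
    # read_file_from_url can be handed to the exploration tools defined in
    # tools/exploration_tools.py, e.g.
    #
    #   from tools.exploration_tools import get_dataset_description
    #   print(get_dataset_description(content))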