Commit
·
2dd2794
1
Parent(s):
b765960
update: docx use
Browse files- README.md +9 -10
- agent.py +4 -14
- app.py +9 -9
- requirements.txt +0 -1
- tools/followup_tools.py +18 -4
README.md
CHANGED
@@ -14,7 +14,7 @@ tag: agent-demo-track
|
|
14 |
|
15 |
# 🤖 French Public Data Analysis Agent
|
16 |
|
17 |
-
**AI-powered intelligent analysis of French public datasets** with automated visualization generation, comprehensive
|
18 |
|
19 |
## ✨ Features
|
20 |
|
@@ -48,10 +48,10 @@ tag: agent-demo-track
|
|
48 |
- **Follow-up Visualizations**: Generate additional charts based on user questions
|
49 |
|
50 |
### π **Comprehensive Reports**
|
51 |
-
- **Professional
|
52 |
- **Bilingual Support**: Reports generated in the same language as your query
|
53 |
- **Structured Analysis**: Title page, methodology, findings, and next steps
|
54 |
-
- **
|
55 |
- **Report Continuity**: Follow-up analysis references previous report context
|
56 |
|
57 |
### 🎨 **Modern Web Interface**
|
@@ -67,7 +67,6 @@ tag: agent-demo-track
|
|
67 |
### 1. Prerequisites
|
68 |
|
69 |
- Python 3.8+
|
70 |
-
- LibreOffice (for PDF generation)
|
71 |
- Google Gemini API key
|
72 |
|
73 |
### 2. Installation
|
@@ -140,7 +139,7 @@ After the initial analysis is complete:
|
|
140 |
|
141 |
### Results
|
142 |
|
143 |
-
- **Download
|
144 |
- **View Individual Charts**: Up to 4 visualizations displayed in the interface
|
145 |
- **Dataset Reference**: Direct link to the original data.gouv.fr page
|
146 |
- **Follow-up Visualizations**: Additional charts generated from follow-up questions
|
@@ -159,7 +158,7 @@ After the initial analysis is complete:
|
|
159 |
│   ├── webpage_tools.py # Web scraping and data extraction
|
160 |
│   ├── exploration_tools.py # Dataset analysis and description
|
161 |
│   ├── drawing_tools.py # France map generation and visualization
|
162 |
-
│   ├── libreoffice_tools.py #
|
163 |
│   ├── followup_tools.py # Follow-up analysis tools
|
164 |
β βββ retrieval_tools.py # Dataset search and retrieval
|
165 |
βββ filtered_dataset.csv # Pre-processed dataset index (5,000+ datasets)
|
@@ -176,7 +175,7 @@ After the initial analysis is complete:
|
|
176 |
- **Search**: BM25 keyword matching with TF-IDF preprocessing
|
177 |
- **Translation**: LLM-powered bilingual query translation
|
178 |
- **Visualization**: Matplotlib, Geopandas, Seaborn
|
179 |
-
- **
|
180 |
- **Data Processing**: Pandas, NumPy, Shapely, Scipy
|
181 |
- **Follow-up Analytics**: Statistical analysis, correlation studies, custom filtering β
|
182 |
|
@@ -222,8 +221,8 @@ After the initial analysis is complete:
|
|
222 |
- Try a different query or use the random selection
|
223 |
- Agent will automatically search for alternative datasets
|
224 |
|
225 |
-
2. **
|
226 |
-
- Ensure
|
227 |
- Check the console for specific error messages
|
228 |
|
229 |
3. **Translation errors**
|
@@ -301,7 +300,7 @@ pandas, shapely, geopandas, numpy, rtree, pyproj
|
|
301 |
matplotlib, requests, duckduckgo-search
|
302 |
smolagents[toolkit], smolagents[litellm]
|
303 |
dotenv, beautifulsoup4, reportlab>=3.6.0
|
304 |
-
scikit-learn, gradio,
|
305 |
scipy, openpyxl, unidecode, rank_bm25
|
306 |
```
|
307 |
|
|
|
14 |
|
15 |
# 🤖 French Public Data Analysis Agent
|
16 |
|
17 |
+
**AI-powered intelligent analysis of French public datasets** with automated visualization generation, comprehensive DOCX reports, and **interactive follow-up analysis capabilities**.
|
18 |
|
19 |
## ✨ Features
|
20 |
|
|
|
48 |
- **Follow-up Visualizations**: Generate additional charts based on user questions
|
49 |
|
50 |
### π **Comprehensive Reports**
|
51 |
+
- **Professional DOCX Reports**: Complete analysis with embedded visualizations
|
52 |
- **Bilingual Support**: Reports generated in the same language as your query
|
53 |
- **Structured Analysis**: Title page, methodology, findings, and next steps
|
54 |
+
- **Direct DOCX Generation**: No external dependencies required
|
55 |
- **Report Continuity**: Follow-up analysis references previous report context
|
56 |
|
57 |
### 🎨 **Modern Web Interface**
|
|
|
67 |
### 1. Prerequisites
|
68 |
|
69 |
- Python 3.8+
|
|
|
70 |
- Google Gemini API key
|
71 |
|
72 |
### 2. Installation
|
|
|
139 |
|
140 |
### Results
|
141 |
|
142 |
+
- **Download DOCX Report**: Complete analysis with all visualizations
|
143 |
- **View Individual Charts**: Up to 4 visualizations displayed in the interface
|
144 |
- **Dataset Reference**: Direct link to the original data.gouv.fr page
|
145 |
- **Follow-up Visualizations**: Additional charts generated from follow-up questions
|
|
|
158 |
│   ├── webpage_tools.py # Web scraping and data extraction
|
159 |
│   ├── exploration_tools.py # Dataset analysis and description
|
160 |
│   ├── drawing_tools.py # France map generation and visualization
|
161 |
+
│   ├── libreoffice_tools.py # Document utilities (legacy)
|
162 |
│   ├── followup_tools.py # Follow-up analysis tools
|
163 |
β βββ retrieval_tools.py # Dataset search and retrieval
|
164 |
βββ filtered_dataset.csv # Pre-processed dataset index (5,000+ datasets)
|
|
|
175 |
- **Search**: BM25 keyword matching with TF-IDF preprocessing
|
176 |
- **Translation**: LLM-powered bilingual query translation
|
177 |
- **Visualization**: Matplotlib, Geopandas, Seaborn
|
178 |
+
- **Report Generation**: python-docx for DOCX documents
|
179 |
- **Data Processing**: Pandas, NumPy, Shapely, Scipy
|
180 |
- **Follow-up Analytics**: Statistical analysis, correlation studies, custom filtering β
|
181 |
|
|
|
221 |
- Try a different query or use the random selection
|
222 |
- Agent will automatically search for alternative datasets
|
223 |
|
224 |
+
2. **DOCX report generation fails**
|
225 |
+
- Ensure python-docx is installed correctly
|
226 |
- Check the console for specific error messages
|
227 |
|
228 |
3. **Translation errors**
|
|
|
300 |
matplotlib, requests, duckduckgo-search
|
301 |
smolagents[toolkit], smolagents[litellm]
|
302 |
dotenv, beautifulsoup4, reportlab>=3.6.0
|
303 |
+
scikit-learn, gradio, python-docx
|
304 |
scipy, openpyxl, unidecode, rank_bm25
|
305 |
```
|
306 |
|
agent.py
CHANGED
@@ -11,11 +11,6 @@ from tools.exploration_tools import (
|
|
11 |
from tools.drawing_tools import (
|
12 |
plot_departments_data,
|
13 |
)
|
14 |
-
from tools.libreoffice_tools import (
|
15 |
-
convert_to_pdf_with_libreoffice,
|
16 |
-
check_libreoffice_availability,
|
17 |
-
get_libreoffice_info,
|
18 |
-
)
|
19 |
from tools.retrieval_tools import (
|
20 |
search_datasets,
|
21 |
get_dataset_info,
|
@@ -39,8 +34,6 @@ def create_web_agent(step_callback):
|
|
39 |
visit_webpage, get_all_links, read_file_from_url, save_dataset_for_followup,
|
40 |
get_dataset_description,
|
41 |
plot_departments_data,
|
42 |
-
convert_to_pdf_with_libreoffice,
|
43 |
-
check_libreoffice_availability, get_libreoffice_info,
|
44 |
search_datasets, get_dataset_info, get_random_quality_dataset
|
45 |
],
|
46 |
model=model,
|
@@ -49,7 +42,7 @@ def create_web_agent(step_callback):
|
|
49 |
planning_interval=3,
|
50 |
step_callbacks=[step_callback], # Use the built-in callback system
|
51 |
additional_authorized_imports=[
|
52 |
-
"
|
53 |
"os", "bs4", "io", "requests", "json", "pandas",
|
54 |
"matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy", "seaborn"
|
55 |
],
|
@@ -80,21 +73,18 @@ def generate_prompt(user_query=None, initial_search_results=None):
|
|
80 |
|
81 |
4. **Report Generation**:
|
82 |
- Write insightful analysis text for each visualization
|
83 |
-
- Generate a comprehensive
|
84 |
* Title page with dataset name and analysis overview
|
85 |
* All visualizations (PNG files) embedded in the report
|
86 |
* Analysis text for each visualization
|
87 |
* Conclusions and next steps
|
88 |
-
-
|
89 |
|
90 |
**Important Technical Notes:**
|
91 |
- Save everything in the generated_data folder
|
92 |
- Do NOT use the 'os' module
|
93 |
- Work step by step, don't generate too much code at once
|
94 |
-
-
|
95 |
-
- If check_libreoffice_availability() returns True, use convert_to_pdf_with_libreoffice() tool
|
96 |
-
- If check_libreoffice_availability() returns False, skip PDF conversion and inform user
|
97 |
-
- Do NOT use subprocess calls directly for LibreOffice
|
98 |
- If question is in English, report is in English. If in French, report is in French.
|
99 |
"""
|
100 |
|
|
|
11 |
from tools.drawing_tools import (
|
12 |
plot_departments_data,
|
13 |
)
|
|
|
|
|
|
|
|
|
|
|
14 |
from tools.retrieval_tools import (
|
15 |
search_datasets,
|
16 |
get_dataset_info,
|
|
|
34 |
visit_webpage, get_all_links, read_file_from_url, save_dataset_for_followup,
|
35 |
get_dataset_description,
|
36 |
plot_departments_data,
|
|
|
|
|
37 |
search_datasets, get_dataset_info, get_random_quality_dataset
|
38 |
],
|
39 |
model=model,
|
|
|
42 |
planning_interval=3,
|
43 |
step_callbacks=[step_callback], # Use the built-in callback system
|
44 |
additional_authorized_imports=[
|
45 |
+
"docx", "docx.*",
|
46 |
"os", "bs4", "io", "requests", "json", "pandas",
|
47 |
"matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy", "seaborn"
|
48 |
],
|
|
|
73 |
|
74 |
4. **Report Generation**:
|
75 |
- Write insightful analysis text for each visualization
|
76 |
+
- Generate a comprehensive DOCX report using python-docx library that includes:
|
77 |
* Title page with dataset name and analysis overview
|
78 |
* All visualizations (PNG files) embedded in the report
|
79 |
* Analysis text for each visualization
|
80 |
* Conclusions and next steps
|
81 |
+
- Save the final DOCX report in the generated_data folder
|
82 |
|
83 |
**Important Technical Notes:**
|
84 |
- Save everything in the generated_data folder
|
85 |
- Do NOT use the 'os' module
|
86 |
- Work step by step, don't generate too much code at once
|
87 |
+
- Generate a complete DOCX report that can be downloaded by the user
|
|
|
|
|
|
|
88 |
- If question is in English, report is in English. If in French, report is in French.
|
89 |
"""
|
90 |
|
app.py
CHANGED
@@ -206,8 +206,8 @@ def create_progress_callback():
|
|
206 |
description = f"π Step {step_number}: Generating visualizations..."
|
207 |
elif "save" in action_lower or "png" in action_lower:
|
208 |
description = f"πΎ Step {step_number}: Saving visualizations..."
|
209 |
-
elif "
|
210 |
-
description = f"π Step {step_number}: Creating
|
211 |
elif hasattr(memory_step, 'error') and memory_step.error:
|
212 |
description = f"β οΈ Step {step_number}: Handling error..."
|
213 |
else:
|
@@ -313,7 +313,7 @@ def search_and_analyze(query, progress=gr.Progress()):
|
|
313 |
break
|
314 |
|
315 |
# Initialize outputs
|
316 |
-
|
317 |
images_output = [gr.Image(visible=False)] * 4
|
318 |
status = "π Starting agent-driven analysis..."
|
319 |
|
@@ -440,17 +440,17 @@ def search_and_analyze(query, progress=gr.Progress()):
|
|
440 |
progress(1.0, desc="β
Processing results...")
|
441 |
|
442 |
# Process results
|
443 |
-
|
444 |
png_files = []
|
445 |
|
446 |
for file in files:
|
447 |
-
if file.endswith('.
|
448 |
-
|
449 |
elif file.endswith('.png'):
|
450 |
png_files.append(file)
|
451 |
|
452 |
# Prepare final outputs
|
453 |
-
download_button = gr.File(value=
|
454 |
|
455 |
# Prepare images for display (up to 4 images)
|
456 |
images = []
|
@@ -495,7 +495,7 @@ def search_and_analyze(query, progress=gr.Progress()):
|
|
495 |
|
496 |
# Fallback return
|
497 |
progress(1.0, desc="π Finished")
|
498 |
-
return (gr.Textbox(value="Completed", visible=True), current_status,
|
499 |
gr.Markdown(visible=False), # keep follow-up hidden
|
500 |
gr.HTML(visible=False),
|
501 |
gr.Row(visible=False),
|
@@ -704,7 +704,7 @@ with gr.Blocks(title="π€ French Public Data Analysis Agent", theme=gr.themes.S
|
|
704 |
# Download section
|
705 |
with gr.Row():
|
706 |
download_button = gr.File(
|
707 |
-
label="π Download
|
708 |
visible=False
|
709 |
)
|
710 |
|
|
|
206 |
description = f"π Step {step_number}: Generating visualizations..."
|
207 |
elif "save" in action_lower or "png" in action_lower:
|
208 |
description = f"πΎ Step {step_number}: Saving visualizations..."
|
209 |
+
elif "docx" in action_lower or "report" in action_lower:
|
210 |
+
description = f"π Step {step_number}: Creating DOCX report..."
|
211 |
elif hasattr(memory_step, 'error') and memory_step.error:
|
212 |
description = f"β οΈ Step {step_number}: Handling error..."
|
213 |
else:
|
|
|
313 |
break
|
314 |
|
315 |
# Initialize outputs
|
316 |
+
docx_file = None
|
317 |
images_output = [gr.Image(visible=False)] * 4
|
318 |
status = "π Starting agent-driven analysis..."
|
319 |
|
|
|
440 |
progress(1.0, desc="β
Processing results...")
|
441 |
|
442 |
# Process results
|
443 |
+
docx_file = None
|
444 |
png_files = []
|
445 |
|
446 |
for file in files:
|
447 |
+
if file.endswith('.docx'):
|
448 |
+
docx_file = file
|
449 |
elif file.endswith('.png'):
|
450 |
png_files.append(file)
|
451 |
|
452 |
# Prepare final outputs
|
453 |
+
download_button = gr.File(value=docx_file, visible=True) if docx_file else None
|
454 |
|
455 |
# Prepare images for display (up to 4 images)
|
456 |
images = []
|
|
|
495 |
|
496 |
# Fallback return
|
497 |
progress(1.0, desc="π Finished")
|
498 |
+
return (gr.Textbox(value="Completed", visible=True), current_status, docx_file, *images_output,
|
499 |
gr.Markdown(visible=False), # keep follow-up hidden
|
500 |
gr.HTML(visible=False),
|
501 |
gr.Row(visible=False),
|
|
|
704 |
# Download section
|
705 |
with gr.Row():
|
706 |
download_button = gr.File(
|
707 |
+
label="π Download DOCX Report",
|
708 |
visible=False
|
709 |
)
|
710 |
|
requirements.txt
CHANGED
@@ -14,7 +14,6 @@ beautifulsoup4
|
|
14 |
reportlab>=3.6.0
|
15 |
scikit-learn
|
16 |
gradio
|
17 |
-
pypdf2
|
18 |
python-docx
|
19 |
scipy
|
20 |
openpyxl
|
|
|
14 |
reportlab>=3.6.0
|
15 |
scikit-learn
|
16 |
gradio
|
|
|
17 |
python-docx
|
18 |
scipy
|
19 |
openpyxl
|
tools/followup_tools.py
CHANGED
@@ -172,8 +172,8 @@ def get_previous_report_content() -> str:
|
|
172 |
The text content of the previous report for context
|
173 |
"""
|
174 |
try:
|
175 |
-
# Look for
|
176 |
-
report_files = glob.glob('generated_data/*.
|
177 |
|
178 |
if not report_files:
|
179 |
return "No previous report found in generated_data folder"
|
@@ -181,8 +181,19 @@ def get_previous_report_content() -> str:
|
|
181 |
# Use the most recent report file
|
182 |
latest_report = max(report_files, key=os.path.getctime)
|
183 |
|
184 |
-
#
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
file_size = os.path.getsize(latest_report)
|
187 |
|
188 |
# Also look for any text files that might contain analysis
|
@@ -199,6 +210,9 @@ Report file: {latest_report}
|
|
199 |
File size: {file_size} bytes
|
200 |
Created: {os.path.getctime(latest_report)}
|
201 |
|
|
|
|
|
|
|
202 |
Additional analysis content:
|
203 |
{text_content if text_content else 'No additional text content found'}
|
204 |
|
|
|
172 |
The text content of the previous report for context
|
173 |
"""
|
174 |
try:
|
175 |
+
# Look for DOCX files in generated_data
|
176 |
+
report_files = glob.glob('generated_data/*.docx')
|
177 |
|
178 |
if not report_files:
|
179 |
return "No previous report found in generated_data folder"
|
|
|
181 |
# Use the most recent report file
|
182 |
latest_report = max(report_files, key=os.path.getctime)
|
183 |
|
184 |
+
# Try to extract basic text from DOCX file
|
185 |
+
docx_content = ""
|
186 |
+
try:
|
187 |
+
from docx import Document
|
188 |
+
doc = Document(latest_report)
|
189 |
+
paragraphs = []
|
190 |
+
for para in doc.paragraphs:
|
191 |
+
if para.text.strip():
|
192 |
+
paragraphs.append(para.text.strip())
|
193 |
+
docx_content = "\n".join(paragraphs[:10]) # First 10 paragraphs for context
|
194 |
+
except Exception as e:
|
195 |
+
docx_content = f"Could not extract text from DOCX: {str(e)}"
|
196 |
+
|
197 |
file_size = os.path.getsize(latest_report)
|
198 |
|
199 |
# Also look for any text files that might contain analysis
|
|
|
210 |
File size: {file_size} bytes
|
211 |
Created: {os.path.getctime(latest_report)}
|
212 |
|
213 |
+
DOCX Report Content (first 10 paragraphs):
|
214 |
+
{docx_content}
|
215 |
+
|
216 |
Additional analysis content:
|
217 |
{text_content if text_content else 'No additional text content found'}
|
218 |
|