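"""Gradio front-end for the French Public Data Analysis Agent.

Pipeline (as implemented in this file):
  1. Match the user's query against dataset titles with BM25 keyword matching,
     translating the query with an LLM so French and English queries both work.
  2. Hand the selected dataset page to a smolagents CodeAgent (built in agent.py),
     which loads the data, generates visualizations and writes a DOCX report
     into generated_data/.
  3. Offer a follow-up agent (followup_agent.py) for questions about the report
     and additional visualizations.
"""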
import os
import pandas as pd
import gradio as gr
import glob
import threading
import time
import queue
import numpy as np
from rank_bm25 import BM25Okapi
from dotenv import load_dotenv
from smolagents import CodeAgent, LiteLLMModel
from agent import create_web_agent, generate_prompt
from followup_agent import run_followup_analysis
from unidecode import unidecode

load_dotenv()

# Global variables for progress tracking
progress_queue = queue.Queue()
current_status = ""

# Initialize LLM translator and BM25
llm_translator = None
bm25_model = None
precomputed_titles = None

def initialize_models():
    """Initialize the LLM translator and BM25 model"""
    global llm_translator, bm25_model, precomputed_titles
    
    if llm_translator is None:
        # Initialize LLM for translation
        try:
            model = LiteLLMModel(
                model_id="gemini/gemini-2.5-flash-preview-05-20",
                api_key=os.getenv("GEMINI_API_KEY")
            )
            llm_translator = CodeAgent(tools=[], model=model, max_steps=1)
            print("βœ… LLM translator initialized")
        except Exception as e:
            print(f"⚠️  Error initializing LLM translator: {e}")
    
    # Load pre-computed BM25 model if available
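    # Assumed pickle layout (inferred from the keys read below): a dict of the
    # form {'bm25_model': BM25Okapi(...), 'titles': [<dataset title>, ...]},
    # produced by a separate pre-computation step.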
    if bm25_model is None:
        try:
            import pickle
            with open('bm25_data.pkl', 'rb') as f:
                bm25_data = pickle.load(f)
                bm25_model = bm25_data['bm25_model']
                precomputed_titles = bm25_data['titles']
                print(f"βœ… Loaded pre-computed BM25 model for {len(precomputed_titles)} datasets")
        except FileNotFoundError:
            print("⚠️  Pre-computed BM25 model not found. Will compute at runtime.")
        except Exception as e:
            print(f"⚠️  Error loading pre-computed BM25 model: {e}")
            print("Will compute BM25 at runtime.")

def translate_query_llm(query, target_lang='fr'):
    """Translate query using LLM"""
    global llm_translator
    
    if llm_translator is None:
        initialize_models()
    
    if llm_translator is None:
        print("⚠️  LLM translator not available, returning original query")
        return query, 'unknown'
    
    try:
        # Create translation prompt
        if target_lang == 'fr':
            target_language = "French"
        elif target_lang == 'en':
            target_language = "English"
        else:
            target_language = target_lang
        
        translation_prompt = f"""
        Translate the following text to {target_language}. 
        If the text is already in {target_language}, return it as is.
        Only return the translated text, nothing else.
        
        Text to translate: "{query}"
        """
        
        # Get translation from LLM
        response = llm_translator.run(translation_prompt)
        translated_text = str(response).strip().strip('"').strip("'")
        
        # Simple language detection
        if query.lower() == translated_text.lower():
            source_lang = target_lang
        else:
            source_lang = 'en' if target_lang == 'fr' else 'fr'
        
        return translated_text, source_lang
    
    except Exception as e:
        print(f"LLM translation error: {e}")
        return query, 'unknown'

def simple_keyword_preprocessing(text):
    """Simple preprocessing for keyword matching - handles case, accents and basic plurals"""
    # Convert to lowercase and remove accents
    text = unidecode(str(text).lower())
    
    # Basic plural handling - just remove trailing 's' and 'x'
    words = text.split()
    processed_words = []
    
    for word in words:
        # Remove common plural endings
        if word.endswith('s') and len(word) > 3 and not word.endswith('ss'):
            word = word[:-1]
        elif word.endswith('x') and len(word) > 3:
            word = word[:-1]
        processed_words.append(word)
    
    return processed_words

def find_similar_dataset_bm25(query, df):
    """Find the most similar dataset using BM25 keyword matching"""
    global bm25_model, precomputed_titles
    
    # Translate query to French for better matching with French datasets
    translated_query, original_lang = translate_query_llm(query, target_lang='fr')
    
    # Combine original and translated queries for search
    search_queries = [query, translated_query] if query != translated_query else [query]
    
    # Get dataset titles
    dataset_titles = df['title'].fillna('').tolist()
    
    # Use pre-computed BM25 model if available and matches current dataset
    if (bm25_model is not None and precomputed_titles is not None and 
        len(dataset_titles) == len(precomputed_titles) and dataset_titles == precomputed_titles):
        print("πŸš€ Using pre-computed BM25 model for fast matching")
        bm25 = bm25_model
    else:
        # Build BM25 model at runtime
        print("⚠️  Computing BM25 model at runtime...")
        # Preprocess all dataset titles into tokenized form
        processed_titles = [simple_keyword_preprocessing(title) for title in dataset_titles]
        bm25 = BM25Okapi(processed_titles)
    
    best_score = -1
    best_idx = 0
    
    for search_query in search_queries:
        try:
            # Preprocess the search query
            processed_query = simple_keyword_preprocessing(search_query)
            
            # Get BM25 scores for all documents
            scores = bm25.get_scores(processed_query)
            
            max_score = scores.max()
            max_idx = scores.argmax()
            if max_score > best_score:
                best_score = max_score
                best_idx = max_idx
        except Exception as e:
            print(f"Error processing query '{search_query}': {e}")
            continue
    
    # Show the top 5 matches for the original query (debug output)
    if len(search_queries) > 0:
        processed_query = simple_keyword_preprocessing(search_queries[0])
        scores = bm25.get_scores(processed_query)
        top_indices = np.argsort(scores)[::-1][:5]
        for rank, idx in enumerate(top_indices, start=1):
            print(f"  {rank}. {dataset_titles[idx]} (BM25 score: {scores[idx]:.2f})")

    return best_idx, best_score, translated_query, original_lang

def create_progress_callback():
    """Create a callback function for tracking agent progress"""
    
    def progress_callback(memory_step, agent=None):
        """Callback function called at each agent step"""
        step_number = memory_step.step_number
        
        # Extract information about the current step
        if hasattr(memory_step, 'action_input') and memory_step.action_input:
            action_content = memory_step.action_input
        elif hasattr(memory_step, 'action_output') and memory_step.action_output:
            action_content = str(memory_step.action_output)
        else:
            action_content = ""
        
        # Define progress based on step content and number
        progress_val = min(0.1 + (step_number * 0.03), 0.95)  # Progressive increase
        
        # Analyze the step content to provide meaningful status
        action_lower = action_content.lower() if action_content else ""
        
        if "visit_webpage" in action_lower or "examining" in action_lower:
            description = f"πŸ” Step {step_number}: Examining webpage..."
        elif "get_all_links" in action_lower or "links" in action_lower:
            description = f"πŸ”— Step {step_number}: Extracting data links..."
        elif "read_file_from_url" in action_lower or "reading" in action_lower:
            description = f"πŸ“Š Step {step_number}: Loading dataset..."
        elif "get_dataset_description" in action_lower or "description" in action_lower:
            description = f"πŸ“‹ Step {step_number}: Analyzing dataset structure..."
        elif "department" in action_lower or "region" in action_lower:
            description = f"πŸ—ΊοΈ Step {step_number}: Processing geographic data..."
        elif "plot" in action_lower or "map" in action_lower or "france" in action_lower:
            description = f"πŸ—ΊοΈ Step {step_number}: Creating France map..."
        elif "visualization" in action_lower or "chart" in action_lower:
            description = f"πŸ“ˆ Step {step_number}: Generating visualizations..."
        elif "save" in action_lower or "png" in action_lower:
            description = f"πŸ’Ύ Step {step_number}: Saving visualizations..."
        elif "docx" in action_lower or "report" in action_lower:
            description = f"πŸ“„ Step {step_number}: Creating DOCX report..."
        elif hasattr(memory_step, 'error') and memory_step.error:
            description = f"⚠️ Step {step_number}: Handling error..."
        else:
            description = f"πŸ€– Step {step_number}: Processing..."
        
        # Check if this is the final step
        if hasattr(memory_step, 'action_output') and memory_step.action_output and "final" in action_lower:
            progress_val = 1.0
            description = "βœ… Analysis complete!"
        
        # Put the progress update in the queue
        try:
            progress_queue.put((progress_val, description))
        except Exception:
            pass
    
    return progress_callback

def run_agent_analysis_with_progress(query, progress_callback, df=None, page_url_callback=None, data_gouv_page=None, most_similar_idx=None):
    """
    Run the agent analysis with progress tracking using smolagents callbacks.
    """
    try:
        # Clean up previous results
        if os.path.exists('generated_data'):
            for file in glob.glob('generated_data/*'):
                try:
                    os.remove(file)
                except OSError:
                    pass
        else:
            os.makedirs('generated_data', exist_ok=True)
        
        # If dataset info not provided, find it (fallback)
        if data_gouv_page is None or most_similar_idx is None:
            progress_callback(0.02, "🤖 Initializing LLM translator and BM25...")
            initialize_models()
            
            progress_callback(0.05, "🔍 Searching for relevant datasets (using BM25 keyword matching)...")
            
            # Read the filtered dataset if not provided
            if df is None:
                df = pd.read_csv('filtered_dataset.csv')
            
            # Find the most similar dataset using BM25 keyword matching
            most_similar_idx, similarity_score, translated_query, original_lang = find_similar_dataset_bm25(query, df)
            data_gouv_page = df.iloc[most_similar_idx]['url']
            
            # Immediately show the page URL via callback
            if page_url_callback:
                page_url_callback(data_gouv_page)
            
            progress_callback(0.08, "🤖 Initializing agent...")
        else:
            # Dataset already found, continue from where we left off
            progress_callback(0.09, "🤖 Initializing agent...")

        step_callback = create_progress_callback()
        
        progress_callback(0.1, "🤖 Starting agent analysis...")
        
        # Create the agent with progress callback
        web_agent = create_web_agent(step_callback)
        prompt = generate_prompt(data_gouv_page)
        
        # Run the agent - the step_callbacks will automatically update progress
        answer = web_agent.run(prompt)
        
        # Check if the agent found no processable data
        answer_lower = str(answer).lower() if answer else ""
        if ("no processable data" in answer_lower or 
            "no csv nor json" in answer_lower or 
            "cannot find csv" in answer_lower or 
            "cannot find json" in answer_lower or
            "no data to process" in answer_lower):
            progress_callback(1.0, "❌ No CSV/JSON files found in the dataset")
            return "❌ No CSV/JSON files found in the selected dataset. This dataset cannot be processed automatically.", [], data_gouv_page
        
        # Check if files were generated
        generated_files = glob.glob('generated_data/*')
        
        if generated_files:
            progress_callback(1.0, "✅ Analysis completed successfully!")
            return "Analysis completed successfully!", generated_files, data_gouv_page
        else:
            progress_callback(1.0, "⚠️ Analysis completed but no files were generated.")
            return "Analysis completed but no files were generated.", [], data_gouv_page
            
    except Exception as e:
        progress_callback(1.0, f"❌ Error: {str(e)}")
        return f"Error during analysis: {str(e)}", [], None

def search_and_analyze(query, progress=gr.Progress()):
    """
    Unified function that does initial search then lets agent analyze with full autonomy.
    Uses Gradio's progress bar for visual feedback.
    """
    # Clear the progress queue
    while not progress_queue.empty():
        try:
            progress_queue.get_nowait()
        except queue.Empty:
            break
    
    # Initialize outputs
    docx_file = None
    images_output = [gr.Image(visible=False)] * 4
    status = "🚀 Starting agent-driven analysis..."
    
    # Initial progress
    progress(0.05, desc="🚀 Initializing agent...")
    
    def progress_callback(progress_val, description):
        """Callback function to update progress - puts updates in queue"""
        try:
            progress_queue.put((progress_val, description))
        except Exception:
            pass
    
    # Run analysis in a separate thread
    result_queue = queue.Queue()
    
    def run_analysis():
        try:
            # Clean up previous results
            if os.path.exists('generated_data'):
                for file in glob.glob('generated_data/*'):
                    try:
                        os.remove(file)
                    except OSError:
                        pass
            else:
                os.makedirs('generated_data', exist_ok=True)
            
            # Do initial search if query provided
            initial_search_results = None
            if query.strip():
                progress_callback(0.06, f"🔍 Initial search for: {query[:50]}...")
                try:
                    # Import search function from tools
                    from tools.retrieval_tools import search_datasets
                    initial_search_results = search_datasets(query, top_k=5)
                    progress_callback(0.08, "🤖 Starting agent with search results...")
                except Exception as e:
                    print(f"Initial search failed: {e}")
                    progress_callback(0.08, "🤖 Starting agent without initial results...")
            else:
                progress_callback(0.08, "🤖 Starting agent for random selection...")
            
            step_callback = create_progress_callback()
            
            # Create the agent with progress callback
            web_agent = create_web_agent(step_callback)
            
            # Generate unified prompt with initial search results
            prompt = generate_prompt(user_query=query, initial_search_results=initial_search_results)
            progress_callback(0.1, "🤖 Agent analyzing datasets...")
            
            # Run the agent - the step_callbacks will automatically update progress
            answer = web_agent.run(prompt)
            
            # Check if the agent found no processable data
            answer_lower = str(answer).lower() if answer else ""
            if ("no processable data" in answer_lower or 
                "no csv nor json" in answer_lower or 
                "cannot find csv" in answer_lower or 
                "cannot find json" in answer_lower or
                "no data to process" in answer_lower):
                progress_callback(1.0, "❌ No CSV/JSON files found in the dataset")
                result_queue.put(("❌ No CSV/JSON files found in the selected dataset. This dataset cannot be processed automatically.", [], None))
                return
            
            # Check if files were generated
            generated_files = glob.glob('generated_data/*')
            
            if generated_files:
                progress_callback(1.0, "✅ Analysis completed successfully!")
                result_queue.put(("Analysis completed successfully!", generated_files, "Agent-selected dataset"))
            else:
                progress_callback(1.0, "⚠️ Analysis completed but no files were generated.")
                result_queue.put(("Analysis completed but no files were generated.", [], None))
                
        except Exception as e:
            progress_callback(1.0, f"❌ Error: {str(e)}")
            result_queue.put((f"Error during analysis: {str(e)}", [], None))
    
    analysis_thread = threading.Thread(target=run_analysis)
    analysis_thread.start()
    
    # Show initial status
    current_status = "🤖 Agent is finding relevant datasets..."
    progress(0.08, desc=current_status)
    
    # Monitor progress while analysis runs
    last_progress = 0.08
    
    while analysis_thread.is_alive() or not result_queue.empty():
        try:
            # Check for progress updates from queue
            try:
                progress_val, description = progress_queue.get(timeout=0.1)
                if progress_val > last_progress:
                    last_progress = progress_val
                    current_status = description
                    progress(progress_val, desc=description)
            except queue.Empty:
                pass
            
            # Check if analysis is complete
            try:
                final_status, files, page_url = result_queue.get(timeout=0.1)
                
                # Check if this is a "no data" case
                if "❌ No CSV/JSON files found" in final_status:
                    progress(1.0, desc="❌ No processable data found")
                    return (gr.Textbox(value="Agent-selected dataset", visible=True), 
                           final_status, 
                           gr.File(visible=False),
                           gr.Image(visible=False), gr.Image(visible=False), 
                           gr.Image(visible=False), gr.Image(visible=False),
                           gr.Markdown(visible=False),  # keep follow-up hidden
                           gr.HTML(visible=False),      
                           gr.Row(visible=False),       
                           gr.Row(visible=False),       
                           gr.Row(visible=False),       
                           gr.Row(visible=False),       
                           gr.Row(visible=False))
                
                # Final progress update
                progress(1.0, desc="✅ Processing results...")
                
                # Process results
                docx_file = None
                png_files = []
                
                for file in files:
                    if file.endswith('.docx'):
                        docx_file = file
                    elif file.endswith('.png'):
                        png_files.append(file)
                
                # Prepare final outputs
                download_button = gr.File(value=docx_file, visible=True) if docx_file else None
                
                # Prepare images for display (up to 4 images)
                images = []
                for i in range(4):
                    if i < len(png_files):
                        images.append(gr.Image(value=png_files[i], visible=True))
                    else:
                        images.append(gr.Image(visible=False))
                
                # final progress completion
                progress(1.0, desc="🎉 Complete!")
                
                # Show follow-up section after successful completion
                return (gr.Textbox(value=page_url if page_url else "Agent-selected dataset", visible=True), 
                       final_status, download_button, *images,
                       gr.Markdown(visible=True),  # followup_section_divider
                       gr.HTML(visible=True),      # followup_section_header  
                       gr.Row(visible=True),       # followup_input_row
                       gr.Row(visible=True),       # followup_result_row
                       gr.Row(visible=True),       # followup_image_row
                       gr.Row(visible=True),       # followup_examples_header_row
                       gr.Row(visible=True))       # followup_examples_row
                
            except queue.Empty:
                pass
                
            time.sleep(0.5)  # Small delay to prevent excessive updates
            
        except Exception as e:
            progress(1.0, desc=f"❌ Error: {str(e)}")
            return (gr.Textbox(value="Error", visible=True), f"❌ Error: {str(e)}", None, *images_output,
                   gr.Markdown(visible=False),  # keep follow-up hidden on error
                   gr.HTML(visible=False),      
                   gr.Row(visible=False),       
                   gr.Row(visible=False),       
                   gr.Row(visible=False),       
                   gr.Row(visible=False),       
                   gr.Row(visible=False))
    
    # Ensure thread completes
    analysis_thread.join(timeout=1)
    
    # Fallback return
    progress(1.0, desc="🏁 Finished")
    return (gr.Textbox(value="Completed", visible=True), current_status, docx_file, *images_output,
           gr.Markdown(visible=False),  # keep follow-up hidden
           gr.HTML(visible=False),      
           gr.Row(visible=False),       
           gr.Row(visible=False),       
           gr.Row(visible=False),       
           gr.Row(visible=False),       
           gr.Row(visible=False))

def run_followup_question(question, progress=gr.Progress()):
    """
    Run a follow-up analysis based on user's question about the previous report.
    """
    if not question.strip():
        return "Please enter a follow-up question.", gr.Image(visible=False)
    
    progress(0.1, desc="🤖 Starting follow-up analysis...")
    
    try:
        # Check if there are previous results
        if not os.path.exists('generated_data') or not os.listdir('generated_data'):
            return "No previous analysis found. Please run an analysis first.", gr.Image(visible=False)
        
        progress(0.3, desc="🔍 Analyzing previous report and dataset...")
        
        # Run the follow-up analysis
        result = run_followup_analysis(question)
        
        progress(0.9, desc="📊 Processing results...")
        
        # Look for new visualizations created by the follow-up analysis
        all_images = glob.glob('generated_data/*.png')
        
        # Keep only recently created images (within the last 2 minutes)
        current_time = time.time()
        recent_images = []
        
        for img_path in all_images:
            img_time = os.path.getctime(img_path)
            if current_time - img_time < 120:  # Images created in last 2 minutes
                recent_images.append(img_path)
        
        # Get the most recent image if any
        latest_image = None
        if recent_images:
            latest_image = max(recent_images, key=os.path.getctime)
        
        progress(1.0, desc="✅ Follow-up analysis complete!")
        
        # Enhanced result formatting
        final_result = result
        if latest_image:
            final_result += f"\n\n📊 **Visualization Created:** {os.path.basename(latest_image)}"
            if len(recent_images) > 1:
                final_result += f"\n📈 **Total new visualizations:** {len(recent_images)}"
            return final_result, gr.Image(value=latest_image, visible=True)
        else:
            return final_result, gr.Image(visible=False)
        
    except Exception as e:
        progress(1.0, desc="❌ Error in follow-up analysis")
        return f"Error: {str(e)}", gr.Image(visible=False)

# Create the Gradio interface
with gr.Blocks(title="🤖 French Public Data Analysis Agent", theme=gr.themes.Soft(), css="""
    .gradio-container {
        max-width: 1200px !important;
        margin: auto;
        width: 100% !important;
    }
    .main-header {
        text-align: center;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 2rem;
        border-radius: 15px;
        margin-bottom: 2rem;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    }
    .accordion-content {
        overflow: hidden !important;
        width: 100% !important;
    }
    .gr-accordion {
        width: 100% !important;
        max-width: 100% !important;
    }
    .gr-accordion .gr-row {
        width: 100% !important;
        max-width: 100% !important;
        margin: 0 !important;
    }
    .gr-accordion .gr-column {
        min-width: 0 !important;
        flex: 1 !important;
        max-width: 50% !important;
        padding-right: 1rem !important;
    }
    .gr-accordion .gr-column:last-child {
        padding-right: 0 !important;
        padding-left: 1rem !important;
    }
""") as demo:
    
    # Main header with better styling
    gr.HTML("""
    <div class="main-header">
        <h1 style="margin: 0; font-size: 2.5rem; font-weight: bold;">
            🤖 French Public Data Analysis Agent
        </h1>
        <p style="font-size: 1.2rem; opacity: 0.9;">
            Intelligent analysis of French public datasets with AI-powered insights
        </p>
    </div>
    """)
    
    # What this agent does
    gr.HTML("""
    <div style="text-align: center; background: #f8fafc; padding: 1.5rem; border-radius: 10px; margin: 1rem 0;">
        <p style="font-size: 1.1rem; color: #374151; margin: 0;">
            🌐 <strong>Search in French or English</strong> • 🤖 <strong>AI Agent finds & analyzes datasets</strong> • 🗺️ <strong>Generate Reports with visualizations</strong>
        </p>
        <p style="font-size: 0.9rem; color: #6b7280; margin-top: 0.5rem;">
            Initial search results guide the agent, but it can search for different datasets if needed
        </p>
    </div>
    """)
    
    # Tips & Information accordion - moved to the top
    with gr.Accordion("💡 Tips & Information", open=False):
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                🎯 **How to Use:**
                - Enter search terms related to French public data
                - Leave empty for random high-quality dataset selection
                - System provides initial search results to guide the agent
                - Agent can use provided results or search for different datasets
                - Results include visualizations and downloadable reports
                
                ⏱️ **Processing Time:**
                - Analysis takes 7-15 minutes depending on dataset complexity
                - Agent has full autonomy to find the best datasets
                """)
            with gr.Column():
                gr.Markdown("""
                ⚠️ **Important Notes:**
                - Agent gets initial search results but has full autonomy to make decisions
                - Agent can choose from initial results or search for different datasets
                - Some datasets may not contain processable CSV/JSON files
                - All visualizations are automatically generated
                - Maps focus on France when geographic data is available
                
                🌐 **Language Support:**
                - Search in French or English - queries are automatically translated
                """)
    
    with gr.Row():
        query_input = gr.Textbox(
            label="Search Query", 
            placeholder="e.g., road traffic accidents, education, housing (or leave empty for random selection)",
            scale=4
        )
        search_button = gr.Button(
            "πŸš€ Analyze Dataset", 
            variant="primary", 
            scale=1,
            size="lg"
        )
    
    # Quick Start Examples row
    with gr.Row():
        gr.HTML("""
        <div>
            <h3 style="color: #374151">🚀 Quick Start Examples</h3>
            <p style="color: #6b7280">Click any example below to get started</p>
        </div>
        """)
    
    with gr.Row():
        examples = [
            ("πŸš— Road Traffic Accidents 2023", "road traffic accidents 2023"),
            ("πŸŽ“ Education Directory", "education directory"), 
            ("🏠 French Vacant Housing Private Park", "French vacant housing private park"),
        ]
        
        for emoji_text, query_text in examples:
            gr.Button(
                emoji_text, 
                variant="secondary",
                size="sm"
            ).click(
                lambda x=query_text: x,
                outputs=query_input
            )
    
    # Page info and analysis status with progress bar
    with gr.Group():
        page_url_display = gr.Textbox(label="🔗 Page Started On", interactive=False, visible=False)
        with gr.Row():
            status_output = gr.Textbox(label="📊 Analysis Status", interactive=False, scale=1)
    
    # Download section
    with gr.Row():
        download_button = gr.File(
            label="πŸ“„ Download DOCX Report", 
            visible=False
        )
    
    gr.Markdown("---")
    gr.HTML("""
    <div style="text-align: center; margin: 2rem 0;">
        <h2 style="color: #374151; margin-bottom: 0.5rem;">📊 Generated Visualizations</h2>
        <p style="color: #6b7280; margin: 0;">Automatically generated charts and maps will appear below</p>
    </div>
    """)
    
    with gr.Row():
        with gr.Column():
            image1 = gr.Image(label="📈 Chart 1", visible=False, height=400)
            image2 = gr.Image(label="📊 Chart 2", visible=False, height=400)
        with gr.Column():
            image3 = gr.Image(label="🗺️ Map/Chart 3", visible=False, height=400)
            image4 = gr.Image(label="📉 Chart 4", visible=False, height=400)
    
    # Follow-up Analysis Section (initially hidden)
    followup_section_divider = gr.Markdown("---", visible=False)
    followup_section_header = gr.HTML("""
    <div style="text-align: center; margin: 2rem 0;">
        <h2 style="color: #374151; margin-bottom: 0.5rem;">🤖 Follow-up Analysis</h2>
        <p style="color: #6b7280; margin: 0;">Ask about report findings, request data analysis, or get contextual information</p>
    </div>
    """, visible=False)
    
    with gr.Row(visible=False) as followup_input_row:
        followup_input = gr.Textbox(
            label="Follow-up Question",
            placeholder="e.g., What are the main findings?, Show me correlation between columns, What is road safety policy in France?",
            scale=4
        )
        followup_button = gr.Button(
            "πŸ” Analyze",
            variant="secondary",
            scale=1,
            size="lg"
        )
    
    with gr.Row(visible=False) as followup_result_row:
        followup_result = gr.Textbox(
            label="πŸ“Š Follow-up Analysis Results",
            interactive=False,
            lines=10,
            visible=True
        )
    
    with gr.Row(visible=False) as followup_image_row:
        followup_image = gr.Image(
            label="πŸ“ˆ Follow-up Visualization",
            visible=False,
            height=500
        )
    
    # Follow-up Examples (initially hidden)
    with gr.Row(visible=False) as followup_examples_header_row:
        gr.HTML("""
        <div>
            <h4 style="color: #374151">💡 Example Follow-up Questions</h4>
            <p style="color: #6b7280">Click any example below to try it out</p>
        </div>
        """)
    
    with gr.Row(visible=False) as followup_examples_row:
        followup_examples = [
            ("πŸ“‹ Report Summary", "What were the main findings from the analysis?"),
            ("🌐 Context Info", "What is the policy context for this data in France?"),
            ("πŸ“Š Create Chart", "Show me the correlation between two numerical columns with a scatter plot"),
            ("πŸ“ˆ Data Statistics", "Give me statistical summary for a specific column"),
            ("🎯 Filter Data", "Filter the data by specific criteria and show results"),
            ("πŸ” General Question", "Tell me more about this topic and its importance"),
        ]
        
        for emoji_text, query_text in followup_examples:
            gr.Button(
                emoji_text, 
                variant="secondary",
                size="sm"
            ).click(
                lambda x=query_text: x,
                outputs=followup_input
            )
    
    # Set up the search button click event with progress bar
    search_button.click(
        fn=search_and_analyze,
        inputs=[query_input],
        outputs=[page_url_display, status_output, download_button, image1, image2, image3, image4,
                followup_section_divider, followup_section_header, followup_input_row, 
                followup_result_row, followup_image_row, followup_examples_header_row, followup_examples_row],
        show_progress="full"  # Show the built-in progress bar
    )
    
    # Set up the follow-up button click event
    followup_button.click(
        fn=run_followup_question,
        inputs=[followup_input],
        outputs=[followup_result, followup_image],
        show_progress="full"
    )
    


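# Assumed runtime requirements (not checked here): a GEMINI_API_KEY available via
# the environment or a .env file read by load_dotenv(), and optionally a
# pre-computed bm25_data.pkl next to this script for faster dataset matching.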
if __name__ == "__main__":
    demo.queue()  # Enable queuing for real-time updates
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True 
    )