File size: 28,697 Bytes
9a46140
 
 
 
7f52dcc
9a46140
 
 
7f52dcc
 
 
ccbba65
7f52dcc
daba178
22e75d1
 
51d3872
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eac896
51d3872
 
 
 
4eac896
51d3872
 
 
 
4eac896
51d3872
 
 
 
4eac896
51d3872
 
 
 
 
 
8cad88e
51d3872
3fec205
6a4459c
3fec205
8cad88e
 
6a4459c
7677101
6a4459c
8cad88e
3fec205
 
51d3872
8844c67
 
 
 
 
 
95145c9
8844c67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56293b7
 
 
8844c67
 
95145c9
51d3872
93d3c59
51d3872
8844c67
51d3872
93d3c59
8844c67
 
51d3872
 
 
93d3c59
 
51d3872
93d3c59
51d3872
93d3c59
51d3872
93d3c59
51d3872
cda6a7a
93d3c59
8844c67
 
 
 
95145c9
51d3872
93d3c59
51d3872
8844c67
93d3c59
51d3872
93d3c59
 
 
51d3872
93d3c59
51d3872
93d3c59
 
c49c6cf
 
 
 
8844c67
 
 
 
 
 
 
93d3c59
22e75d1
 
 
 
 
8844c67
8cad88e
8844c67
 
c49c6cf
 
8844c67
 
 
8cad88e
51d3872
93d3c59
51d3872
 
 
a49ff7a
8844c67
01d80b0
 
 
 
 
 
5219269
56293b7
8844c67
56293b7
 
 
8844c67
 
 
01d80b0
56293b7
01d80b0
56293b7
 
01d80b0
 
 
 
 
56293b7
01d80b0
 
56293b7
 
 
01d80b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56293b7
 
01d80b0
56293b7
 
 
01d80b0
5219269
8844c67
 
 
 
 
 
9fd29fd
d3afc5f
8844c67
d3afc5f
 
 
8844c67
d3afc5f
8844c67
9fd29fd
d3afc5f
 
 
 
 
 
 
01d80b0
 
 
 
d0e3289
64b36f4
8844c67
95145c9
5219269
d3afc5f
8844c67
d3afc5f
 
51d3872
 
56293b7
8844c67
 
 
d3afc5f
 
 
 
8844c67
 
 
01d80b0
 
 
 
 
56293b7
01d80b0
 
0f26b5a
8844c67
95145c9
3fec205
51d3872
93d3c59
51d3872
bac45a2
 
51d3872
bac45a2
51d3872
c49c6cf
93d3c59
 
 
 
51d3872
c49c6cf
93d3c59
 
 
 
 
 
 
d3afc5f
93d3c59
 
 
 
 
 
 
 
 
51d3872
 
93d3c59
 
 
 
c49c6cf
93d3c59
 
 
 
 
 
 
06121b4
93d3c59
 
 
d3afc5f
93d3c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3afc5f
51d3872
93d3c59
51d3872
8cad88e
b3accd3
d0a53bf
51d3872
 
 
8cad88e
 
d3afc5f
51d3872
 
 
 
 
 
 
 
 
56293b7
8cad88e
 
 
 
9a46140
d3afc5f
56293b7
b3accd3
 
 
 
 
 
 
 
 
 
 
 
c49c6cf
 
 
b3accd3
c49c6cf
c7d2695
c49c6cf
56293b7
c49c6cf
56293b7
 
 
 
 
c49c6cf
56293b7
 
 
 
c49c6cf
56293b7
8dce56b
 
 
 
 
 
 
 
 
 
 
c49c6cf
 
 
 
 
 
 
8dce56b
c49c6cf
 
 
 
 
8dce56b
56293b7
 
 
c49c6cf
56293b7
 
 
 
c49c6cf
56293b7
 
 
 
 
d3afc5f
 
 
 
 
56293b7
c49c6cf
 
 
 
d3afc5f
56293b7
 
 
 
 
 
51d3872
 
 
 
 
 
d3afc5f
51d3872
 
 
 
56293b7
 
 
 
 
 
 
 
 
 
c97769e
2fa9381
56293b7
 
 
 
 
51d3872
93d3c59
51d3872
d3afc5f
 
93d3c59
 
 
 
 
 
56293b7
d3afc5f
 
 
 
 
 
56293b7
d3afc5f
 
93d3c59
d3afc5f
56293b7
8844c67
93d3c59
d3afc5f
 
51d3872
 
 
d3afc5f
93d3c59
51d3872
93d3c59
d3afc5f
bac45a2
 
c49c6cf
51d3872
 
d3afc5f
 
 
51d3872
d3afc5f
 
56293b7
 
51d3872
56293b7
51d3872
 
56293b7
51d3872
93d3c59
51d3872
93d3c59
 
 
 
 
 
 
 
 
 
 
 
 
c49c6cf
93d3c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64b36f4
22e75d1
64b36f4
22e75d1
 
 
 
 
 
 
64b36f4
22e75d1
 
 
 
 
64b36f4
 
 
 
51d3872
64b36f4
22e75d1
 
 
0ef6e2f
 
 
 
 
 
 
0b1f2f1
46c9a80
 
0b1f2f1
 
 
d3afc5f
 
6ee21d1
d3afc5f
93d3c59
d3afc5f
b3accd3
d3afc5f
93d3c59
 
 
b3accd3
 
d3afc5f
3fec205
93d3c59
3fec205
0b1f2f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
import streamlit as st
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from span_marker import SpanMarkerModel
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration, PartOfSpeech
from torch import cuda
from spacy.cli import download
import transformers
from torch import bfloat16
import os
import scipy.cluster.hierarchy as sch  # HIERARCHY

# ------------------------------------------------------------------------------
# Helper that returns the configuration for the selected language
# ------------------------------------------------------------------------------
def get_language_config(selected_language):
    """
    Return the configuration dictionary for the selected language.

    The dictionary holds the spaCy model name ("spacy_model"), the SpanMarker
    model used for entity detection ("linguistic_model") and the DataForSEO
    parameters ("dataforseo_params"). Unknown languages fall back to the
    "English (US)" configuration.
    """
    # Every language currently shares the same SpanMarker checkpoint; replace
    # it per language once a more appropriate model is available.
    span_marker_model = "nbroad/span-marker-xdistil-l12-h384-orgs-v3"

    # (UI label, spaCy model, DataForSEO language code).
    # spaCy has no UK-specific English model, so both variants use the same one.
    language_table = (
        ("English (US)", "en_core_web_sm", "en-us"),
        ("English (UK)", "en_core_web_sm", "en-gb"),
        ("Italiano", "it_core_news_sm", "it-it"),
        ("Español", "es_core_news_sm", "es-es"),
        ("Deutsch", "de_core_news_sm", "de-de"),
        ("Français", "fr_core_news_sm", "fr-fr"),
    )

    language_options = {
        label: {
            "spacy_model": spacy_model,
            "linguistic_model": span_marker_model,
            "dataforseo_params": {"language": language_code},
        }
        for label, spacy_model, language_code in language_table
    }

    try:
        return language_options[selected_language]
    except KeyError:
        return language_options["English (US)"]

# ------------------------------------------------------------------------------
# Page configuration
# ------------------------------------------------------------------------------
# Configure the Streamlit page: title, wide layout, expanded sidebar and the
# entries of the app menu.
st.set_page_config(
    page_title="Keywords Cluster for SEO",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://www.linkedin.com/in/francisco-nardi-212b338b/',
        'Report a bug': "https://www.linkedin.com/in/francisco-nardi-212b338b/",
        'About': "# A simple keywords clustering tool for SEO purpose."
    }
)

# Session state initialisation (optional): each key is created only when it is
# not already present, so existing values are never overwritten here.
# NOTE(review): none of these keys is read elsewhere in the visible source.
if 'model_loaded' not in st.session_state:
    st.session_state.model_loaded = False
if 'analysis_complete' not in st.session_state:
    st.session_state.analysis_complete = False
if 'current_step' not in st.session_state:
    st.session_state.current_step = 0

# Custom CSS styles, injected as raw HTML (hence unsafe_allow_html=True).
st.markdown("""
    <style>
        .stProgress > div > div > div > div {
            background-color: #1f77b4;
        }
        .success-message {
            padding: 1rem;
            border-radius: 0.5rem;
            background-color: #d4edda;
            color: #155724;
            border: 1px solid #c3e6cb;
            margin-bottom: 1rem;
        }
        .info-box {
            padding: 1rem;
            border-radius: 0.5rem;
            background-color: #e2f0fd;
            border: 1px solid #b8daff;
            margin-bottom: 1rem;
        }
        .sidebar .sidebar-content {
            width: 400px !important;
        }
    </style>
    """, unsafe_allow_html=True)

# ------------------------------------------------------------------------------
# 1) Model loading with cache_resource
# ------------------------------------------------------------------------------
@st.cache_resource
def load_models(language_config):
    """
    Download the spaCy model and load the SpanMarker and embedding models.

    The function is wrapped in ``st.cache_resource``, so the work is done once
    per distinct ``language_config``.

    Args:
        language_config: Mapping providing the "spacy_model" and
            "linguistic_model" names.

    Returns:
        Tuple ``(model_filter, embedding_model)``: the SpanMarker model and the
        SentenceTransformer embedding model.

    Raises:
        Exception: Any failure is shown with ``st.error`` and then re-raised.
    """
    with st.spinner("Loading models... This may take a few minutes."):
        try:
            # Fetch the spaCy model that matches the selected language.
            download(language_config["spacy_model"])

            # SpanMarker model: entity detection (Brand/Unbranded).
            span_marker_name = language_config["linguistic_model"]
            use_gpu = cuda.is_available()
            model_filter = SpanMarkerModel.from_pretrained(span_marker_name)
            if use_gpu:
                model_filter = model_filter.cuda()

            # SentenceTransformer embedding model (the same for every language).
            embedding_model = SentenceTransformer("all-mpnet-base-v2")
        except Exception as e:
            st.error(f"Error loading models: {str(e)}")
            raise
        return model_filter, embedding_model

# ------------------------------------------------------------------------------
# 2) CSV reading with cache_data
# ------------------------------------------------------------------------------
@st.cache_data
def load_csv(file, skiprows, nrows):
    """
    Read a CSV into a DataFrame; the result is memoized by ``st.cache_data``.

    Args:
        file: CSV source, passed to ``pd.read_csv`` as its first argument.
        skiprows: Forwarded unchanged as ``skiprows``.
        nrows: Forwarded unchanged as ``nrows``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # NOTE(review): with an integer ``skiprows`` pandas skips that many lines
    # from the top of the file, header line included - confirm that callers
    # passing a value > 0 expect the header to sit on the first kept line.
    return pd.read_csv(file, nrows=nrows, skiprows=skiprows)

# ------------------------------------------------------------------------------
# 3) Brand/Unbranded labelling function (cached with cache_data)
# ------------------------------------------------------------------------------
def _has_org_entity(entities):
    """Return True when any entity dict, at any nesting depth, is labelled "ORG"."""
    if isinstance(entities, dict):
        return entities.get("label") == "ORG"
    if isinstance(entities, (list, tuple)):
        return any(_has_org_entity(item) for item in entities)
    return False


# The SpanMarker model is excluded from hashing, mirroring run_analysis, so
# Streamlit does not try to hash the whole model object for the cache key.
@st.cache_data(hash_funcs={SpanMarkerModel: lambda _: None})
def process_keywords(df, model_filter):
    """
    Label every keyword as 'Brand' or 'Unbranded' using the SpanMarker model.

    A keyword is 'Brand' when the model reports at least one entity labelled
    "ORG" for it. The prediction result is accepted both as a flat list of
    entity dicts and as a list of such lists, so a flat result is no longer
    silently treated as 'Unbranded'.

    Args:
        df: DataFrame with a 'Keyword' column.
        model_filter: Object exposing ``predict(list_of_str)``.

    Returns:
        List with one 'Brand'/'Unbranded' label per row of ``df``, in order.
        A keyword whose prediction raises is reported with ``st.error`` and
        labelled 'Unbranded'.
    """
    results = []
    total = len(df)
    progress_text = "Processing keywords..."
    progress_bar = st.progress(0, text=progress_text)

    for done, keyword in enumerate(df['Keyword'], start=1):
        try:
            entities = model_filter.predict([keyword])
            results.append("Brand" if _has_org_entity(entities) else "Unbranded")
        except Exception as e:
            # Best effort: one failing keyword must not abort the whole batch.
            st.error(f"Error processing keyword '{keyword}': {str(e)}")
            results.append("Unbranded")

        progress_bar.progress(done / total, text=f"{progress_text} ({done}/{total})")

    progress_bar.empty()
    return results

# ------------------------------------------------------------------------------
# 4) Topic model creation
# ------------------------------------------------------------------------------
@st.cache_resource
def _load_llama_model(model_id):
    """Load the 4-bit quantized causal LM and its tokenizer once per process."""
    # Quantization configuration for Hugging Face
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map='auto',
    )
    model.eval()
    return tokenizer, model


def _stop_words_for(spacy_model_name):
    """Return CountVectorizer stop words for the language of a spaCy model name."""
    lang_code = spacy_model_name.split("_", 1)[0]
    if lang_code == "en":
        # scikit-learn's built-in English list, exactly as before.
        return "english"
    try:
        # Local import: spaCy is already a dependency of this file (spacy.cli).
        from spacy.util import get_lang_class
        return sorted(get_lang_class(lang_code).Defaults.stop_words)
    except ImportError:
        # Unknown language code: do not filter stop words at all rather than
        # applying the English list to non-English text.
        return None


def create_topic_model(embedding_model, model_params, language_config):
    """
    Create and configure the BERTopic model.

    Args:
        embedding_model: Embedding model handed to ``BERTopic``.
        model_params: Mapping with the keys 'umap_n_neighbors',
            'umap_n_components', 'umap_min_dist', 'min_cluster_size',
            'min_samples', 'min_df', 'max_df', 'ngram_min', 'ngram_max',
            'top_n_words', 'diversity_factor', 'llama_temperature',
            'llama_max_tokens' and 'llama_repetition_penalty'.
        language_config: Mapping whose "spacy_model" entry selects the
            part-of-speech model and the stop-word language.

    Returns:
        A configured (not yet fitted) ``BERTopic`` instance with the
        "KeyBERT", "Llama2", "MMR" and "POS" representation models.

    Raises:
        Exception: Any failure is shown with ``st.error`` and then re-raised.
    """
    try:
        # UMAP configuration
        umap_model = UMAP(
            n_neighbors=model_params['umap_n_neighbors'],
            n_components=model_params['umap_n_components'],
            min_dist=model_params['umap_min_dist'],
            metric='cosine',
            random_state=42
        )

        # HDBSCAN configuration
        hdbscan_model = HDBSCAN(
            min_cluster_size=model_params['min_cluster_size'],
            min_samples=model_params['min_samples'],
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=True
        )

        # CountVectorizer configuration; stop words follow the selected
        # language instead of always being English.
        vectorizer_model = CountVectorizer(
            stop_words=_stop_words_for(language_config["spacy_model"]),
            min_df=model_params['min_df'],
            max_df=model_params['max_df'],
            ngram_range=(model_params['ngram_min'], model_params['ngram_max'])
        )

        # Llama 2 configuration; the weights are cached so that changing a
        # slider does not reload the model on every analysis.
        model_id = 'meta-llama/Llama-2-7b-chat-hf'
        tokenizer, model = _load_llama_model(model_id)

        generator = transformers.pipeline(
            model=model,
            tokenizer=tokenizer,
            task='text-generation',
            temperature=model_params['llama_temperature'],
            max_new_tokens=model_params['llama_max_tokens'],
            repetition_penalty=model_params['llama_repetition_penalty']
        )

        # Prompt configuration
        system_prompt = """
        <s>[INST] <<SYS>>
        You are a helpful, respectful and honest assistant for labeling topics.
        <</SYS>>
        """

        example_prompt = """
        I have a topic that contains the following documents:
        - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
        - Meat, but especially beef, is the word food in terms of emissions.
        - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
        
        The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
        
        Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
        
        [/INST] Environmental impacts of eating meat
        """

        main_prompt = """
        [INST]
        I have a topic that contains the following documents:
        [DOCUMENTS]
        
        The topic is described by the following keywords: '[KEYWORDS]'.
        
        Based on the information about the topic above, please create a **short label** of this topic. 
        **Return only the label** and avoid adding any explanations or extra text such as 'topic'.
        [/INST]
        """

        prompt = system_prompt + example_prompt + main_prompt

        # Create representation models
        keybert_model = KeyBERTInspired()
        # Use the spaCy model that matches the selected language
        pos_model = PartOfSpeech(language_config["spacy_model"])
        mmr_model = MaximalMarginalRelevance(diversity=model_params['diversity_factor'])
        llama2 = TextGeneration(generator, prompt=prompt)

        representation_model = {
            "KeyBERT": keybert_model,
            "Llama2": llama2,
            "MMR": mmr_model,
            "POS": pos_model
        }

        return BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            representation_model=representation_model,
            top_n_words=model_params['top_n_words'],
            verbose=True
        )
    except Exception as e:
        st.error(f"Error creating topic model: {str(e)}")
        raise

# ------------------------------------------------------------------------------
# 5) Main analysis (the final analysis results are cached)
# ------------------------------------------------------------------------------
@st.cache_data(hash_funcs={
    SpanMarkerModel: lambda _: None,       # skip hashing for SpanMarker
    SentenceTransformer: lambda _: None      # skip hashing for SentenceTransformer
})
def run_analysis(df, model_filter, embedding_model, model_params, exclude_brand_keywords, language_config):
    """
    Run the full keyword-clustering analysis.

    Steps:
    - (optional) label keywords as 'Brand' or 'Unbranded' and drop the brands
    - create the embeddings
    - fit the topic model
    - build the results DataFrame

    Args:
        df: DataFrame with a 'Keyword' column and, optionally, a 'Volume'
            column. It is not modified; the function works on a copy.
        model_filter: SpanMarker model used when ``exclude_brand_keywords``
            is true.
        embedding_model: Model exposing ``encode``; also passed to the topic
            model.
        model_params: Parameter mapping forwarded to ``create_topic_model``.
        exclude_brand_keywords: When true, keywords labelled 'Brand' are
            removed before clustering.
        language_config: Language configuration forwarded to
            ``create_topic_model``.

    Returns:
        Tuple ``(topic_model, results_df)``, or ``(None, None)`` when no
        keyword is left to analyse.
    """
    # Work on a private copy so the caller's DataFrame is never mutated. Rows
    # without a keyword are dropped and keywords are coerced to text, because
    # NaN or numeric cells cannot be embedded.
    df = df.dropna(subset=['Keyword']).copy()
    df['Keyword'] = df['Keyword'].astype(str)

    # When brands must be excluded, label the keywords and filter them out
    if exclude_brand_keywords:
        df['Label'] = process_keywords(df, model_filter)
        filtered_df = df[df['Label'] == 'Unbranded']
    else:
        df['Label'] = "Unbranded"
        filtered_df = df

    filtered_keywords = filtered_df['Keyword'].tolist()

    if not filtered_keywords:
        st.warning("No keywords found for analysis (perhaps all were branded).")
        return None, None

    # Generate the embeddings
    embeddings = embedding_model.encode(filtered_keywords, show_progress_bar=True)

    # Create and fit the topic model (the language configuration is passed on)
    topic_model = create_topic_model(embedding_model, model_params, language_config)
    topics, probs = topic_model.fit_transform(filtered_keywords, embeddings)

    # Use the labels generated by Llama 2 as the final labels. Each aspect
    # value is a sequence of (text, score) pairs; the texts are concatenated
    # and surrounding whitespace is stripped. An empty sequence yields an
    # empty label instead of raising.
    llama_topic_labels = {
        topic: "".join(text for text, _ in values).strip()
        for topic, values in topic_model.topic_aspects_["Llama2"].items()
    }
    llama_topic_labels[-1] = "Outlier Topic"
    topic_model.set_topic_labels(llama_topic_labels)

    # Custom (Llama) labels and default BERTopic names, keyed by topic id
    topic_info = topic_model.get_topic_info()
    topic_labels = dict(zip(topic_info["Topic"], topic_info["CustomName"]))
    bert_labels = dict(zip(topic_info["Topic"], topic_info["Name"]))

    # Build the results DataFrame
    results_df = pd.DataFrame({
        "Keyword": filtered_keywords,
        "Topic ID": topics,
        "Confidence": probs
    })

    # Add the Llama and BERT labels; unknown topic ids are shown as outliers
    results_df["Llama label"] = [topic_labels.get(topic, "Outlier Topic") for topic in topics]
    results_df["BERT label"] = [bert_labels.get(topic, "Outlier Topic") for topic in topics]

    # Carry over the 'Volume' column when the CSV provides one
    if "Volume" in filtered_df.columns:
        results_df["Volume"] = filtered_df["Volume"].values

    return topic_model, results_df

# ------------------------------------------------------------------------------
# 6) Main Streamlit App
# ------------------------------------------------------------------------------
def main():
    """
    Render the Streamlit UI and run the keyword-clustering workflow.

    The sidebar collects the language, the CSV file and all model parameters.
    Once a CSV with a 'Keyword' column is uploaded and "Start Analysis" is
    clicked, the models are loaded, the analysis is run, and the results table,
    the topic visualizations and a CSV download button are shown.

    Invalid settings (empty row range, N-gram Min greater than N-gram Max) are
    reported before any expensive work starts.
    """
    st.title("🔍 Keywords Cluster for SEO")

    # ------------------------------------------------------------------------------
    # Sidebar: language selection and configuration
    # ------------------------------------------------------------------------------
    with st.sidebar:
        st.header("Configuration")

        # Language selection
        selected_language = st.selectbox(
            "Select Language",
            ["English (US)", "English (UK)", "Italiano", "Español", "Deutsch", "Français"],
            index=0,
            help="Seleziona la lingua per l'analisi. Questo imposterà il modello spaCy, il modello linguistico per il rilevamento e i parametri per DataForSEO."
        )
        language_config = get_language_config(selected_language)

        # File upload and row configuration
        uploaded_file = st.file_uploader(
            "Upload CSV file",
            type="csv",
            help="File must contain a 'Keyword' column"
        )

        with st.expander("CSV Reading Options"):
            min_rows = st.number_input(
                "Start reading from row",
                min_value=1,
                value=1,
                help="Define the first row of the CSV file from which data should be read."
            )
            max_rows = st.number_input(
                "Maximum rows to read",
                min_value=1,
                value=5000,
                help="Define how many rows in total to read from the CSV file, starting from the row defined above."
            )

        # Option to exclude brand keywords
        exclude_brands = st.checkbox(
            "Exclude Organization keywords",
            value=False,
            help="If enabled, organization-labeled keywords are excluded from the analysis. (ex. company ltd)"
        )

        # UMAP parameters
        with st.expander("UMAP Parameters"):
            umap_n_neighbors = st.slider("N Neighbors", 2, 100, 10)
            umap_n_components = st.slider("N Components", 2, 50, 2)
            umap_min_dist = st.slider("Min Distance", 0.0, 1.0, 0.0, 0.01)

        # HDBSCAN parameters
        with st.expander("HDBSCAN Parameters"):
            min_cluster_size = st.slider("Min Cluster Size", 2, 50, 5)
            min_samples = st.slider("Min Samples", 1, 20, 5)

        # Vectorizer parameters
        with st.expander("Vectorizer Parameters"):
            min_df_type = st.radio(
                "Min Document Frequency Type",
                ["Absolute", "Relative"],
                help="Absolute: minimum count of documents, Relative: minimum fraction of documents"
            )

            if min_df_type == "Absolute":
                min_df = st.number_input("Min Document Count", 1, 100, 2)
            else:
                min_df = st.slider("Min Document Fraction", 0.0, 0.5, 0.1, 0.01)

            max_df = st.slider(
                "Max Document Fraction",
                min_value=float(min_df) if isinstance(min_df, float) else 0.5,
                max_value=1.0,
                value=0.95,
                step=0.05
            )

            # An absolute min_df is a document count; a relative one is a
            # fraction and is shown as a percentage (0.1 -> "10%"). The
            # percent format rounds instead of truncating the float.
            if isinstance(min_df, int):
                min_df_text = f"{min_df} documents"
            else:
                min_df_text = f"{min_df:.0%} of documents"
            st.info(
                f"Terms must appear in at least {min_df_text} "
                f"and at most {max_df:.0%} of documents"
            )

            ngram_min = st.number_input("N-gram Min", 1, 3, 1)
            ngram_max = st.number_input("N-gram Max", 1, 3, 2)

        # Topic model parameters
        with st.expander("Topic Model Parameters"):
            top_n_words = st.slider("Top N Words", 5, 30, 10)
            diversity_factor = st.slider("Topic Diversity", 0.0, 1.0, 0.3)

        # Llama 2 parameters
        with st.expander("Llama 2 Parameters"):
            llama_temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
            llama_max_tokens = st.slider("Max Tokens", 50, 200, 100)
            llama_repetition_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1, 0.1)

        # Help section
        with st.expander("ℹ️ Help"):
            st.markdown("""
            **How to use this app:**
            1. Upload a CSV file with keywords
            2. Configure CSV reading options
            3. (Optionally) check "Exclude brand-labeled keywords"
            4. Adjust model parameters if needed
            5. Click 'Start Analysis'
            6. Wait for results to appear
            
            **Advanced Parameters:**
            - UMAP: Controls dimensionality reduction
            - HDBSCAN: Controls clustering behavior
            - Vectorizer: Controls text preprocessing
            - Topic Model: Controls topic generation
            - Llama 2: Controls topic labeling
            
            **Language Selection:**
            Selezionando la lingua verranno impostati:
            - Il modello spaCy da utilizzare (per es. 'en_core_web_sm' per English o 'it_core_news_sm' per Italiano)
            - Il modello linguistico per il rilevamento (SpanMarker) (sostituisci i placeholder con i modelli corretti se disponibili)
            - I parametri per DataForSEO (ad es. il codice lingua come 'en-us', 'it-it', ecc.)
            """)

    # ------------------------------------------------------------------------------
    # 7) Build the parameter dictionary for the topic model
    # ------------------------------------------------------------------------------
    model_params = {
        'umap_n_neighbors': umap_n_neighbors,
        'umap_n_components': umap_n_components,
        'umap_min_dist': umap_min_dist,
        'min_cluster_size': min_cluster_size,
        'min_samples': min_samples,
        'min_df': min_df,
        'max_df': max_df,
        'ngram_min': ngram_min,
        'ngram_max': ngram_max,
        'top_n_words': top_n_words,
        'diversity_factor': diversity_factor,
        'llama_temperature': llama_temperature,
        'llama_max_tokens': llama_max_tokens,
        'llama_repetition_penalty': llama_repetition_penalty
    }

    # ------------------------------------------------------------------------------
    # 8) Proceed only when a file has been uploaded
    # ------------------------------------------------------------------------------
    if uploaded_file is not None:
        try:
            # The number of rows to read is max_rows - min_rows + 1; reject an
            # empty range here instead of passing a non-positive nrows on.
            if max_rows < min_rows:
                st.error(
                    f"Row range is empty: 'Maximum rows to read' ({max_rows}) is smaller "
                    f"than 'Start reading from row' ({min_rows})."
                )
                return

            # Load the data (cached)
            df = load_csv(
                file=uploaded_file,
                skiprows=min_rows - 1,
                nrows=max_rows - min_rows + 1
            )

            if 'Keyword' not in df.columns:
                st.error("CSV must contain a 'Keyword' column")
                return

            # Data preview
            with st.expander("Preview Data", expanded=True):
                st.write(f"Reading rows {min_rows} to {max_rows}")
                st.dataframe(
                    df.head(),
                    use_container_width=True
                )
                st.write(f"Total rows loaded: {len(df)}")

            # Button that starts the analysis
            if st.button("Start Analysis", type="primary"):
                # Fail fast: an inverted n-gram range would otherwise only be
                # rejected by the vectorizer after the models have been loaded.
                if ngram_min > ngram_max:
                    st.error("'N-gram Min' must not be greater than 'N-gram Max'.")
                    return

                try:
                    # Load the models (cache_resource) for the selected language
                    with st.spinner("Loading models..."):
                        model_filter, embedding_model = load_models(language_config)

                    # Run the analysis (cache_data)
                    with st.spinner("Processing data..."):
                        topic_model, results_df = run_analysis(
                            df,
                            model_filter,
                            embedding_model,
                            model_params,
                            exclude_brand_keywords=exclude_brands,
                            language_config=language_config
                        )

                        if topic_model is None or results_df is None:
                            st.error("Analysis failed!")
                            return

                    # Show the configuration summary
                    with st.expander("Configuration Summary", expanded=False):
                        st.subheader("Model Parameters")
                        st.json(model_params)
                        st.subheader("Language Configuration")
                        st.json(language_config)

                    # ------------------------------------------------------------------------------
                    # 9) Show the results
                    # ------------------------------------------------------------------------------
                    st.write("### Results Table")
                    st.dataframe(results_df, use_container_width=True, hide_index=True)

                    # Interactive dashboard
                    st.write("### Interactive Topic Visualization")
                    try:
                        # Document plot based on the reduced embeddings
                        fig = topic_model.visualize_documents(
                            results_df['Keyword'].tolist(),
                            reduced_embeddings=topic_model.umap_model.embedding_,
                            hide_annotations=True,
                            hide_document_hover=False,
                            custom_labels=True
                        )
                        st.plotly_chart(fig, theme="streamlit", use_container_width=True)

                        # Topic overview
                        st.write("### Topic Overview")
                        try:
                            topic_fig = topic_model.visualize_topics(custom_labels=True)
                            st.plotly_chart(topic_fig, theme="streamlit", use_container_width=True)
                        except Exception as e:
                            st.error(f"Error creating topic visualization: {str(e)}")

                        # Topic bar chart
                        st.write("### Topic Distribution")
                        try:
                            n_topics = len(topic_model.get_topic_info())
                            n_topics = min(50, max(1, n_topics - 1))  # -1 for the outlier topic

                            barchart_fig = topic_model.visualize_barchart(
                                top_n_topics=n_topics,
                                custom_labels=True
                            )
                            st.plotly_chart(barchart_fig, theme="streamlit", use_container_width=True)
                        except Exception as e:
                            st.error(f"Error creating barchart visualization: {str(e)}")

                        # (A) Hierarchical view of the topics
                        st.write("### Hierarchical Topics")
                        try:
                            docs = results_df["Keyword"].tolist()
                            linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
                            hierarchical_topics = topic_model.hierarchical_topics(
                                docs,
                                linkage_function=linkage_function
                            )
                            # Hierarchy chart
                            fig_hierarchy = topic_model.visualize_hierarchy(
                                hierarchical_topics=hierarchical_topics,
                                custom_labels=True
                            )
                            st.plotly_chart(fig_hierarchy, theme="streamlit", use_container_width=True)

                            # (B) Textual view of the topic tree
                            st.write("### Hierarchical Topic Tree")
                            tree = topic_model.get_topic_tree(hierarchical_topics)
                            st.text(tree)  # Or st.code(tree) for a formatted block

                        except Exception as e:
                            st.error(f"Error creating hierarchical visualization: {str(e)}")

                        # Download the results as CSV
                        st.download_button(
                            label="Download Results",
                            data=results_df.to_csv(index=False),
                            file_name="keyword_analysis_results.csv",
                            mime="text/csv",
                            key="download_results"
                        )
                    except Exception as e:
                        st.error(f"An error occurred: {str(e)}")

                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
        except Exception as e:
            st.error(f"Error reading file: {str(e)}")

    else:
        # Initial message
        st.info("""
        👋 Welcome to the Keywords Cluster for SEO!
        
        1. Upload a CSV file with a column named **'Keyword'**.
        2. Adjust parameters in the sidebar if needed.
        3. Click **"Start Analysis"**.
        4. Explore the data.
        5. Download the results (this will refresh page).
        """)


# Call main() only when this file is run as the main module, not on import.
if __name__ == "__main__":
    main()