Spaces:

Cisconardi
/

Cluster_keywords

Paused

File size: 28,697 Bytes

9a46140
 
 
 
7f52dcc
9a46140
 
 
7f52dcc
 
 
ccbba65
7f52dcc
daba178
22e75d1
 
51d3872
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eac896
51d3872
 
 
 
4eac896
51d3872
 
 
 
4eac896
51d3872
 
 
 
4eac896
51d3872
 
 
 
 
 
8cad88e
51d3872
3fec205
6a4459c
3fec205
8cad88e
 
6a4459c
7677101
6a4459c
8cad88e
3fec205
 
51d3872
8844c67
 
 
 
 
 
95145c9
8844c67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56293b7
 
 
8844c67
 
95145c9
51d3872
93d3c59
51d3872
8844c67
51d3872
93d3c59
8844c67
 
51d3872
 
 
93d3c59
 
51d3872
93d3c59
51d3872
93d3c59
51d3872
93d3c59
51d3872
cda6a7a
93d3c59
8844c67
 
 
 
95145c9
51d3872
93d3c59
51d3872
8844c67
93d3c59
51d3872
93d3c59
 
 
51d3872
93d3c59
51d3872
93d3c59
 
c49c6cf
 
 
 
8844c67
 
 
 
 
 
 
93d3c59
22e75d1
 
 
 
 
8844c67
8cad88e
8844c67
 
c49c6cf
 
8844c67
 
 
8cad88e
51d3872
93d3c59
51d3872
 
 
a49ff7a
8844c67
01d80b0
 
 
 
 
 
5219269
56293b7
8844c67
56293b7
 
 
8844c67
 
 
01d80b0
56293b7
01d80b0
56293b7
 
01d80b0
 
 
 
 
56293b7
01d80b0
 
56293b7
 
 
01d80b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56293b7
 
01d80b0
56293b7
 
 
01d80b0
5219269
8844c67
 
 
 
 
 
9fd29fd
d3afc5f
8844c67
d3afc5f
 
 
8844c67
d3afc5f
8844c67
9fd29fd
d3afc5f
 
 
 
 
 
 
01d80b0
 
 
 
d0e3289
64b36f4
8844c67
95145c9
5219269
d3afc5f
8844c67
d3afc5f
 
51d3872
 
56293b7
8844c67
 
 
d3afc5f
 
 
 
8844c67
 
 
01d80b0
 
 
 
 
56293b7
01d80b0
 
0f26b5a
8844c67
95145c9
3fec205
51d3872
93d3c59
51d3872
bac45a2
 
51d3872
bac45a2
51d3872
c49c6cf
93d3c59
 
 
 
51d3872
c49c6cf
93d3c59
 
 
 
 
 
 
d3afc5f
93d3c59
 
 
 
 
 
 
 
 
51d3872
 
93d3c59
 
 
 
c49c6cf
93d3c59
 
 
 
 
 
 
06121b4
93d3c59
 
 
d3afc5f
93d3c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3afc5f
51d3872
93d3c59
51d3872
8cad88e
b3accd3
d0a53bf
51d3872
 
 
8cad88e
 
d3afc5f
51d3872
 
 
 
 
 
 
 
 
56293b7
8cad88e
 
 
 
9a46140
d3afc5f
56293b7
b3accd3
 
 
 
 
 
 
 
 
 
 
 
c49c6cf
 
 
b3accd3
c49c6cf
c7d2695
c49c6cf
56293b7
c49c6cf
56293b7
 
 
 
 
c49c6cf
56293b7
 
 
 
c49c6cf
56293b7
8dce56b
 
 
 
 
 
 
 
 
 
 
c49c6cf
 
 
 
 
 
 
8dce56b
c49c6cf
 
 
 
 
8dce56b
56293b7
 
 
c49c6cf
56293b7
 
 
 
c49c6cf
56293b7
 
 
 
 
d3afc5f
 
 
 
 
56293b7
c49c6cf
 
 
 
d3afc5f
56293b7
 
 
 
 
 
51d3872
 
 
 
 
 
d3afc5f
51d3872
 
 
 
56293b7
 
 
 
 
 
 
 
 
 
c97769e
2fa9381
56293b7
 
 
 
 
51d3872
93d3c59
51d3872
d3afc5f
 
93d3c59
 
 
 
 
 
56293b7
d3afc5f
 
 
 
 
 
56293b7
d3afc5f
 
93d3c59
d3afc5f
56293b7
8844c67
93d3c59
d3afc5f
 
51d3872
 
 
d3afc5f
93d3c59
51d3872
93d3c59
d3afc5f
bac45a2
 
c49c6cf
51d3872
 
d3afc5f
 
 
51d3872
d3afc5f
 
56293b7
 
51d3872
56293b7
51d3872
 
56293b7
51d3872
93d3c59
51d3872
93d3c59
 
 
 
 
 
 
 
 
 
 
 
 
c49c6cf
93d3c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64b36f4
22e75d1
64b36f4
22e75d1
 
 
 
 
 
 
64b36f4
22e75d1
 
 
 
 
64b36f4
 
 
 
51d3872
64b36f4
22e75d1
 
 
0ef6e2f
 
 
 
 
 
 
0b1f2f1
46c9a80
 
0b1f2f1
 
 
d3afc5f
 
6ee21d1
d3afc5f
93d3c59
d3afc5f
b3accd3
d3afc5f
93d3c59
 
 
b3accd3
 
d3afc5f
3fec205
93d3c59
3fec205
0b1f2f1

import streamlit as st
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from span_marker import SpanMarkerModel
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration, PartOfSpeech
from torch import cuda
from spacy.cli import download
import transformers
from torch import bfloat16
import os
import scipy.cluster.hierarchy as sch  # HIERARCHY

# ------------------------------------------------------------------------------
# Funzione per ottenere la configurazione della lingua
# ------------------------------------------------------------------------------
def get_language_config(selected_language):
    """
    Restituisce un dizionario di configurazione in base alla lingua selezionata.
    Include il modello spaCy, il modello linguistico per il rilevamento (SpanMarker)
    e i parametri per DataForSEO.
    """
    language_options = {
        "English (US)": {
            "spacy_model": "en_core_web_sm",
            "linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
            "dataforseo_params": {"language": "en-us"}
        },
        "English (UK)": {
            "spacy_model": "en_core_web_sm",  # spaCy non ha un modello UK specifico, si usa quello standard
            "linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
            "dataforseo_params": {"language": "en-gb"}
        },
        "Italiano": {
            "spacy_model": "it_core_news_sm",
            "linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",  # Sostituire con il modello appropriato se disponibile
            "dataforseo_params": {"language": "it-it"}
        },
        "Español": {
            "spacy_model": "es_core_news_sm",
            "linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
            "dataforseo_params": {"language": "es-es"}
        },
        "Deutsch": {
            "spacy_model": "de_core_news_sm",
            "linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
            "dataforseo_params": {"language": "de-de"}
        },
        "Français": {
            "spacy_model": "fr_core_news_sm",
            "linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
            "dataforseo_params": {"language": "fr-fr"}
        }
    }
    return language_options.get(selected_language, language_options["English (US)"])

# ------------------------------------------------------------------------------
# Configurazione della pagina
# ------------------------------------------------------------------------------
st.set_page_config(
    page_title="Keywords Cluster for SEO",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://www.linkedin.com/in/francisco-nardi-212b338b/',
        'Report a bug': "https://www.linkedin.com/in/francisco-nardi-212b338b/",
        'About': "# A simple keywords clustering tool for SEO purpose."
    }
)

# Inizializzazione della sessione (opzionale)
if 'model_loaded' not in st.session_state:
    st.session_state.model_loaded = False
if 'analysis_complete' not in st.session_state:
    st.session_state.analysis_complete = False
if 'current_step' not in st.session_state:
    st.session_state.current_step = 0

# Stili CSS personalizzati
st.markdown("""
    <style>
        .stProgress > div > div > div > div {
            background-color: #1f77b4;
        }
        .success-message {
            padding: 1rem;
            border-radius: 0.5rem;
            background-color: #d4edda;
            color: #155724;
            border: 1px solid #c3e6cb;
            margin-bottom: 1rem;
        }
        .info-box {
            padding: 1rem;
            border-radius: 0.5rem;
            background-color: #e2f0fd;
            border: 1px solid #b8daff;
            margin-bottom: 1rem;
        }
        .sidebar .sidebar-content {
            width: 400px !important;
        }
    </style>
    """, unsafe_allow_html=True)

# ------------------------------------------------------------------------------
# 1) Caricamento modelli con cache_resource
# ------------------------------------------------------------------------------
@st.cache_resource
def load_models(language_config):
    """Carica i modelli necessari con caching (una sola volta)."""
    with st.spinner("Loading models... This may take a few minutes."):
        try:
            # Scarica il modello spaCy in base alla lingua selezionata
            spacy_model_name = language_config["spacy_model"]
            download(spacy_model_name)
            
            # Modello SpanMarker: rilevazione entità (Brand/Unbranded)
            linguistic_model_name = language_config["linguistic_model"]
            if cuda.is_available():
                model_filter = SpanMarkerModel.from_pretrained(linguistic_model_name).cuda()
            else:
                model_filter = SpanMarkerModel.from_pretrained(linguistic_model_name)
            
            # Modello di embedding SentenceTransformer (resta invariato)
            embedding_model = SentenceTransformer("all-mpnet-base-v2")
            
            return model_filter, embedding_model
        except Exception as e:
            st.error(f"Error loading models: {str(e)}")
            raise

# ------------------------------------------------------------------------------
# 2) Lettura CSV con cache_data
# ------------------------------------------------------------------------------
@st.cache_data
def load_csv(file, skiprows, nrows):
    """Carica il CSV con caching."""
    df = pd.read_csv(file, skiprows=skiprows, nrows=nrows)
    return df

# ------------------------------------------------------------------------------
# 3) Funzione di etichettatura Brand/Unbranded con cache_data
# ------------------------------------------------------------------------------
@st.cache_data
def process_keywords(df, model_filter):
    """
    Rileva eventuali keyword di tipo 'Brand' utilizzando il modello SpanMarker.
    Ritorna la lista di etichette 'Brand' o 'Unbranded' per ciascuna keyword.
    """
    results = []
    total = len(df)
    progress_text = "Processing keywords..."
    progress_bar = st.progress(0, text=progress_text)
    
    for i, keyword in enumerate(df['Keyword']):
        try:
            entities = model_filter.predict([keyword])
            label = (
                "Brand"
                if entities and isinstance(entities[0], list) and any(entity.get("label") == "ORG" for entity in entities[0])
                else "Unbranded"
            )
            results.append(label)
        except Exception as e:
            st.error(f"Error processing keyword '{keyword}': {str(e)}")
            results.append("Unbranded")
        
        progress_bar.progress((i + 1) / total, text=f"{progress_text} ({i+1}/{total})")
    
    progress_bar.empty()
    return results

# ------------------------------------------------------------------------------
# 4) Creazione del modello di topic
# ------------------------------------------------------------------------------
def create_topic_model(embedding_model, model_params, language_config):
    """Crea e configura il modello di topic modeling."""
    try:
        # Configurazione quantizzazione per Hugging Face
        bnb_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16
        )

        # Configurazione UMAP
        umap_model = UMAP(
            n_neighbors=model_params['umap_n_neighbors'],
            n_components=model_params['umap_n_components'],
            min_dist=model_params['umap_min_dist'],
            metric='cosine',
            random_state=42
        )
        
        # Configurazione HDBSCAN
        hdbscan_model = HDBSCAN(
            min_cluster_size=model_params['min_cluster_size'],
            min_samples=model_params['min_samples'],
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=True
        )
        
        # Configurazione CountVectorizer
        vectorizer_model = CountVectorizer(
            stop_words="english",
            min_df=model_params['min_df'],
            max_df=model_params['max_df'],
            ngram_range=(model_params['ngram_min'], model_params['ngram_max'])
        )

        # Configurazione Llama 2
        model_id = 'meta-llama/Llama-2-7b-chat-hf'
        
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
        
        model = transformers.AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            quantization_config=bnb_config,
            device_map='auto',
        )
        model.eval()

        generator = transformers.pipeline(
            model=model, 
            tokenizer=tokenizer,
            task='text-generation',
            temperature=model_params['llama_temperature'],
            max_new_tokens=model_params['llama_max_tokens'],
            repetition_penalty=model_params['llama_repetition_penalty']
        )

        # Prompt configuration
        system_prompt = """
        <s>[INST] <<SYS>>
        You are a helpful, respectful and honest assistant for labeling topics.
        <</SYS>>
        """

        example_prompt = """
        I have a topic that contains the following documents:
        - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
        - Meat, but especially beef, is the word food in terms of emissions.
        - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
        
        The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
        
        Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
        
        [/INST] Environmental impacts of eating meat
        """

        main_prompt = """
        [INST]
        I have a topic that contains the following documents:
        [DOCUMENTS]
        
        The topic is described by the following keywords: '[KEYWORDS]'.
        
        Based on the information about the topic above, please create a **short label** of this topic. 
        **Return only the label** and avoid adding any explanations or extra text such as 'topic'.
        [/INST]
        """

        prompt = system_prompt + example_prompt + main_prompt
        
        # Create representation models
        keybert_model = KeyBERTInspired()
        # Utilizza il modello spaCy in base alla lingua selezionata
        pos_model = PartOfSpeech(language_config["spacy_model"])
        mmr_model = MaximalMarginalRelevance(diversity=model_params['diversity_factor'])
        llama2 = TextGeneration(generator, prompt=prompt)
        
        representation_model = {
            "KeyBERT": keybert_model,
            "Llama2": llama2,
            "MMR": mmr_model,
            "POS": pos_model
        }
        
        return BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            representation_model=representation_model,
            top_n_words=model_params['top_n_words'],
            verbose=True
        )
    except Exception as e:
        st.error(f"Error creating topic model: {str(e)}")
        raise

# ------------------------------------------------------------------------------
# 5) Analisi principale (cachiamo i risultati finali dell'analisi)
# ------------------------------------------------------------------------------
@st.cache_data(hash_funcs={
    SpanMarkerModel: lambda _: None,       # ignora hashing per SpanMarker
    SentenceTransformer: lambda _: None      # ignora hashing per SentenceTransformer
})
def run_analysis(df, model_filter, embedding_model, model_params, exclude_brand_keywords, language_config):
    """
    - Etichetta (facoltativo) come 'Brand' o 'Unbranded'
    - Filtra i brand se richiesto
    - Crea embeddings
    - Esegue il topic modeling
    - Restituisce il modello e il DataFrame dei risultati
    """
    # Se l'utente sceglie di escludere i brand, etichettiamo e filtriamo
    if exclude_brand_keywords:
        df['Label'] = process_keywords(df, model_filter)
        filtered_df = df[df['Label'] == 'Unbranded']
    else:
        df['Label'] = "Unbranded"
        filtered_df = df

    filtered_keywords = filtered_df['Keyword'].tolist()
    
    if not filtered_keywords:
        st.warning("No keywords found for analysis (perhaps all were branded).")
        return None, None
    
    # Genera embeddings
    embeddings = embedding_model.encode(filtered_keywords, show_progress_bar=True)
    
    # Crea e applica topic model (passando anche la configurazione della lingua)
    topic_model = create_topic_model(embedding_model, model_params, language_config)
    topics, probs = topic_model.fit_transform(filtered_keywords, embeddings)
    
    # Ottieni gli embeddings ridotti per la visualizzazione
    reduced_embeddings = topic_model.umap_model.embedding_

    # Usa i label generati da Llama 2 come label finali
    llama_topic_labels = {
        topic: "".join(list(zip(*values))[0]) 
        for topic, values in topic_model.topic_aspects_["Llama2"].items()
    }
    llama_topic_labels[-1] = "Outlier Topic"
    topic_model.set_topic_labels(llama_topic_labels)

    # Ottieni le informazioni sui topic
    topic_info = topic_model.get_topic_info()
    topic_labels = dict(zip(topic_info["Topic"], topic_info["CustomName"]))      
    
    # Ottieni le informazioni di default BERT
    bert_labels = dict(zip(topic_info["Topic"], topic_info["Name"]))
    
    # Creiamo il DataFrame dei risultati
    results_df = pd.DataFrame({
        "Keyword": filtered_keywords,
        "Topic ID": topics,
        "Confidence": probs
    })
    
    # Aggiungiamo le label Llama e BERT
    results_df["Llama label"] = [
        topic_labels[topic] if topic in topic_labels else "Outlier Topic" 
        for topic in topics
    ]
    results_df["BERT label"] = [
        bert_labels[topic] if topic in bert_labels else "Outlier Topic" 
        for topic in topics
    ]
    
    # Se nel CSV c'è una colonna 'Volume', la aggiungiamo
    if "Volume" in filtered_df.columns:
        results_df["Volume"] = filtered_df["Volume"].values

    return topic_model, results_df

# ------------------------------------------------------------------------------
# 6) Main Streamlit App
# ------------------------------------------------------------------------------
def main():
    st.title("🔍 Keywords Cluster for SEO")
    
    # ------------------------------------------------------------------------------
    # Sidebar: Selezione della lingua e configurazioni
    # ------------------------------------------------------------------------------
    with st.sidebar:
        st.header("Configuration")
        
        # Selezione della lingua
        selected_language = st.selectbox(
            "Select Language",
            ["English (US)", "English (UK)", "Italiano", "Español", "Deutsch", "Français"],
            index=0,
            help="Seleziona la lingua per l'analisi. Questo imposterà il modello spaCy, il modello linguistico per il rilevamento e i parametri per DataForSEO."
        )
        language_config = get_language_config(selected_language)
        
        # File upload e configurazione righe
        uploaded_file = st.file_uploader(
            "Upload CSV file",
            type="csv",
            help="File must contain a 'Keyword' column"
        )
        
        with st.expander("CSV Reading Options"):
            min_rows = st.number_input(
                "Start reading from row",
                min_value=1,
                value=1,
                help="Define the first row of the CSV file from which data should be read."
            )
            max_rows = st.number_input(
                "Maximum rows to read",
                min_value=1,
                value=5000,
                help="Define how many rows in total to read from the CSV file, starting from the row defined above."
            )
        
        # Opzione per escludere keyword brand
        exclude_brands = st.checkbox(
            "Exclude Organization keywords",
            value=False,
            help="If enabled, organization-labeled keywords are excluded from the analysis. (ex. company ltd)"
        )
            
        # Parametri UMAP
        with st.expander("UMAP Parameters"):
            umap_n_neighbors = st.slider("N Neighbors", 2, 100, 10)
            umap_n_components = st.slider("N Components", 2, 50, 2)
            umap_min_dist = st.slider("Min Distance", 0.0, 1.0, 0.0, 0.01)
            
        # Parametri HDBSCAN
        with st.expander("HDBSCAN Parameters"):
            min_cluster_size = st.slider("Min Cluster Size", 2, 50, 5)
            min_samples = st.slider("Min Samples", 1, 20, 5)
            
        # Parametri Vectorizer
        with st.expander("Vectorizer Parameters"):
            min_df_type = st.radio(
                "Min Document Frequency Type",
                ["Absolute", "Relative"],
                help="Absolute: minimum count of documents, Relative: minimum fraction of documents"
            )
            
            if min_df_type == "Absolute":
                min_df = st.number_input("Min Document Count", 1, 100, 2)
            else:
                min_df = st.slider("Min Document Fraction", 0.0, 0.5, 0.1, 0.01)
            
            max_df = st.slider(
                "Max Document Fraction", 
                min_value=float(min_df) if isinstance(min_df, float) else 0.5,
                max_value=1.0,
                value=0.95,
                step=0.05
            )
            
            st.info(
                f"Documents must appear in at least {min_df} "
                f"{'documents' if isinstance(min_df, int) else '% of documents'} "
                f"and at most {int(max_df * 100)}% of documents"
            )
            
            ngram_min = st.number_input("N-gram Min", 1, 3, 1)
            ngram_max = st.number_input("N-gram Max", 1, 3, 2)
            
        # Parametri Topic Model
        with st.expander("Topic Model Parameters"):
            top_n_words = st.slider("Top N Words", 5, 30, 10)
            diversity_factor = st.slider("Topic Diversity", 0.0, 1.0, 0.3)
            
        # Parametri Llama 2
        with st.expander("Llama 2 Parameters"):
            llama_temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
            llama_max_tokens = st.slider("Max Tokens", 50, 200, 100)
            llama_repetition_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1, 0.1)
            
        # Help section
        with st.expander("ℹ️ Help"):
            st.markdown("""
            **How to use this app:**
            1. Upload a CSV file with keywords
            2. Configure CSV reading options
            3. (Optionally) check "Exclude brand-labeled keywords"
            4. Adjust model parameters if needed
            5. Click 'Start Analysis'
            6. Wait for results to appear
            
            **Advanced Parameters:**
            - UMAP: Controls dimensionality reduction
            - HDBSCAN: Controls clustering behavior
            - Vectorizer: Controls text preprocessing
            - Topic Model: Controls topic generation
            - Llama 2: Controls topic labeling
            
            **Language Selection:**
            Selezionando la lingua verranno impostati:
            - Il modello spaCy da utilizzare (per es. 'en_core_web_sm' per English o 'it_core_news_sm' per Italiano)
            - Il modello linguistico per il rilevamento (SpanMarker) (sostituisci i placeholder con i modelli corretti se disponibili)
            - I parametri per DataForSEO (ad es. il codice lingua come 'en-us', 'it-it', ecc.)
            """)
    
    # ------------------------------------------------------------------------------
    # 7) Prepariamo dizionario parametri per il topic model
    # ------------------------------------------------------------------------------
    model_params = {
        'umap_n_neighbors': umap_n_neighbors,
        'umap_n_components': umap_n_components,
        'umap_min_dist': umap_min_dist,
        'min_cluster_size': min_cluster_size,
        'min_samples': min_samples,
        'min_df': min_df,
        'max_df': max_df,
        'ngram_min': ngram_min,
        'ngram_max': ngram_max,
        'top_n_words': top_n_words,
        'diversity_factor': diversity_factor,
        'llama_temperature': llama_temperature,
        'llama_max_tokens': llama_max_tokens,
        'llama_repetition_penalty': llama_repetition_penalty
    }
    
    # ------------------------------------------------------------------------------
    # 8) Se abbiamo caricato un file, procediamo
    # ------------------------------------------------------------------------------
    if uploaded_file is not None:
        try:
            # Carica dati con caching
            df = load_csv(
                file=uploaded_file,
                skiprows=min_rows - 1,
                nrows=max_rows - min_rows + 1
            )
            
            if 'Keyword' not in df.columns:
                st.error("CSV must contain a 'Keyword' column")
                return
            
            # Preview dati
            with st.expander("Preview Data", expanded=True):
                st.write(f"Reading rows {min_rows} to {max_rows}")
                st.dataframe(
                    df.head(),
                    use_container_width=True
                )
                st.write(f"Total rows loaded: {len(df)}")
            
            # Pulsante per avviare l'analisi
            if st.button("Start Analysis", type="primary"):
                try:
                    # Carichiamo i modelli (cache_resource) con la configurazione della lingua
                    with st.spinner("Loading models..."):
                        model_filter, embedding_model = load_models(language_config)
                    
                    # Eseguiamo l'analisi (cache_data)
                    with st.spinner("Processing data..."):
                        topic_model, results_df = run_analysis(
                            df,
                            model_filter,
                            embedding_model,
                            model_params,
                            exclude_brand_keywords=exclude_brands,
                            language_config=language_config
                        )
                        
                        if topic_model is None or results_df is None:
                            st.error("Analysis failed!")
                            return
                    
                    # Visualizza riepilogo configurazione
                    with st.expander("Configuration Summary", expanded=False):
                        st.subheader("Model Parameters")
                        st.json(model_params)
                        st.subheader("Language Configuration")
                        st.json(language_config)
                    
                    # ------------------------------------------------------------------------------
                    # 9) Mostra risultati
                    # ------------------------------------------------------------------------------
                    st.write("### Results Table")
                    st.dataframe(results_df, use_container_width=True, hide_index=True)
                    
                    # Visualizza la dashboard interattiva
                    st.write("### Interactive Topic Visualization")
                    try:
                        # Embedding ridotto
                        fig = topic_model.visualize_documents(
                            results_df['Keyword'].tolist(),
                            reduced_embeddings=topic_model.umap_model.embedding_,
                            hide_annotations=True,
                            hide_document_hover=False,
                            custom_labels=True
                        )
                        st.plotly_chart(fig, theme="streamlit", use_container_width=True)

                        # Visualizzazione dei topic
                        st.write("### Topic Overview")
                        try:
                            topic_fig = topic_model.visualize_topics(custom_labels=True)
                            st.plotly_chart(topic_fig, theme="streamlit", use_container_width=True)
                        except Exception as e:
                            st.error(f"Error creating topic visualization: {str(e)}")

                        # Visualizzazione barchart dei topic
                        st.write("### Topic Distribution")
                        try:
                            n_topics = len(topic_model.get_topic_info())
                            n_topics = min(50, max(1, n_topics - 1))  # -1 per outlier
                            
                            barchart_fig = topic_model.visualize_barchart(
                                top_n_topics=n_topics,
                                custom_labels=True
                            )
                            st.plotly_chart(barchart_fig, theme="streamlit", use_container_width=True)
                        except Exception as e:
                            st.error(f"Error creating barchart visualization: {str(e)}")
                        
                        # (A) AGGIUNTA: Visualizzazione gerarchica dei topic
                        st.write("### Hierarchical Topics")
                        try:
                            docs = results_df["Keyword"].tolist()
                            linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
                            hierarchical_topics = topic_model.hierarchical_topics(
                                docs,
                                linkage_function=linkage_function
                            )
                            # Grafico gerarchico
                            fig_hierarchy = topic_model.visualize_hierarchy(
                                hierarchical_topics=hierarchical_topics,
                                custom_labels=True
                            )
                            st.plotly_chart(fig_hierarchy, theme="streamlit", use_container_width=True)
                        
                            # (B) AGGIUNTA: Visualizzazione testuale dell'albero
                            st.write("### Hierarchical Topic Tree")
                            tree = topic_model.get_topic_tree(hierarchical_topics)
                            st.text(tree)  # Oppure st.code(tree) per un blocco formattato
                        
                        except Exception as e:
                            st.error(f"Error creating hierarchical visualization: {str(e)}")
                    
                        # Download risultati in CSV
                        st.download_button(
                            label="Download Results",
                            data=results_df.to_csv(index=False),
                            file_name="keyword_analysis_results.csv",
                            mime="text/csv",
                            key="download_results"
                        )
                    except Exception as e:
                        st.error(f"An error occurred: {str(e)}")
                        
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
        except Exception as e:
            st.error(f"Error reading file: {str(e)}")

    else:
        # Messaggio iniziale
        st.info("""
        👋 Welcome to the Keywords Cluster for SEO!
        
        1. Upload a CSV file with a column named **'Keyword'**.
        2. Adjust parameters in the sidebar if needed.
        3. Click **"Start Analysis"**.
        4. Explore the data.
        5. Download the results (this will refresh page).
        """)


if __name__ == "__main__":
    main()