Cisconardi committed · Commit 93d3c59 · verified · 1 Parent(s): 0c5c9fa

Update app.py

Files changed (1):
  1. app.py +188 -159
app.py CHANGED
@@ -25,7 +25,7 @@ st.set_page_config(
     }
 )
 
-# Session initialization
+# Session initialization (optional, useful if you want to track extra state)
 if 'model_loaded' not in st.session_state:
     st.session_state.model_loaded = False
 if 'analysis_complete' not in st.session_state:
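The hunk above relies on Streamlit's rerun model: the whole script runs top to bottom on every interaction, so session-state keys must be seeded only when absent. A minimal standalone sketch of that pattern (the `counter` key is a toy example, not from app.py):

```python
import streamlit as st

# Seed the key only once; later reruns keep the stored value.
if 'counter' not in st.session_state:
    st.session_state.counter = 0

if st.button("Increment"):
    st.session_state.counter += 1  # persists across the rerun the click triggers

st.write(f"counter = {st.session_state.counter}")
```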
@@ -60,43 +60,69 @@ st.markdown("""
     </style>
 """, unsafe_allow_html=True)
 
-# Utility functions
+
+#
+# 1) Model loading with cache_resource
+#
 @st.cache_resource
 def load_models():
-    """Load the required models, with caching."""
+    """Load the required models, with caching (only once)."""
     with st.spinner("Loading models... This may take a few minutes."):
         try:
+            # Download en_core_web_sm if not already present (for PartOfSpeech)
             download("en_core_web_sm")
-            model_filter = SpanMarkerModel.from_pretrained(
-                "nbroad/span-marker-xdistil-l12-h384-orgs-v3"
-            ).cuda() if cuda.is_available() else SpanMarkerModel.from_pretrained(
-                "nbroad/span-marker-xdistil-l12-h384-orgs-v3")
+
+            # SpanMarker model: entity detection (Brand/Unbranded)
+            if cuda.is_available():
+                model_filter = SpanMarkerModel.from_pretrained(
+                    "nbroad/span-marker-xdistil-l12-h384-orgs-v3"
+                ).cuda()
+            else:
+                model_filter = SpanMarkerModel.from_pretrained(
+                    "nbroad/span-marker-xdistil-l12-h384-orgs-v3"
+                )
+
+            # SentenceTransformer embedding model
             embedding_model = SentenceTransformer("all-mpnet-base-v2")
+
             return model_filter, embedding_model
         except Exception as e:
            st.error(f"Error loading models: {str(e)}")
            raise
 
+
+#
+# 2) CSV loading with cache_data
+#
 @st.cache_data
-def process_keywords(df, _model_filter):
+def load_csv(file, skiprows, nrows):
+    """Load the CSV with caching, so that if the user reruns the app or
+    downloads the results, Streamlit does not re-read the file from scratch
+    (as long as it has not changed)."""
+    df = pd.read_csv(file, skiprows=skiprows, nrows=nrows)
+    return df
+
+
+#
+# 3) Brand/Unbranded labelling with cache_data
+#
+@st.cache_data
+def process_keywords(df, _model_filter):  # leading "_" keeps the model out of the cache key
     """
     Detect any 'Brand' keywords using the SpanMarker model.
     Return a list of 'Brand' or 'Unbranded' labels, one per keyword.
     """
     results = []
     total = len(df)
-
     progress_text = "Processing keywords..."
     progress_bar = st.progress(0, text=progress_text)
 
     for i, keyword in enumerate(df['Keyword']):
         try:
             entities = _model_filter.predict([keyword])
             label = "Brand" if entities and isinstance(entities[0], list) and \
                 any(entity.get("label") == "ORG" for entity in entities[0]) else "Unbranded"
             results.append(label)
         except Exception as e:
-            # If entity detection fails, label as 'Unbranded' by default
             st.error(f"Error processing keyword '{keyword}': {str(e)}")
             results.append("Unbranded")
 
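One detail worth flagging in the hunk above: `@st.cache_data` builds its cache key by hashing every argument, and it raises `UnhashableParamError` for objects it cannot hash, such as a loaded model. A leading underscore in the parameter name tells Streamlit to skip that argument, which is why `_model_filter` keeps its underscore. A self-contained sketch (the `label_one` helper is hypothetical):

```python
import streamlit as st

@st.cache_data
def label_one(keyword: str, _model):
    # keyword is hashed into the cache key; _model is skipped entirely,
    # so repeating the same keyword hits the cache instead of the model.
    return _model.predict([keyword])
```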
@@ -105,8 +131,13 @@ def process_keywords(df, _model_filter):
     progress_bar.empty()
     return results
 
+
+#
+# 4) Topic model creation
+#
 def create_topic_model(embedding_model, model_params):
-    """Create and configure the topic modeling model."""
+    """Create and configure the topic model (not cached, since it
+    can depend on many parameters)."""
     try:
         # Quantization configuration for Hugging Face
         bnb_config = transformers.BitsAndBytesConfig(
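The hunk cuts off right at the `BitsAndBytesConfig` call, so the committed quantization arguments are not visible here. For reference, a typical 4-bit configuration for a Llama 2 labeling model looks roughly like this (an assumption, not the values actually in app.py):

```python
import torch
import transformers

# A common 4-bit setup; app.py's real arguments are not shown in this hunk.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bfloat16
)
```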
@@ -224,129 +255,87 @@ def create_topic_model(embedding_model, model_params):
         st.error(f"Error creating topic model: {str(e)}")
         raise
 
-def process_data(df, model_filter, embedding_model, model_params, exclude_brand_keywords=False):
+
+#
+# 5) Main analysis (the final analysis results are cached)
+#
+@st.cache_data
+def run_analysis(df, _model_filter, _embedding_model, model_params, exclude_brand_keywords):
     """
-    Process the data and build the topic model.
-
-    If exclude_brand_keywords is True, apply the "Brand"/"Unbranded" labelling and
-    remove the 'Brand' keywords from the dataset before clustering.
-    Otherwise, include all keywords.
+    - Optionally label keywords as 'Brand' or 'Unbranded'
+    - Filter out brand keywords if requested
+    - Build embeddings
+    - Run the topic modeling
+    - Return the model + results_df
     """
-    try:
-        # If the user chooses to exclude brands, label and filter
-        if exclude_brand_keywords:
-            df['Label'] = process_keywords(df, model_filter)
-            filtered_df = df[df['Label'] == 'Unbranded']
-        else:
-            # Skip the classification when it is not needed
-            df['Label'] = "Unbranded"
-            filtered_df = df
-
-        filtered_keywords = filtered_df['Keyword'].tolist()
-
-        if not filtered_keywords:
-            st.warning("No keywords found for analysis (perhaps all were branded).")
-            return None, None
-
-        # Generate embeddings
-        embeddings = embedding_model.encode(filtered_keywords, show_progress_bar=True)
-
-        # Create and fit the topic model
-        topic_model = create_topic_model(embedding_model, model_params)
-        topics, probs = topic_model.fit_transform(filtered_keywords, embeddings)
-
-        # Get the reduced embeddings for the visualization
-        reduced_embeddings = topic_model.umap_model.embedding_
-
-        # Use the labels generated by Llama 2 (TextGeneration) as the final labels
-        llama_topic_labels = {
-            topic: "".join(list(zip(*values))[0])
-            for topic, values in topic_model.topic_aspects_["Llama2"].items()
-        }
-        llama_topic_labels[-1] = "Outlier Topic"
-        topic_model.set_topic_labels(llama_topic_labels)
-
-        # Get the topic information
-        topic_info = topic_model.get_topic_info()
-        topic_labels = dict(zip(topic_info["Topic"], topic_info["CustomName"]))
-
-        # Get the default BERT labels
-        bert_labels = dict(zip(topic_info["Topic"], topic_info["Name"]))
-
-        # Build the results DataFrame
-        results_df = pd.DataFrame({
-            "Keyword": filtered_keywords,
-            "Topic ID": topics,
-            "Confidence": probs
-        })
-
-        # Add the Llama and BERT labels
-        results_df["Llama label"] = [
-            topic_labels[topic] if topic in topic_labels else "Outlier Topic"
-            for topic in topics
-        ]
-        results_df["BERT label"] = [
-            bert_labels[topic] if topic in bert_labels else "Outlier Topic"
-            for topic in topics
-        ]
-
-        # If the CSV has a 'Volume' column, add it
-        if "Volume" in filtered_df.columns:
-            results_df["Volume"] = filtered_df["Volume"].values
-
-        # Show results
-        st.write("### Results Table")
-        st.dataframe(results_df, use_container_width=True, hide_index=True)
-
-        # Show the interactive dashboard
-        st.write("### Interactive Topic Visualization")
-        try:
-            fig = topic_model.visualize_documents(
-                filtered_keywords,
-                reduced_embeddings=reduced_embeddings,
-                hide_annotations=True,
-                hide_document_hover=False,
-                custom_labels=True
-            )
-            st.plotly_chart(fig, theme="streamlit", use_container_width=True)
-
-            # Topic visualization
-            st.write("### Topic Overview")
-            try:
-                topic_fig = topic_model.visualize_topics(custom_labels=True)
-                st.plotly_chart(topic_fig, theme="streamlit", use_container_width=True)
-            except Exception as e:
-                st.error(f"Error creating topic visualization: {str(e)}")
-
-            # Topic barchart visualization
-            st.write("### Topic Distribution")
-            try:
-                # Compute the number of topics to display
-                n_topics = len(topic_model.get_topic_info())
-                n_topics = min(50, max(1, n_topics - 1))  # -1 to exclude the outlier topic if present
-
-                barchart_fig = topic_model.visualize_barchart(
-                    top_n_topics=n_topics,
-                    custom_labels=True
-                )
-                st.plotly_chart(barchart_fig, theme="streamlit", use_container_width=True)
-            except Exception as e:
-                st.error(f"Error creating barchart visualization: {str(e)}")
-
-        except Exception as e:
-            st.error(f"Error creating visualization: {str(e)}")
-
-        return topic_model, results_df
-
-    except Exception as e:
-        st.error(f"Error processing data: {str(e)}")
-        return None, None
+    # If the user chooses to exclude brands, label and filter
+    if exclude_brand_keywords:
+        df['Label'] = process_keywords(df, _model_filter)
+        filtered_df = df[df['Label'] == 'Unbranded']
+    else:
+        df['Label'] = "Unbranded"
+        filtered_df = df
+
+    filtered_keywords = filtered_df['Keyword'].tolist()
+
+    if not filtered_keywords:
+        st.warning("No keywords found for analysis (perhaps all were branded).")
+        return None, None
+
+    # Generate embeddings
+    embeddings = _embedding_model.encode(filtered_keywords, show_progress_bar=True)
+
+    # Create and fit the topic model
+    topic_model = create_topic_model(_embedding_model, model_params)
+    topics, probs = topic_model.fit_transform(filtered_keywords, embeddings)
+
+    # Get the reduced embeddings for the visualization
+    reduced_embeddings = topic_model.umap_model.embedding_
+
+    # Use the labels generated by Llama 2 as the final labels
+    llama_topic_labels = {
+        topic: "".join(list(zip(*values))[0])
+        for topic, values in topic_model.topic_aspects_["Llama2"].items()
+    }
+    llama_topic_labels[-1] = "Outlier Topic"
+    topic_model.set_topic_labels(llama_topic_labels)
+
+    # Get the topic information
+    topic_info = topic_model.get_topic_info()
+    topic_labels = dict(zip(topic_info["Topic"], topic_info["CustomName"]))
+
+    # Get the default BERT labels
+    bert_labels = dict(zip(topic_info["Topic"], topic_info["Name"]))
+
+    # Build the results DataFrame
+    results_df = pd.DataFrame({
+        "Keyword": filtered_keywords,
+        "Topic ID": topics,
+        "Confidence": probs
+    })
+
+    # Add the Llama and BERT labels
+    results_df["Llama label"] = [
+        topic_labels[topic] if topic in topic_labels else "Outlier Topic"
+        for topic in topics
+    ]
+    results_df["BERT label"] = [
+        bert_labels[topic] if topic in bert_labels else "Outlier Topic"
+        for topic in topics
+    ]
+
+    # If the CSV has a 'Volume' column, add it
+    if "Volume" in filtered_df.columns:
+        results_df["Volume"] = filtered_df["Volume"].values
+
+    return topic_model, results_df
 
 
+#
+# 6) Main Streamlit App
+#
 def main():
-    st.title("🔍 NLP Keyword Analysis")
-
-    topic_model = None  # initialize topic_model here
+    st.title("🔍 NLP Keyword Analysis with Cache")
 
     # Sidebar with configuration
     with st.sidebar:
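For readers unfamiliar with the BERTopic calls that `run_analysis` strings together, here is a stripped-down, runnable sketch of the same flow: encode once, reuse the embeddings in `fit_transform`, then override the generated topic names. The sample documents and UMAP settings are illustrative only, and the Llama 2 representation model is left out:

```python
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

docs = ["red running shoes", "trail running shoes", "marathon training plan",
        "espresso machine", "drip coffee maker", "cold brew recipe"] * 20

embedder = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedder.encode(docs, show_progress_bar=False)

# Small-data UMAP settings so this toy corpus clusters at all.
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0,
                  init="random", random_state=42)
topic_model = BERTopic(embedding_model=embedder, umap_model=umap_model)

topics, probs = topic_model.fit_transform(docs, embeddings)  # embeddings reused

# Same mechanism the app uses for its Llama-generated names.
topic_model.set_topic_labels({-1: "Outlier Topic"})
print(topic_model.get_topic_info()[["Topic", "Count", "Name"]])
```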
@@ -439,9 +428,12 @@ def main():
         - Vectorizer: Controls text preprocessing
         - Topic Model: Controls topic generation
         - Llama 2: Controls topic labeling
+
+        **Caching:**
+        - With the `@st.cache_data` and `@st.cache_resource` decorators, expensive recomputation is avoided when the app reloads.
         """)
 
-    # Collect the parameters into a dictionary
+    # 7) Prepare the parameter dictionary
     model_params = {
         'umap_n_neighbors': umap_n_neighbors,
         'umap_n_components': umap_n_components,
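Passing `model_params` into the cached `run_analysis` is what makes the sidebar sliders cache-aware: `st.cache_data` folds every hashable argument into the cache key, so any changed value forces a recomputation while identical settings hit the cache. A toy sketch of that behavior (the `expensive` function is hypothetical):

```python
import streamlit as st

@st.cache_data
def expensive(params: dict) -> str:
    # Recomputed only when the params dict (part of the cache key) changes.
    return f"computed with {sorted(params.items())}"

st.write(expensive({'umap_n_neighbors': 15}))
```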
@@ -459,10 +451,15 @@ def main():
         'llama_repetition_penalty': llama_repetition_penalty
     }
 
+    # 8) If a file has been uploaded, proceed
     if uploaded_file is not None:
         try:
-            # Load data with the specified number of rows
-            df = pd.read_csv(uploaded_file, skiprows=min_rows - 1, nrows=max_rows - min_rows + 1)
+            # Load data with caching
+            df = load_csv(
+                file=uploaded_file,
+                skiprows=min_rows - 1,
+                nrows=max_rows - min_rows + 1
+            )
 
             if 'Keyword' not in df.columns:
                 st.error("CSV must contain a 'Keyword' column")
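A caveat on the row window passed to `load_csv` above: with an integer `skiprows`, pandas drops the first `min_rows - 1` physical lines, header included, so for `min_rows > 1` a data row silently becomes the header. If the intent is to skip data rows while keeping the header, a range does it (a sketch, assuming the same 1-based `min_rows`/`max_rows` as the app):

```python
import pandas as pd

def load_window(file, min_rows: int, max_rows: int) -> pd.DataFrame:
    # range(1, min_rows) skips physical lines 1..min_rows-1 and keeps
    # line 0, the header; nrows then bounds the data rows that follow.
    return pd.read_csv(
        file,
        skiprows=range(1, min_rows),
        nrows=max_rows - min_rows + 1,
    )
```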
@@ -473,27 +470,21 @@ def main():
             st.write(f"Reading rows {min_rows} to {max_rows}")
             st.dataframe(
                 df.head(),
-                use_container_width=True,
-                column_config={
-                    "Keyword": st.column_config.TextColumn(
-                        "Keyword",
-                        help="Input keywords for analysis"
-                    )
-                }
+                use_container_width=True
             )
             st.write(f"Total rows loaded: {len(df)}")
 
-            # Analysis
+            # Button to start the analysis
             if st.button("Start Analysis", type="primary"):
                 try:
-                    # Load models
+                    # Load the models (cache_resource)
                     with st.status("Loading models...", expanded=True) as status:
                         model_filter, embedding_model = load_models()
                         status.update(label="Models loaded successfully!", state="complete")
 
-                    # Process data
+                    # Run the analysis (cache_data)
                     with st.status("Processing data...", expanded=True) as status:
-                        topic_model, results_df = process_data(
+                        topic_model, results_df = run_analysis(
                             df,
                             model_filter,
                             embedding_model,
@@ -511,15 +502,56 @@ def main():
                     with st.expander("Configuration Summary", expanded=False):
                         st.json(model_params)
 
-                    # Download results
-                    if results_df is not None:
-                        st.download_button(
-                            label="Download Results",
-                            data=results_df.to_csv(index=False),
-                            file_name="keyword_analysis_results.csv",
-                            mime="text/csv",
-                            key="download_results"
-                        )
+                    # 9) Show results
+                    st.write("### Results Table")
+                    st.dataframe(results_df, use_container_width=True, hide_index=True)
+
+                    # Show the interactive dashboard
+                    st.write("### Interactive Topic Visualization")
+                    try:
+                        # Reduced embeddings
+                        fig = topic_model.visualize_documents(
+                            results_df['Keyword'].tolist(),
+                            reduced_embeddings=topic_model.umap_model.embedding_,
+                            hide_annotations=True,
+                            hide_document_hover=False,
+                            custom_labels=True
+                        )
+                        st.plotly_chart(fig, theme="streamlit", use_container_width=True)
+
+                        # Topic visualization
+                        st.write("### Topic Overview")
+                        try:
+                            topic_fig = topic_model.visualize_topics(custom_labels=True)
+                            st.plotly_chart(topic_fig, theme="streamlit", use_container_width=True)
+                        except Exception as e:
+                            st.error(f"Error creating topic visualization: {str(e)}")
+
+                        # Topic barchart visualization
+                        st.write("### Topic Distribution")
+                        try:
+                            n_topics = len(topic_model.get_topic_info())
+                            n_topics = min(50, max(1, n_topics - 1))  # -1 for the outlier topic
+
+                            barchart_fig = topic_model.visualize_barchart(
+                                top_n_topics=n_topics,
+                                custom_labels=True
+                            )
+                            st.plotly_chart(barchart_fig, theme="streamlit", use_container_width=True)
+                        except Exception as e:
+                            st.error(f"Error creating barchart visualization: {str(e)}")
+
+                    except Exception as e:
+                        st.error(f"Error creating visualization: {str(e)}")
+
+                    # Download results as CSV
+                    st.download_button(
+                        label="Download Results",
+                        data=results_df.to_csv(index=False),
+                        file_name="keyword_analysis_results.csv",
+                        mime="text/csv",
+                        key="download_results"
+                    )
 
                 except Exception as e:
                     st.error(f"An error occurred during analysis: {str(e)}")
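The new download button re-serializes `results_df` on every rerun. The pattern from the Streamlit docs caches that conversion as well, which fits this commit's caching theme (a sketch; `to_csv_bytes` is not in app.py):

```python
import pandas as pd
import streamlit as st

@st.cache_data
def to_csv_bytes(df: pd.DataFrame) -> bytes:
    # Cached: re-serialized only when df changes.
    return df.to_csv(index=False).encode("utf-8")

df = pd.DataFrame({"Keyword": ["a", "b"], "Topic ID": [0, 1]})
st.download_button("Download Results", data=to_csv_bytes(df),
                   file_name="keyword_analysis_results.csv", mime="text/csv")
```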
@@ -527,21 +559,18 @@
             st.error(f"Error reading file: {str(e)}")
 
     else:
+        # Initial message
         st.info("""
-        👋 Welcome to the NLP Keyword Analysis tool!
+        👋 Welcome to the NLP Keyword Analysis tool (with caching)!
 
-        Please upload a CSV file containing your keywords to get started.
-        The file should have a column named 'Keyword'.
+        1. Upload a CSV file with a column named **'Keyword'**.
+        2. Adjust parameters in the sidebar if needed.
+        3. Click **"Start Analysis"**.
+        4. Download the results.
 
-        You can configure:
-        - Number of rows to read from the CSV
-        - (Optionally) Exclude brand-labeled keywords
-        - UMAP parameters for dimensionality reduction
-        - HDBSCAN parameters for clustering
-        - Vectorizer parameters for text preprocessing
-        - Topic model parameters
-        - Llama 2 parameters for topic labeling
+        *Note: Caching helps avoid re-running expensive computations when the app reloads.*
         """)
 
+
 if __name__ == "__main__":
     main()