Spaces:

wu981526092
/

Optimal_Cluster_Analysis_with_PCA_Visualization

Sleeping

Optimal_Cluster_Analysis_with_PCA_Visualization

File size: 3,938 Bytes

import streamlit as st
import pandas as pd
import plotly.express as px
from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow
from pca import perform_pca, plot_pca, plot_pca_3D
from data_preprocessing import mainDataWrangling


@st.cache_data
def convert_df(df):
    """Serialize *df* to CSV text and return it as UTF-8 encoded bytes.

    Wrapped in ``st.cache_data`` so Streamlit can reuse the result across
    script reruns instead of re-serializing the same frame.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')

# Streamlit code
# Silence the st.pyplot global-use deprecation warning where that option still
# exists. Newer Streamlit releases removed the option and reject unknown keys
# with StreamlitAPIException, which would abort the app before anything is
# rendered -- so a rejected option is deliberately treated as a no-op.
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except st.errors.StreamlitAPIException:
    pass

st.title('📊 Holistic AI: Risk Mapping Data study: Optimal Cluster Analysis with PCA Visualization')
uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')

# Everything below runs only once the user has supplied a file.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)

    # Project-specific preprocessing (see data_preprocessing.mainDataWrangling),
    # followed by mapping the ordinal labels High/Medium/Low to 2/1/0.
    raw_data = mainDataWrangling(df)
    raw_data = raw_data.replace("High", 2).replace("Medium", 1).replace("Low", 0)
    project_names = raw_data["projectName"]
    # Columns kept for display/hover information only; they are removed from
    # the frame that is passed to the clustering helpers.
    information_columns = ["projectName", "Overall", "Financial", "Reputational", "Ethics", "Regulation", "Robustness",
                           "Efficacy", "Privacy", "Bias", "Explainability"]

    data = raw_data.drop(columns=information_columns)
    st.subheader('🔍 Data Preview')
    st.write(df)

    st.subheader('🔍 Preprocessed Data')
    st.write(raw_data)

    # Step 1: Plot Elbow Method and Silhouette Scores
    wcss = calculate_wcss(data)
    silhouette_scores = calculate_silhouette_scores(data)
    st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')

    # The x-axis starts at 2 clusters.
    # NOTE(review): assumes silhouette_scores[i] belongs to k = i + 2 -- confirm
    # against calculate_silhouette_scores.
    fig = px.line(x=list(range(2, len(silhouette_scores) + 2)), y=silhouette_scores,
                  labels={'x': 'Number of Clusters', 'y': 'Silhouette Scores'}, title='Silhouette Scores')
    st.plotly_chart(fig)

    # NOTE(review): likewise assumes wcss[i] belongs to k = i + 2 -- confirm
    # against calculate_wcss.
    fig = px.line(x=list(range(2, len(wcss) + 2)), y=wcss, labels={'x': 'Number of Clusters', 'y': 'WCSS'},
                  title='Elbow Method')
    st.plotly_chart(fig)

    st.markdown('''
       **Directions:**

       - Select the optimum number of clusters based on Silhouette Scores and ELBOW Graph.

       - For the Silhouette Scores, the optimal number of clusters corresponds to the peak of the plot.

       - For the Elbow graph, we can see that the graph will rapidly change at a point and thus creating an elbow shape.
       From this point, the graph moves almost parallel to the X-axis. The K value corresponding to this point is the optimal
       value of K or an optimal number of clusters.

       
       ''')


    optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
    st.write(f'Optimal number of clusters based on Silhouette Scores is: {optimal_clusters_silhouette}')

    # Despite its name, this holds the cluster count chosen by the user on the
    # slider; it merely defaults to the silhouette-based suggestion. The upper
    # bound matches the last x value of the elbow plot above.
    optimal_clusters_elbow = st.slider('Number of clusters (Default to optimal number from Silhouette Scores )', min_value=2, max_value=len(wcss) + 1,
                                       value=optimal_clusters_silhouette, step=1)

    # Step 2: KMeans fitting and PCA
    st.header('KMeans Clustering and PCA')
    st.write('Now we fit the KMeans algorithm with your chosen number of clusters, and perform PCA for visualization.')
    kmeans, clustered_data = fit_kmeans(data, optimal_clusters_elbow)

    # Add project names back to the data
    # NOTE(review): the column-wise concat aligns on index, so this assumes
    # clustered_data keeps raw_data's row index -- confirm in fit_kmeans.
    display_data = pd.concat([project_names, clustered_data], axis=1)

    st.subheader('📌 Clustered Data')
    # Relies on clustered_data carrying a "cluster" column.
    st.write(display_data[["projectName", "cluster"]])

    # Two-component projection for the 2D scatter plot.
    principalDf = perform_pca(clustered_data, 2)

    st.subheader('📊 2D PCA Plot')
    fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
    st.plotly_chart(fig2D)

    # Three-component projection for the 3D scatter plot.
    principalDf_3D = perform_pca(clustered_data, 3)
    st.subheader('📊 3D PCA Plot')
    fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
    st.plotly_chart(fig3D)

    # Offer the project names plus clustering output as a downloadable CSV.
    st.subheader('📩 Data Download')
    csv = convert_df(display_data)
    st.download_button(
        label="Download clustered data as CSV",
        data=csv,
        file_name='clustered_data.csv',
        mime='text/csv',
    )