|
import streamlit as st |
|
import pandas as pd |
|
import plotly.express as px |
|
from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow |
|
from pca import perform_pca, plot_pca, plot_pca_3D |
|
from data_preprocessing import mainDataWrangling |
|
|
|
|
|
@st.cache_data
def convert_df(df):
    """Serialize *df* to CSV text and return it as UTF-8 encoded bytes.

    The function is wrapped in ``st.cache_data``. ``DataFrame.to_csv`` is
    called with its defaults, so the index is written as the first column.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
|
|
|
|
|
# Directions rendered below the two diagnostic charts. Defined flush-left at
# module level so the Markdown list items carry no leading indentation.
_DIRECTIONS_MD = '''
**Directions:**

- Select the optimum number of clusters based on Silhouette Scores and ELBOW Graph.

- For the Silhouette Scores, the optimal number of clusters corresponds to the peak of the plot.

- For the Elbow graph, we can see that the graph will rapidly change at a point and thus creating an elbow shape.
From this point, the graph moves almost parallel to the X-axis. The K value corresponding to this point is the optimal
value of K or an optimal number of clusters.


'''


def _cluster_axis(scores):
    """Return the x-axis values ``2, 3, ...`` with one entry per score.

    NOTE(review): assumes the score lists start at k=2 and hold one value per
    consecutive k -- confirm against ``kmeans.calculate_wcss`` and
    ``kmeans.calculate_silhouette_scores``.
    """
    return list(range(2, len(scores) + 2))


# The visible script never calls ``st.pyplot``; this option only silences a
# legacy warning. Newer Streamlit releases no longer recognise the key and
# raise StreamlitAPIException for it, which would abort the app on start-up,
# so that specific error is deliberately tolerated.
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except st.errors.StreamlitAPIException:
    pass

# NOTE(review): the leading "π" glyphs in the UI strings below look like emoji
# whose bytes were mis-decoded. They are reproduced unchanged here; restore
# the intended emoji from the original file if it is available.
st.title('π Holistic AI: Risk Mapping Data study: Optimal Cluster Analysis with PCA Visualization')
uploaded_file = st.file_uploader("π€ Upload a CSV file", type='csv')

# Everything below runs only once the user has provided a file.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)

    # mainDataWrangling comes from data_preprocessing; afterwards the textual
    # risk levels are mapped to the integers 2 / 1 / 0.
    raw_data = mainDataWrangling(df)
    raw_data = raw_data.replace("High", 2).replace("Medium", 1).replace("Low", 0)
    project_names = raw_data["projectName"]
    information_columns = [
        "projectName", "Overall", "Financial", "Reputational", "Ethics",
        "Regulation", "Robustness", "Efficacy", "Privacy", "Bias",
        "Explainability",
    ]

    # The informational columns are excluded from the clustering input.
    data = raw_data.drop(columns=information_columns)

    st.subheader('π Data Preview')
    st.write(df)

    st.subheader('π Preprocessed Data')
    st.write(raw_data)

    # Diagnostics used to pick the number of clusters.
    wcss = calculate_wcss(data)
    silhouette_scores = calculate_silhouette_scores(data)
    st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')

    fig = px.line(
        x=_cluster_axis(silhouette_scores),
        y=silhouette_scores,
        labels={'x': 'Number of Clusters', 'y': 'Silhouette Scores'},
        title='Silhouette Scores',
    )
    st.plotly_chart(fig)

    fig = px.line(
        x=_cluster_axis(wcss),
        y=wcss,
        labels={'x': 'Number of Clusters', 'y': 'WCSS'},
        title='Elbow Method',
    )
    st.plotly_chart(fig)

    st.markdown(_DIRECTIONS_MD)

    optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
    st.write(f'Optimal number of clusters based on Silhouette Scores is: {optimal_clusters_silhouette}')

    # The slider defaults to the silhouette suggestion; the user may override
    # it after inspecting the elbow chart.
    optimal_clusters_elbow = st.slider(
        'Number of clusters (Default to optimal number from Silhouette Scores )',
        min_value=2,
        max_value=len(wcss) + 1,
        value=optimal_clusters_silhouette,
        step=1,
    )

    st.header('KMeans Clustering and PCA')
    st.write('Now we fit the KMeans algorithm with your chosen number of clusters, and perform PCA for visualization.')
    kmeans, clustered_data = fit_kmeans(data, optimal_clusters_elbow)

    # Put the project names back next to the clustering output for display
    # and download.
    display_data = pd.concat([project_names, clustered_data], axis=1)

    st.subheader('π Clustered Data')
    st.write(display_data[["projectName", "cluster"]])

    principalDf = perform_pca(clustered_data, 2)

    st.subheader('π 2D PCA Plot')
    fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
    st.plotly_chart(fig2D)

    principalDf_3D = perform_pca(clustered_data, 3)
    st.subheader('π 3D PCA Plot')
    fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
    st.plotly_chart(fig3D)

    st.subheader('π© Data Download')
    csv = convert_df(display_data)
    st.download_button(
        label="Download clustered data as CSV",
        data=csv,
        file_name='clustered_data.csv',
        mime='text/csv',
    )