Spaces:

wu981526092
/

Optimal_Cluster_Analysis_with_PCA_Visualization

Sleeping

App Files Files Community

Optimal_Cluster_Analysis_with_PCA_Visualization / app.py

wu981526092

update

4b1f0bc over 1 year ago

raw

history blame contribute delete

3.94 kB

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow
	from pca import perform_pca, plot_pca, plot_pca_3D
	from data_preprocessing import mainDataWrangling


	@st.cache_data
	def convert_df(df):
	    """Serialize ``df`` to CSV text and return it encoded as UTF-8 bytes.

	    ``to_csv`` is called with its defaults. The function is wrapped in
	    ``st.cache_data``.
	    """
	    csv_text = df.to_csv()
	    return csv_text.encode('utf-8')

	# Streamlit code
	# Newer Streamlit releases no longer define this pyplot deprecation flag, and
	# setting an unknown option raises StreamlitAPIException ("Unrecognized config
	# option"). Keep silencing the warning where the option is still recognized
	# and carry on where it is not, so the page still loads.
	try:
	    st.set_option('deprecation.showPyplotGlobalUse', False)
	except st.errors.StreamlitAPIException:
	    pass

	st.title('📊 Holistic AI: Risk Mapping Data study: Optimal Cluster Analysis with PCA Visualization')
	# Only CSV uploads are accepted; the value is None until a file is provided.
	uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')

	if uploaded_file is not None:
	    # An empty upload makes pandas raise EmptyDataError; report that on the
	    # page and end this script run rather than showing a traceback.
	    try:
	        df = pd.read_csv(uploaded_file)
	    except pd.errors.EmptyDataError:
	        st.error('The uploaded CSV file is empty. Please upload a file that contains data.')
	        st.stop()

	    raw_data = mainDataWrangling(df)
	    # Map the three risk labels to integers with a single replace call.
	    raw_data = raw_data.replace({"High": 2, "Medium": 1, "Low": 0})
	    project_names = raw_data["projectName"]
	    # Descriptive columns: removed from the clustering input below and also
	    # passed to the PCA plotting helpers.
	    information_columns = ["projectName", "Overall", "Financial", "Reputational", "Ethics", "Regulation",
	                           "Robustness", "Efficacy", "Privacy", "Bias", "Explainability"]

	    data = raw_data.drop(columns=information_columns)
	    st.subheader('🔍 Data Preview')
	    st.write(df)

	    st.subheader('🔍 Preprocessed Data')
	    st.write(raw_data)

	    # Step 1: Plot Elbow Method and Silhouette Scores
	    wcss = calculate_wcss(data)
	    silhouette_scores = calculate_silhouette_scores(data)
	    st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')

	    # The x axis starts at 2 for both charts.
	    # NOTE(review): this assumes the first entry of each score list belongs to
	    # k=2 -- confirm against the kmeans helpers.
	    fig = px.line(
	        x=list(range(2, len(silhouette_scores) + 2)),
	        y=silhouette_scores,
	        labels={'x': 'Number of Clusters', 'y': 'Silhouette Scores'},
	        title='Silhouette Scores',
	    )
	    st.plotly_chart(fig)

	    fig = px.line(
	        x=list(range(2, len(wcss) + 2)),
	        y=wcss,
	        labels={'x': 'Number of Clusters', 'y': 'WCSS'},
	        title='Elbow Method',
	    )
	    st.plotly_chart(fig)

	    # The lines of this literal are kept exactly as they were so the text
	    # passed to st.markdown is unchanged.
	    st.markdown('''
	Directions:

	- Select the optimum number of clusters based on Silhouette Scores and ELBOW Graph.

	- For the Silhouette Scores, the optimal number of clusters corresponds to the peak of the plot.

	- For the Elbow graph, we can see that the graph will rapidly change at a point and thus creating an elbow shape.
	From this point, the graph moves almost parallel to the X-axis. The K value corresponding to this point is the optimal
	value of K or an optimal number of clusters.


	''')

	    optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
	    st.write(f'Optimal number of clusters based on Silhouette Scores is: {optimal_clusters_silhouette}')

	    # The slider is pre-set to the silhouette-based suggestion; the user may
	    # override it after looking at the elbow chart.
	    optimal_clusters_elbow = st.slider(
	        'Number of clusters (Default to optimal number from Silhouette Scores )',
	        min_value=2,
	        max_value=len(wcss) + 1,
	        value=optimal_clusters_silhouette,
	        step=1,
	    )

	    # Step 2: KMeans fitting and PCA
	    st.header('KMeans Clustering and PCA')
	    st.write('Now we fit the KMeans algorithm with your chosen number of clusters, and perform PCA for visualization.')
	    # The first value returned by fit_kmeans is not used in this script.
	    _, clustered_data = fit_kmeans(data, optimal_clusters_elbow)

	    # Add project names back to the data
	    # NOTE(review): concat with axis=1 aligns rows on the index, so this
	    # assumes fit_kmeans keeps raw_data's index -- confirm.
	    display_data = pd.concat([project_names, clustered_data], axis=1)

	    st.subheader('📌 Clustered Data')
	    # "cluster" is read from the frame returned by fit_kmeans.
	    st.write(display_data[["projectName", "cluster"]])

	    principalDf = perform_pca(clustered_data, 2)

	    st.subheader('📊 2D PCA Plot')
	    fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
	    st.plotly_chart(fig2D)

	    principalDf_3D = perform_pca(clustered_data, 3)
	    st.subheader('📊 3D PCA Plot')
	    fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
	    st.plotly_chart(fig3D)

	    st.subheader('📩 Data Download')
	    csv = convert_df(display_data)
	    st.download_button(
	        label="Download clustered data as CSV",
	        data=csv,
	        file_name='clustered_data.csv',
	        mime='text/csv',
	    )