"""Streamlit app: K-Means segmentation of the Mall Customers dataset.

The script loads ``datasets/Mall_Customers.csv``, clusters the customers with
K-Means on standardized features, and renders three tabs: model diagnostics,
a dataset browser, and an interactive predictor that assigns a cluster to
user-supplied values.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

DATA_PATH = "datasets/Mall_Customers.csv"
INCOME_COL = "Annual Income (k$)"
SPENDING_COL = "Spending Score (1-100)"
# Column order the scaler is fitted on; prediction input must use the same order.
# NOTE(review): assumes the CSV columns, after dropping CustomerID, are exactly
# these four in this order -- confirm against the dataset.
FEATURE_COLUMNS = ["Genre", "Age", INCOME_COL, SPENDING_COL]


@st.cache_data()
def load_data():
    """Read the raw customers CSV; Streamlit caches the result across reruns."""
    return pd.read_csv(DATA_PATH)


def _make_kmeans(n_clusters):
    """Build a KMeans estimator with the fixed settings used throughout the app."""
    return KMeans(n_clusters=n_clusters, init='k-means++', random_state=1, n_init=10)


@st.cache_resource()
def train_model(k_values, k_optimal):
    """Preprocess the data, compute the elbow curve and fit the final model.

    Cached so that widget interactions (which re-run the whole script) do not
    refit the scaler and every K-Means model each time.

    Args:
        k_values: iterable of cluster counts to evaluate for the elbow curve.
        k_optimal: cluster count used for the final model.

    Returns:
        Tuple ``(df, le, scaler, df_scaled, wcss, kmeans, sil_score)`` where
        ``df`` is the encoded dataset with an added ``Cluster`` column,
        ``le`` / ``scaler`` / ``kmeans`` are the fitted estimators,
        ``df_scaled`` is the standardized feature matrix, ``wcss`` holds one
        inertia value per entry of ``k_values``, and ``sil_score`` is the
        silhouette score of the final clustering.

    The returned objects are shared between reruns; callers must not mutate
    them.
    """
    data = load_data().drop(columns=["CustomerID"])  # identifier, not a feature

    # LabelEncoder assigns codes in sorted label order, so for the labels
    # "Female"/"Male" the mapping is Female=0, Male=1.
    encoder = LabelEncoder()
    data["Genre"] = encoder.fit_transform(data["Genre"])

    feature_scaler = StandardScaler()
    scaled = feature_scaler.fit_transform(data)

    # Within-cluster sum of squares for each candidate K (elbow method).
    inertias = [_make_kmeans(k).fit(scaled).inertia_ for k in k_values]

    model = _make_kmeans(k_optimal)
    model.fit(scaled)
    # Added only after scaling/fitting so the label never becomes a feature.
    data["Cluster"] = model.labels_
    score = silhouette_score(scaled, model.labels_)
    return data, encoder, feature_scaler, scaled, inertias, model, score


def _render(fig):
    """Show *fig* in the app, then close it.

    Figures created through pyplot stay registered until closed; since the
    script re-runs on every interaction, leaving them open accumulates
    figures in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


def _cluster_scatter(ax, data, **scatter_kwargs):
    """Draw income vs. spending on *ax*, colored by cluster."""
    sns.scatterplot(
        data=data,
        x=INCOME_COL,
        y=SPENDING_COL,
        hue="Cluster",
        palette='viridis',
        ax=ax,
        **scatter_kwargs,
    )
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score')


def corr_matrix(data, title):
    """Render an annotated correlation heatmap of the numeric columns of *data*."""
    numeric = data.select_dtypes(include=["number"])
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
    ax.set_title(title)
    _render(fig)


def _column_slider(label, values):
    """Integer slider spanning the min..max of *values*, defaulting to the median."""
    return st.slider(label, int(values.min()), int(values.max()), int(values.median()))


# Candidate cluster counts for the elbow curve.
k_values = range(1, 11)
# Choose optimal K (assumed 5 based on elbow curve)
k_optimal = 5

df, le, scaler, df_scaled, wcss, kmeans, sil_score = train_model(k_values, k_optimal)

# Streamlit App
st.title("Clustering: Mall Customers Segmentation")
st.caption("Dataset: Mall_Customers.csv")

tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Customer Segment Predictor"])

with tab1:
    st.header("Model Performance")
    st.write(f"**Silhouette Score:** {sil_score:.4f}")

    fig, ax = plt.subplots()
    ax.plot(k_values, wcss, marker='o', linestyle='--')
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
    ax.set_title('Elbow Method for Optimal K')
    _render(fig)

    st.subheader("Customer Segments Visualization")
    fig, ax = plt.subplots()
    _cluster_scatter(ax, df)
    ax.set_title('Customer Segments')
    _render(fig)
    st.divider()

with tab2:
    st.header("Dataset")
    corr_matrix(df, "Correlation Matrix")

    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
    if view_type == "Top -> Bottom":
        st.dataframe(df)
    else:
        st.dataframe(df.iloc[::-1])
    st.divider()

with tab3:
    st.header("Customer Segment Predictor")

    income = _column_slider("Annual Income (k$)", df[INCOME_COL])
    spending = _column_slider("Spending Score (1-100)", df[SPENDING_COL])
    age = _column_slider("Age", df['Age'])
    gender = st.radio("Gender", ["Male", "Female"])

    # Encode gender with the fitted encoder and keep the column names/order
    # the scaler was fitted with.
    input_data = pd.DataFrame(
        [[le.transform([gender])[0], age, income, spending]],
        columns=FEATURE_COLUMNS,
    )
    input_scaled = scaler.transform(input_data)
    predicted_cluster = kmeans.predict(input_scaled)[0]

    st.subheader("Predicted Customer Segment")
    # NOTE(review): the literal spanned several physical lines in the source,
    # which a single-quoted f-string cannot do; escaped newlines keep the
    # same text.
    st.markdown(f"\n\nCluster {predicted_cluster}\n\n", unsafe_allow_html=True)

    # Graph to visualize input placement
    fig, ax = plt.subplots()
    _cluster_scatter(ax, df, alpha=0.6)
    ax.scatter(income, spending, color='red', label='Your Input', edgecolors='black', s=100)
    ax.set_title('Customer Segments with Your Input')
    ax.legend()
    _render(fig)
    st.divider()