Spaces:

allantacuelwvsu
/

k-means_clustering

Running

App Files Files Community

allantacuelwvsu committed 11 days ago

Commit

5b3cd15

1 Parent(s): 322abb6

update app.py

Browse files

Files changed (1) hide show

app.py +66 -41

app.py CHANGED Viewed

@@ -3,83 +3,108 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.metrics import silhouette_score
 # Load dataset
-df = pd.read_csv("datasets/Mall_Customers.csv")
-df.drop(columns=['CustomerID'], inplace=True)
-# Encode Gender
-le = LabelEncoder()
-df['Genre'] = le.fit_transform(df['Genre'])
-# Standardize numerical columns
 scaler = StandardScaler()
 df_scaled = scaler.fit_transform(df)
-# Optimal K estimation
 k_optimal = 5
-kmeans = KMeans(n_clusters=k_optimal, init='k-means++', random_state=42)
 kmeans.fit(df_scaled)
 df['Cluster'] = kmeans.labels_
 sil_score = silhouette_score(df_scaled, kmeans.labels_)
 # Streamlit App
-st.title("Customer Segmentation using K-Means")
-st.caption("Dataset: Mall Customers")
-tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Customer Predictor"])
 with tab1:
     st.header("Model Performance")
     st.write(f"**Silhouette Score:** {sil_score:.4f}")
-    wcss = []
-    k_values = range(1, 11)
-    for k in k_values:
-        kmeans_temp = KMeans(n_clusters=k, init='k-means++', random_state=42)
-        kmeans_temp.fit(df_scaled)
-        wcss.append(kmeans_temp.inertia_)
     fig, ax = plt.subplots()
-    ax.plot(k_values, wcss, marker='o', linestyle='--')
-    ax.set_xlabel('Number of Clusters (K)')
-    ax.set_ylabel('WCSS')
-    ax.set_title('Elbow Method for Optimal K')
     st.pyplot(fig)
 with tab2:
     st.header("Dataset")
-    st.dataframe(df.head())
-    fig, ax = plt.subplots(figsize=(8, 6))
-    sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
-    ax.set_title("Correlation Matrix")
-    st.pyplot(fig)
 with tab3:
-    st.header("Customer Segment Prediction")
-    annual_income = st.slider("Annual Income (k$)", min_value=int(df['Annual Income (k$)'].min()), max_value=int(df['Annual Income (k$)'].max()), value=50)
-    spending_score = st.slider("Spending Score (1-100)", min_value=int(df['Spending Score (1-100)'].min()), max_value=int(df['Spending Score (1-100)'].max()), value=50)
-    gender = st.radio("Gender", ["Male", "Female"], index=0)
-    gender_encoded = 1 if gender == "Female" else 0
-    input_data = np.array([[gender_encoded, annual_income, spending_score]])
-    input_scaled = scaler.transform(np.hstack((input_data, np.zeros((input_data.shape[0], 1)))))  # Ensure same feature shape
-    cluster_prediction = kmeans.predict(input_scaled)[0]
     st.subheader("Predicted Customer Segment")
-    st.markdown(f"<h1 style='color:blue;'>{cluster_prediction}</h1>", unsafe_allow_html=True)
     fig, ax = plt.subplots()
     sns.scatterplot(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'], hue=df['Cluster'], palette='viridis', alpha=0.6)
-    ax.scatter(annual_income, spending_score, color='red', s=100, edgecolors='black', label='Input Customer')
-    ax.set_xlabel('Annual Income (k$)')
-    ax.set_ylabel('Spending Score')
-    ax.legend()
     st.pyplot(fig)
     st.divider()

 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
 # Load dataset
+@st.cache_data()
+def load_data():  # Read the Mall Customers CSV; the st.cache_data decorator caches the returned frame
+    df = pd.read_csv("datasets/Mall_Customers.csv")
+    return df
+df = load_data()
+df.drop(columns=["CustomerID"], inplace=True)  # Drop the ID column so it is not scaled and clustered as a feature
+le = LabelEncoder()
+df["Genre"] = le.fit_transform(df["Genre"])  # Encode gender; LabelEncoder sorts its classes, so Female=0, Male=1
 scaler = StandardScaler()
 df_scaled = scaler.fit_transform(df)
+# Collect WCSS (KMeans inertia) for K = 1..10; these values are what the elbow plot draws
+wcss = []
+k_values = range(1, 11)
+for k in k_values:
+    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=1, n_init=10)
+    kmeans.fit(df_scaled)
+    wcss.append(kmeans.inertia_)
+# K is hard-coded to 5 (picked by reading the elbow curve; it is not computed from wcss)
 k_optimal = 5
+kmeans = KMeans(n_clusters=k_optimal, init='k-means++', random_state=1, n_init=10)
 kmeans.fit(df_scaled)
 df['Cluster'] = kmeans.labels_
 sil_score = silhouette_score(df_scaled, kmeans.labels_)
 # Streamlit App
+st.title("Clustering: Mall Customers Segmentation")
+st.caption("Dataset: Mall_Customers.csv")
+tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Customer Segment Predictor"])
 with tab1:
     st.header("Model Performance")
     st.write(f"**Silhouette Score:** {sil_score:.4f}")
     fig, ax = plt.subplots()
+    plt.plot(k_values, wcss, marker='o', linestyle='--')
+    plt.xlabel('Number of Clusters (K)')
+    plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
+    plt.title('Elbow Method for Optimal K')
     st.pyplot(fig)
+    st.subheader("Customer Segments Visualization")
+    fig, ax = plt.subplots()
+    sns.scatterplot(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'], hue=df['Cluster'], palette='viridis')
+    plt.xlabel('Annual Income (k$)')
+    plt.ylabel('Spending Score')
+    plt.title('Customer Segments')
+    st.pyplot(fig)
+    st.divider()
 with tab2:
     st.header("Dataset")
+    def corr_matrix(data, title):  # Render an annotated correlation heatmap of `data`, titled `title`, into the app
+        data = data.select_dtypes(include=["number"])  # keep numeric columns only before computing correlations
+        fig, ax = plt.subplots(figsize=(8, 6))
+        sns.heatmap(data.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
+        ax.set_title(title)
+        st.pyplot(fig)
+    corr_matrix(df, "Correlation Matrix")
+    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
+    if view_type == "Top -> Bottom":
+        st.dataframe(df.head(len(df)))
+    else:
+        st.dataframe(df.tail(len(df)).iloc[::-1])
+    st.divider()
 with tab3:
+    st.header("Customer Segment Predictor")
+    income = st.slider("Annual Income (k$)", int(df['Annual Income (k$)'].min()), int(df['Annual Income (k$)'].max()), int(df['Annual Income (k$)'].median()))
+    spending = st.slider("Spending Score (1-100)", int(df['Spending Score (1-100)'].min()), int(df['Spending Score (1-100)'].max()), int(df['Spending Score (1-100)'].median()))
+    age = st.slider("Age", int(df['Age'].min()), int(df['Age'].max()), int(df['Age'].median()))
+    gender = st.radio("Gender", ["Male", "Female"])
+    input_data = pd.DataFrame([[gender, age, income, spending]], columns=["Genre", "Age", "Annual Income (k$)", "Spending Score (1-100)"])  # NOTE(review): column order presumably mirrors the frame the scaler was fitted on; confirm against the CSV
+    input_data["Genre"] = le.transform([gender])[0]  # Replace the gender label with the fitted encoder's integer code
+    input_scaled = scaler.transform(input_data)
+    predicted_cluster = kmeans.predict(input_scaled)[0]
     st.subheader("Predicted Customer Segment")
+    st.markdown(f"<h1 style='color:green;'>Cluster {predicted_cluster}</h1>", unsafe_allow_html=True)
+    # Graph to visualize input placement
     fig, ax = plt.subplots()
     sns.scatterplot(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'], hue=df['Cluster'], palette='viridis', alpha=0.6)
+    plt.scatter(income, spending, color='red', label='Your Input', edgecolors='black', s=100)
+    plt.xlabel('Annual Income (k$)')
+    plt.ylabel('Spending Score')
+    plt.title('Customer Segments with Your Input')
+    plt.legend()
     st.pyplot(fig)
     st.divider()