|
import streamlit as st |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.cluster import KMeans |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
|
|
@st.cache_data
def load_data(file_path: str = "Mall_Customers.csv") -> pd.DataFrame:
    """Read the customer dataset from a CSV file.

    The function is wrapped in ``st.cache_data``, so Streamlit reuses the
    parsed result across script reruns instead of re-reading the file.

    Args:
        file_path: Location of the CSV file. Defaults to
            ``"Mall_Customers.csv"``; a relative path is resolved against
            the current working directory.

    Returns:
        The file contents parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(file_path)
|
|
|
# Fetch the customer dataset; load_data is decorated with st.cache_data.
df = load_data()

# Sidebar navigation: the selected label decides which page is rendered below.
_PAGES = ["Introduction", "Data Exploration", "K-Means Model", "Test Model"]
st.sidebar.title("K-Means Clustering App")
section = st.sidebar.radio(label="Go to", options=_PAGES)
|
|
|
# Columns the clustering model is trained on. The "Test Model" page reuses
# this list so prediction input carries the same names, in the same order.
_FEATURE_COLUMNS = ["Annual Income (k$)", "Spending Score (1-100)"]


def _show_figure(fig):
    """Render a Matplotlib figure on the page, then close it.

    Passing the figure explicitly avoids the deprecated no-argument
    ``st.pyplot()`` call, which depends on Matplotlib's global figure.
    Closing afterwards matters because pyplot keeps every figure it creates
    registered until it is closed, so figures would otherwise pile up on
    each script rerun.
    """
    st.pyplot(fig)
    plt.close(fig)


if section == "Introduction":
    st.title("Introduction to K-Means Clustering")

    st.write("### About the Model")
    st.write("K-Means Clustering is an unsupervised machine learning algorithm used for customer segmentation. It helps identify different groups of customers based on their spending behavior and income.")

    st.write("### About the Dataset")
    st.write("The dataset consists of customer information, including:")
    st.markdown("- **CustomerID**: Unique identifier for each customer.")
    st.markdown("- **Gender**: Male or Female.")
    st.markdown("- **Age**: Age of the customer.")
    st.markdown("- **Annual Income (k$)**: Customer's yearly income.")
    st.markdown("- **Spending Score (1-100)**: A score assigned based on spending behavior.")

    st.write("### How to Use the App")
    st.markdown("1. **Go to 'Data Exploration'**: Understand the dataset using statistics and visualizations.")
    st.markdown("2. **Go to 'K-Means Model'**: Train the model and visualize clusters.")
    st.markdown("3. **Go to 'Test Model'**: Input values to predict customer cluster.")

    st.write("### Insights")
    st.markdown("- Customers can be grouped into different segments based on their income and spending habits.")
    st.markdown("- The Elbow Method helps determine the optimal number of clusters.")
    st.markdown("- Businesses can use these insights to tailor marketing strategies and improve customer engagement.")

elif section == "Data Exploration":
    st.title("Data Exploration")

    st.write("### First 5 rows of dataset")
    st.dataframe(df.head())

    st.write("### Summary Statistics")
    st.write(df.describe())

    # Both charts below use the same columns; drop the identifier and the
    # categorical column once instead of once per chart.
    numeric_df = df.drop(columns=["CustomerID", "Gender"])

    st.write("### Pairplot")
    # pairplot builds its own figure, exposed on the returned grid.
    pair_grid = sns.pairplot(numeric_df, diag_kind="kde")
    _show_figure(pair_grid.figure)

    st.write("### Correlation Heatmap")
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", ax=heatmap_ax)
    _show_figure(heatmap_fig)

elif section == "K-Means Model":
    st.title("K-Means Clustering")

    # Standardize the two features before clustering.
    features = df[_FEATURE_COLUMNS]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    st.write("### Elbow Method")
    # Fit one model per candidate cluster count and record its inertia.
    candidate_counts = range(1, 11)
    inertia = [
        KMeans(n_clusters=count, random_state=42, n_init=10)
        .fit(scaled_features)
        .inertia_
        for count in candidate_counts
    ]

    elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
    elbow_ax.plot(candidate_counts, inertia, marker='o')
    elbow_ax.set_xlabel('Number of Clusters')
    elbow_ax.set_ylabel('Inertia')
    elbow_ax.set_title('Elbow Method for Optimal k')
    _show_figure(elbow_fig)

    # Train the final model with the user-selected number of clusters.
    k = st.slider("Select Number of Clusters", 2, 10, 5)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    df['Cluster'] = kmeans.fit_predict(scaled_features)

    st.write("### Clustered Data")
    st.dataframe(df)

    cluster_fig, cluster_ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        x=df["Annual Income (k$)"],
        y=df["Spending Score (1-100)"],
        hue=df['Cluster'],
        palette='viridis',
        ax=cluster_ax,
    )
    cluster_ax.set_xlabel("Annual Income (k$)")
    cluster_ax.set_ylabel("Spending Score (1-100)")
    cluster_ax.set_title("Customer Segmentation using K-Means")
    _show_figure(cluster_fig)

    # Keep the fitted objects so the "Test Model" page can reuse them.
    st.session_state['scaler'] = scaler
    st.session_state['kmeans'] = kmeans

elif section == "Test Model":
    st.title("Test K-Means Model")

    income = st.number_input("Enter Annual Income (k$)", min_value=0, max_value=200, value=50)
    score = st.number_input("Enter Spending Score (1-100)", min_value=1, max_value=100, value=50)

    if 'scaler' in st.session_state and 'kmeans' in st.session_state:
        # The scaler was fitted on a DataFrame, so give it a DataFrame with
        # the same column names; a bare nested list makes scikit-learn warn
        # about missing feature names.
        input_frame = pd.DataFrame([[income, score]], columns=_FEATURE_COLUMNS)
        input_data = st.session_state['scaler'].transform(input_frame)
        prediction = st.session_state['kmeans'].predict(input_data)
        st.write(f"### Predicted Cluster: {prediction[0]}")
    else:
        # The fitted scaler and model are only stored by the K-Means page.
        st.write("### Please run the K-Means Model section first.")
|
|