File size: 4,606 Bytes
a487e3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load dataset
@st.cache_data
def load_data():
    """Read the mall-customers CSV and hand it back as a DataFrame.

    The file name is relative, so it is resolved against the process's
    current working directory. The result is memoised by ``st.cache_data``.

    Returns:
        pandas.DataFrame: the parsed contents of ``Mall_Customers.csv``.
    """
    return pd.read_csv("Mall_Customers.csv")

# Fetch the customer table (served through the cached loader above).
df = load_data()

# Sidebar navigation: the label picked here selects which page is rendered
# by the if/elif chain further down.
st.sidebar.title("K-Means Clustering App")
section = st.sidebar.radio(
    "Go to",
    ["Introduction", "Data Exploration", "K-Means Model", "Test Model"],
)

# Columns used to fit the scaler / K-Means model. The "Test Model" page builds
# its prediction input from the same list so names and order stay in sync.
FEATURE_COLUMNS = ["Annual Income (k$)", "Spending Score (1-100)"]

if section == "Introduction":
    # Static, text-only landing page.
    st.title("Introduction to K-Means Clustering")

    st.write("### About the Model")
    st.write("K-Means Clustering is an unsupervised machine learning algorithm used for customer segmentation. It helps identify different groups of customers based on their spending behavior and income.")

    st.write("### About the Dataset")
    st.write("The dataset consists of customer information, including:")
    st.markdown("- **CustomerID**: Unique identifier for each customer.")
    st.markdown("- **Gender**: Male or Female.")
    st.markdown("- **Age**: Age of the customer.")
    st.markdown("- **Annual Income (k$)**: Customer's yearly income.")
    st.markdown("- **Spending Score (1-100)**: A score assigned based on spending behavior.")

    st.write("### How to Use the App")
    st.markdown("1. **Go to 'Data Exploration'**: Understand the dataset using statistics and visualizations.")
    st.markdown("2. **Go to 'K-Means Model'**: Train the model and visualize clusters.")
    st.markdown("3. **Go to 'Test Model'**: Input values to predict customer cluster.")

    st.write("### Insights")
    st.markdown("- Customers can be grouped into different segments based on their income and spending habits.")
    st.markdown("- The Elbow Method helps determine the optimal number of clusters.")
    st.markdown("- Businesses can use these insights to tailor marketing strategies and improve customer engagement.")

elif section == "Data Exploration":
    st.title("Data Exploration")
    st.write("### First 5 rows of dataset")
    st.dataframe(df.head())

    st.write("### Summary Statistics")
    st.write(df.describe())

    # The identifier and the categorical column are excluded from the plots.
    numeric_df = df.drop(columns=["CustomerID", "Gender"])

    st.write("### Pairplot")
    pair_grid = sns.pairplot(numeric_df, diag_kind="kde")
    # Pass the figure explicitly: calling st.pyplot() with no argument relies
    # on the global pyplot figure, which Streamlit has deprecated.
    st.pyplot(pair_grid.figure)
    # Close after rendering so figures do not pile up across script reruns.
    plt.close(pair_grid.figure)

    st.write("### Correlation Heatmap")
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", ax=heatmap_ax)
    st.pyplot(heatmap_fig)
    plt.close(heatmap_fig)

elif section == "K-Means Model":
    st.title("K-Means Clustering")

    # Selecting features for clustering
    features = df[FEATURE_COLUMNS]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Finding the optimal number of clusters using Elbow Method
    st.write("### Elbow Method")
    candidate_ks = range(1, 11)
    inertia = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(scaled_features)
        inertia.append(kmeans.inertia_)

    elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
    elbow_ax.plot(candidate_ks, inertia, marker='o')
    elbow_ax.set_xlabel('Number of Clusters')
    elbow_ax.set_ylabel('Inertia')
    elbow_ax.set_title('Elbow Method for Optimal k')
    st.pyplot(elbow_fig)
    plt.close(elbow_fig)

    # Train K-Means Model with the user-selected number of clusters
    k = st.slider("Select Number of Clusters", 2, 10, 5)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    df['Cluster'] = kmeans.fit_predict(scaled_features)

    st.write("### Clustered Data")
    st.dataframe(df)

    # Visualization of clusters
    cluster_fig, cluster_ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        x=df["Annual Income (k$)"],
        y=df["Spending Score (1-100)"],
        hue=df['Cluster'],
        palette='viridis',
        ax=cluster_ax,
    )
    cluster_ax.set_xlabel("Annual Income (k$)")
    cluster_ax.set_ylabel("Spending Score (1-100)")
    cluster_ax.set_title("Customer Segmentation using K-Means")
    st.pyplot(cluster_fig)
    plt.close(cluster_fig)

    # Keep the fitted scaler and model in session state so the "Test Model"
    # page can reuse them on a later rerun.
    st.session_state['scaler'] = scaler
    st.session_state['kmeans'] = kmeans

elif section == "Test Model":
    st.title("Test K-Means Model")

    income = st.number_input("Enter Annual Income (k$)", min_value=0, max_value=200, value=50)
    score = st.number_input("Enter Spending Score (1-100)", min_value=1, max_value=100, value=50)

    if 'scaler' in st.session_state and 'kmeans' in st.session_state:
        # The scaler was fitted on a DataFrame with named columns; give it a
        # frame with the same column names instead of a bare nested list so
        # scikit-learn does not warn about missing feature names.
        input_frame = pd.DataFrame([[income, score]], columns=FEATURE_COLUMNS)
        input_data = st.session_state['scaler'].transform(input_frame)
        prediction = st.session_state['kmeans'].predict(input_data)
        st.write(f"### Predicted Cluster: {prediction[0]}")
    else:
        # Nothing has been trained in this session yet.
        st.write("### Please run the K-Means Model section first.")