"""Streamlit app: K-Means customer segmentation on the Mall Customers dataset.

The sidebar selects one of four pages: an introduction, exploratory plots,
model training (elbow plot + clustering), and a form that predicts the cluster
for a single income/spending-score pair using the most recently trained model.
"""

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

DATA_FILE = "Mall_Customers.csv"
# The two columns the clustering model is trained on, in training order.
FEATURE_COLUMNS = ["Annual Income (k$)", "Spending Score (1-100)"]
# Columns left out of the exploratory pair plot and correlation heatmap.
EXCLUDED_PLOT_COLUMNS = ["CustomerID", "Gender"]
# Candidate cluster counts evaluated for the elbow plot.
ELBOW_K_VALUES = range(1, 11)


# Load dataset
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the customer CSV; cached so reruns do not hit the disk again."""
    file_path = DATA_FILE
    df = pd.read_csv(file_path)
    return df


@st.cache_data
def _elbow_inertia(scaled_features) -> list:
    """Return the K-Means inertia for every k in ``ELBOW_K_VALUES``.

    Cached because the result depends only on the scaled features, while the
    script reruns on every widget interaction (e.g. each slider move).
    """
    return [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_features).inertia_
        for k in ELBOW_K_VALUES
    ]


def _show_figure(fig) -> None:
    """Render *fig* in the app, then close it.

    Passing the figure explicitly avoids the deprecated global-figure form of
    ``st.pyplot()``; closing it keeps pyplot's figure registry from growing on
    every rerun.
    """
    st.pyplot(fig)
    plt.close(fig)


def _render_introduction() -> None:
    """Show the static description of the model, the dataset and the app."""
    st.title("Introduction to K-Means Clustering")

    st.write("### About the Model")
    st.write("K-Means Clustering is an unsupervised machine learning algorithm used for customer segmentation. It helps identify different groups of customers based on their spending behavior and income.")

    st.write("### About the Dataset")
    st.write("The dataset consists of customer information, including:")
    st.markdown("- **CustomerID**: Unique identifier for each customer.")
    st.markdown("- **Gender**: Male or Female.")
    st.markdown("- **Age**: Age of the customer.")
    st.markdown("- **Annual Income (k$)**: Customer's yearly income.")
    st.markdown("- **Spending Score (1-100)**: A score assigned based on spending behavior.")

    st.write("### How to Use the App")
    st.markdown("1. **Go to 'Data Exploration'**: Understand the dataset using statistics and visualizations.")
    st.markdown("2. **Go to 'K-Means Model'**: Train the model and visualize clusters.")
    st.markdown("3. **Go to 'Test Model'**: Input values to predict customer cluster.")

    st.write("### Insights")
    st.markdown("- Customers can be grouped into different segments based on their income and spending habits.")
    st.markdown("- The Elbow Method helps determine the optimal number of clusters.")
    st.markdown("- Businesses can use these insights to tailor marketing strategies and improve customer engagement.")


def _render_data_exploration(df: pd.DataFrame) -> None:
    """Show the head, summary statistics, pair plot and correlation heatmap."""
    st.title("Data Exploration")

    st.write("### First 5 rows of dataset")
    st.dataframe(df.head())

    st.write("### Summary Statistics")
    st.write(df.describe())

    plot_data = df.drop(columns=EXCLUDED_PLOT_COLUMNS)

    st.write("### Pairplot")
    # pairplot builds its own figure; take it from the returned grid.
    grid = sns.pairplot(plot_data, diag_kind="kde")
    _show_figure(grid.figure)

    st.write("### Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(plot_data.corr(), annot=True, cmap="coolwarm", ax=ax)
    _show_figure(fig)


def _render_kmeans_model(df: pd.DataFrame) -> None:
    """Plot the elbow curve, fit K-Means for the chosen k and show the clusters.

    The fitted scaler and model are stored in ``st.session_state`` under the
    keys ``'scaler'`` and ``'kmeans'`` so the "Test Model" page can reuse them.
    """
    st.title("K-Means Clustering")

    # Selecting features for clustering
    features = df[FEATURE_COLUMNS]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Finding the optimal number of clusters using Elbow Method
    st.write("### Elbow Method")
    inertia = _elbow_inertia(scaled_features)
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(ELBOW_K_VALUES, inertia, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.set_title('Elbow Method for Optimal k')
    _show_figure(fig)

    # Train K-Means Model
    k = st.slider("Select Number of Clusters", 2, 10, 5)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    # assign() returns a new frame, so the loaded data itself is left untouched.
    clustered = df.assign(Cluster=kmeans.fit_predict(scaled_features))

    st.write("### Clustered Data")
    st.dataframe(clustered)

    # Visualization of clusters
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        x=clustered["Annual Income (k$)"],
        y=clustered["Spending Score (1-100)"],
        hue=clustered['Cluster'],
        palette='viridis',
        ax=ax,
    )
    ax.set_xlabel("Annual Income (k$)")
    ax.set_ylabel("Spending Score (1-100)")
    ax.set_title("Customer Segmentation using K-Means")
    _show_figure(fig)

    # Store the model and scaler for the "Test Model" page
    st.session_state['scaler'] = scaler
    st.session_state['kmeans'] = kmeans


def _render_test_model() -> None:
    """Predict the cluster of one customer with the last trained model."""
    st.title("Test K-Means Model")
    income = st.number_input("Enter Annual Income (k$)", min_value=0, max_value=200, value=50)
    score = st.number_input("Enter Spending Score (1-100)", min_value=1, max_value=100, value=50)

    if 'scaler' not in st.session_state or 'kmeans' not in st.session_state:
        st.write("### Please run the K-Means Model section first.")
        return

    # The scaler was fitted on a DataFrame, so pass the sample with the same
    # column names; a bare list triggers scikit-learn's feature-name warning.
    sample = pd.DataFrame([[income, score]], columns=FEATURE_COLUMNS)
    input_data = st.session_state['scaler'].transform(sample)
    prediction = st.session_state['kmeans'].predict(input_data)
    st.write(f"### Predicted Cluster: {prediction[0]}")


try:
    df = load_data()
except FileNotFoundError:
    # Show a readable message instead of a raw traceback when the CSV is absent.
    st.error(f"Dataset file '{DATA_FILE}' was not found in the app's working directory.")
    st.stop()

# Page name -> renderer; insertion order is the order shown in the sidebar.
SECTIONS = {
    "Introduction": _render_introduction,
    "Data Exploration": lambda: _render_data_exploration(df),
    "K-Means Model": lambda: _render_kmeans_model(df),
    "Test Model": _render_test_model,
}

# Sidebar for navigation
st.sidebar.title("K-Means Clustering App")
section = st.sidebar.radio("Go to", list(SECTIONS))

SECTIONS[section]()