# Page header captured together with the file (not part of the program):
# chiichann's picture
# Update app.py
# b5535f7 verified
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Heading displayed at the top of the page.
st.title("🛍️ Customer Segmentation Tool")

# Create the three tab containers; the sections below render into them.
tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"])

# Static description shown in the About tab (Markdown).
_ABOUT_MARKDOWN = """
This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
The dataset is preloaded and contains online retail data.
### How It Works:
- **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
- **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
- **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
- **Step 4**: Visualize the customer segments with a scatter plot.
"""

# About tab: only renders the description above.
with tab1:
    st.write(_ABOUT_MARKDOWN)
# Load preloaded dataset
file_path = "Online Retail.xlsx"


@st.cache_data
def load_retail_data(path, sheet_name="Online Retail"):
    """Read the retail transactions workbook into a DataFrame.

    Streamlit re-executes the whole script on every widget interaction, so
    the parsed workbook is cached. The cache is keyed on the arguments only:
    if the file's contents change on disk while the path stays the same, the
    cache must be cleared to pick up the new data.

    Args:
        path: Location of the Excel workbook.
        sheet_name: Worksheet to read.

    Returns:
        The worksheet as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    return pd.read_excel(path, sheet_name=sheet_name)


try:
    df = load_retail_data(file_path)
except FileNotFoundError:
    # Show a readable message instead of a raw traceback, then halt this run
    # because nothing below can work without the data.
    st.error(f"Dataset file '{file_path}' was not found.")
    st.stop()

# Dataset Overview Tab
with tab2:
    st.write("### Dataset Overview")
    st.write(df.head())
# Preprocess data
def clean_transactions(transactions):
    """Drop unusable rows and add a per-row ``TotalSpent`` column.

    Rows without a ``CustomerID`` are removed. ``TotalSpent`` is
    ``Quantity * UnitPrice`` with both operands coerced to numbers; rows where
    that product is missing (either operand was not numeric) are removed too.

    The column is added with ``assign`` on the filtered frame rather than by
    item assignment, which avoids pandas' SettingWithCopyWarning and leaves
    the input frame untouched.

    Args:
        transactions: Raw transaction rows with ``CustomerID``, ``Quantity``
            and ``UnitPrice`` columns.

    Returns:
        A new DataFrame containing only the usable rows plus ``TotalSpent``.
    """
    identified = transactions.dropna(subset=["CustomerID"])
    total_spent = (
        pd.to_numeric(identified["Quantity"], errors="coerce")
        * pd.to_numeric(identified["UnitPrice"], errors="coerce")
    )
    return identified.assign(TotalSpent=total_spent).dropna(subset=["TotalSpent"])


def aggregate_by_customer(transactions):
    """Build one feature row per customer.

    Columns, in order: ``TotalSpent`` (sum), ``NumTransactions`` and
    ``AvgUnitPrice`` (mean of ``UnitPrice``).

    ``NumTransactions`` used to be the summed ``Quantity`` (units bought),
    which does not match its name or the "Number of Transactions" axis label
    used when plotting. It is now a real count: distinct invoices per customer
    when an ``InvoiceNo`` column is available, otherwise the number of
    transaction rows per customer.
    NOTE(review): ``InvoiceNo`` is assumed to be the invoice identifier of the
    retail dataset - confirm against the workbook.

    Args:
        transactions: Cleaned rows as returned by ``clean_transactions``.

    Returns:
        DataFrame indexed by ``CustomerID`` with the three feature columns.
    """
    per_customer = transactions.groupby("CustomerID")
    if "InvoiceNo" in transactions.columns:
        num_transactions = per_customer["InvoiceNo"].nunique()
    else:
        num_transactions = per_customer.size()
    return pd.DataFrame({
        "TotalSpent": per_customer["TotalSpent"].sum(),
        "NumTransactions": num_transactions,
        "AvgUnitPrice": per_customer["UnitPrice"].mean(),
    })


# Keep `df` bound to the cleaned rows, as before.
df = clean_transactions(df)

# Aggregate data by Customer
customer_data = aggregate_by_customer(df)

# NOTE(review): indentation was lost in the reviewed copy; rendering this
# section inside the Dataset Overview tab is an assumption - confirm.
with tab2:
    st.write("### Processed Customer Data")
    st.write(customer_data.head())
# Standardize the data so every feature contributes on the same scale to the
# Euclidean distances K-Means relies on.
scaler = StandardScaler()
customer_scaled = pd.DataFrame(
    scaler.fit_transform(customer_data),
    columns=customer_data.columns,
    index=customer_data.index,
)


@st.cache_data
def compute_elbow_inertias(scaled_features, k_values):
    """Fit K-Means once per candidate k and return each model's inertia.

    Cached because Streamlit re-runs the script on every widget interaction
    and these fits do not depend on any widget; they are only recomputed when
    the features or the candidate list change.

    ``n_init`` is set explicitly: scikit-learn changed its default from 10 to
    ``"auto"`` (announced with a FutureWarning), so pinning it keeps results
    identical across library versions and silences the warning.

    Args:
        scaled_features: Standardized per-customer feature table.
        k_values: Candidate cluster counts, as a tuple of ints.

    Returns:
        List of inertia values, one per entry of ``k_values``, in order.
    """
    return [
        KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_features).inertia_
        for k in k_values
    ]


# Elbow Method to determine optimal clusters
K = range(1, 11)
distortions = compute_elbow_inertias(customer_scaled, tuple(K))

# NOTE(review): indentation was lost in the reviewed copy; rendering this
# section inside the Dataset Overview tab is an assumption - confirm.
with tab2:
    st.write("### Elbow Method for Optimal Cluster Selection")
    fig, ax = plt.subplots()
    ax.plot(K, distortions, marker='o')
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Distortion")
    ax.set_title("Elbow Method for Optimal k")
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close it so figures
    # do not pile up in memory across script reruns.
    plt.close(fig)
# Customer Segmentation Tab
with tab3:
    # User selects the number of clusters
    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)

    # Apply K-Means clustering on the standardized features. n_init is pinned
    # because scikit-learn changed its default (10 -> "auto"); an explicit
    # value keeps the segmentation reproducible across library versions and
    # avoids the FutureWarning.
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    customer_data["Cluster"] = model.fit_predict(customer_scaled)

    # Visualize the clusters (raw, unscaled values on both axes)
    st.write("### Clusters Visualization")
    fig, ax = plt.subplots()
    scatter = ax.scatter(
        customer_data["TotalSpent"],
        customer_data["NumTransactions"],
        c=customer_data["Cluster"],
        cmap='viridis',
    )
    ax.set_xlabel("Total Spent")
    ax.set_ylabel("Number of Transactions")
    ax.set_title("Customer Segments")
    # Attach the colorbar through the figure object instead of pyplot's
    # implicit "current figure", which is shared global state.
    fig.colorbar(scatter, ax=ax, label="Cluster")
    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps it alive across reruns.
    plt.close(fig)

    # Show the segmented customer data: mean of each feature per cluster
    st.write("### Customer Segments Data")
    feature_columns = ["TotalSpent", "NumTransactions", "AvgUnitPrice"]
    st.write(customer_data.groupby("Cluster")[feature_columns].mean())