|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.cluster import KMeans |
|
from sklearn.preprocessing import StandardScaler |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
|
|
# Labels for the three top-level tabs, in display order.
TAB_LABELS = ["📖 About", "📊 Dataset Overview", "🧑🤝🧑 Customer Segmentation"]

# Markdown shown on the About tab: what the app does and its four steps.
ABOUT_TEXT = """
This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
The dataset is preloaded and contains online retail data.
### How It Works:
- **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
- **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
- **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
- **Step 4**: Visualize the customer segments with a scatter plot.
"""

# Page header first, then the tab containers used by the rest of the script.
st.title("🛍️ Customer Segmentation Tool")
tab1, tab2, tab3 = st.tabs(TAB_LABELS)

# About tab: static description only, no data access.
with tab1:
    st.write(ABOUT_TEXT)
|
|
|
|
|
# Workbook read by the app; the path is relative to the working directory.
file_path = "Online Retail.xlsx"


@st.cache_data
def _load_retail_data(path: str) -> pd.DataFrame:
    """Read the 'Online Retail' sheet of the workbook at *path*.

    Streamlit re-executes the whole script on every widget interaction, so
    an uncached ``pd.read_excel`` would re-parse the workbook each time the
    cluster slider moves. ``st.cache_data`` keeps the parsed frame and hands
    each run its own copy, so later in-script changes to ``df`` do not leak
    into the cache.

    The cache is keyed on *path* only: replacing the file on disk requires
    clearing the cache (or restarting the app) to be picked up.
    """
    return pd.read_excel(path, sheet_name='Online Retail')


df = _load_retail_data(file_path)
|
|
|
|
|
# Dataset Overview tab: raw preview, cleaning, per-customer aggregation and
# feature scaling. The names df, customer_data, scaler and customer_scaled
# stay module-level because later sections of the script read them.
# NOTE(review): indentation was lost in the copy this was rebuilt from; the
# whole section is assumed to live inside the Dataset Overview tab - confirm.
with tab2:
    st.write("### Dataset Overview")
    # The preview is taken before any rows are dropped, so it shows raw data.
    st.write(df.head())

    # Drop rows without a customer, derive the line total from the numeric
    # value of Quantity and UnitPrice, then drop rows whose total is missing
    # (either factor was absent or could not be parsed as a number).
    df = (
        df.dropna(subset=["CustomerID"])
        .assign(
            TotalSpent=lambda rows: (
                pd.to_numeric(rows["Quantity"], errors='coerce')
                * pd.to_numeric(rows["UnitPrice"], errors='coerce')
            )
        )
        .dropna(subset=["TotalSpent"])
    )

    # One row per customer.
    # NOTE(review): NumTransactions is the summed Quantity per customer, not
    # a count of distinct invoices - the name may be misleading.
    customer_data = df.groupby("CustomerID").agg(
        TotalSpent=("TotalSpent", "sum"),
        NumTransactions=("Quantity", "sum"),
        AvgUnitPrice=("UnitPrice", "mean"),
    )

    st.write("### Processed Customer Data")
    st.write(customer_data.head())

    # Standardise the features and keep the original labels on the result
    # so rows can still be matched back to customers.
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(customer_data)
    customer_scaled = pd.DataFrame(
        scaled_values,
        index=customer_data.index,
        columns=customer_data.columns,
    )
|
|
|
|
|
@st.cache_data
def _elbow_inertias(scaled, cluster_counts):
    """Return the K-Means inertia for each cluster count in *cluster_counts*.

    Args:
        scaled: Scaled per-customer feature table to cluster.
        cluster_counts: Tuple of ``n_clusters`` values to try, in order.

    Returns:
        A list with one inertia value per entry of *cluster_counts*.

    The result is cached because Streamlit re-executes the script on every
    widget interaction; without caching, all of these models would be refit
    each time the cluster slider moves. ``n_init`` is set explicitly so the
    curve does not depend on the installed scikit-learn version's default.
    """
    return [
        KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled).inertia_
        for k in cluster_counts
    ]


# NOTE(review): indentation was lost in the copy this was rebuilt from; the
# elbow plot is assumed to belong to the Dataset Overview tab - confirm.
with tab2:
    st.write("### Elbow Method for Optimal Cluster Selection")

    K = range(1, 11)
    # A tuple is passed so the cache key is a plain hashable value.
    distortions = _elbow_inertias(customer_scaled, tuple(K))

    fig, ax = plt.subplots()
    ax.plot(K, distortions, marker='o')
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Distortion")
    ax.set_title("Elbow Method for Optimal k")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; close
    # it after rendering so figures do not pile up across script reruns.
    plt.close(fig)
|
|
|
|
|
# Customer Segmentation tab: choose k, fit K-Means on the scaled features,
# then show a scatter plot of the clusters and the per-cluster averages.
with tab3:

    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)

    # n_init is set explicitly so cluster assignments do not depend on the
    # installed scikit-learn version's default.
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    # Labels are stored on the unscaled table so the plot and the summary
    # below are expressed in the original units.
    customer_data["Cluster"] = model.fit_predict(customer_scaled)

    st.write("### Clusters Visualization")
    fig, ax = plt.subplots()
    scatter = ax.scatter(
        customer_data["TotalSpent"],
        customer_data["NumTransactions"],
        c=customer_data["Cluster"],
        cmap='viridis',
    )
    ax.set_xlabel("Total Spent")
    ax.set_ylabel("Number of Transactions")
    ax.set_title("Customer Segments")
    # Attach the colorbar to this figure explicitly instead of relying on
    # pyplot's implicit "current figure".
    fig.colorbar(scatter, ax=ax, label="Cluster")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; close
    # it after rendering so figures do not pile up across script reruns.
    plt.close(fig)

    st.write("### Customer Segments Data")
    # Mean of each feature within every cluster.
    st.write(customer_data.groupby("Cluster").agg({"TotalSpent": "mean", "NumTransactions": "mean", "AvgUnitPrice": "mean"}))
|
|