Spaces:

chiichann
/

customer_segmentation_tool

Sleeping

App Files Files Community

customer_segmentation_tool / app.py

chiichann

Update app.py

b5535f7 verified about 1 month ago

raw

history blame contribute delete

3.52 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import StandardScaler
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Page heading
	st.title("🛍️ Customer Segmentation Tool")

	# Create the three tabs; tab2 and tab3 are filled in further down the script
	tab_labels = ["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"]
	tab1, tab2, tab3 = st.tabs(tab_labels)

	# Markdown text rendered on the About tab
	about_text = """
	This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
	The dataset is preloaded and contains online retail data.
	### How It Works:
	- Step 1: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
	- Step 2: Process the data by calculating the total spent and aggregating the information by customer.
	- Step 3: Apply K-Means Clustering to segment the customers into distinct groups.
	- Step 4: Visualize the customer segments with a scatter plot.
	"""

	# About Tab: static description only
	with tab1:
	    st.write(about_text)

	@st.cache_data
	def load_data(path, sheet_name='Online Retail'):
	    """Read one sheet of the Excel workbook at ``path`` into a DataFrame.

	    Streamlit re-executes the whole script on every widget interaction, so
	    the workbook would otherwise be re-parsed each time. ``st.cache_data``
	    keeps the parsed result keyed on the arguments and hands back a copy on
	    each call, so later code may modify the returned frame freely.

	    Args:
	        path: Location of the Excel workbook.
	        sheet_name: Sheet to read.

	    Returns:
	        The sheet contents as a ``pandas.DataFrame``.
	    """
	    return pd.read_excel(path, sheet_name=sheet_name)


	# Load preloaded dataset (parsed once, then served from Streamlit's cache)
	file_path = "Online Retail.xlsx"
	df = load_data(file_path)

	@st.cache_data
	def elbow_distortions(scaled, k_values):
	    """Return the K-Means inertia for every cluster count in ``k_values``.

	    The result is cached so that script reruns do not refit the models while
	    the scaled feature table is unchanged.

	    Args:
	        scaled: Standardized per-customer feature table.
	        k_values: Cluster counts to evaluate.

	    Returns:
	        A list of inertia values, one per entry of ``k_values``.
	    """
	    # n_init is pinned because the library default differs between
	    # scikit-learn releases; random_state keeps the fits reproducible.
	    return [
	        KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled).inertia_
	        for k in k_values
	    ]


	# Dataset Overview Tab
	# NOTE(review): the original indentation of this section is not recoverable;
	# everything down to the elbow plot is assumed to render inside this tab - confirm.
	with tab2:
	    st.write("### Dataset Overview")
	    st.write(df.head())

	    # Preprocess data: drop rows without CustomerID, compute the line total,
	    # then drop rows whose total could not be computed. `assign` builds a new
	    # frame instead of writing into the result of `dropna`, which avoids
	    # pandas' SettingWithCopyWarning.
	    total_spent = pd.to_numeric(df["Quantity"], errors='coerce') * pd.to_numeric(df["UnitPrice"], errors='coerce')
	    df = (
	        df.dropna(subset=["CustomerID"])
	        .assign(TotalSpent=total_spent)
	        .dropna(subset=["TotalSpent"])
	    )

	    # Aggregate data by Customer.
	    # NOTE(review): "NumTransactions" is the summed Quantity (units bought),
	    # not a count of transactions; the name is kept because later code reads it.
	    customer_data = df.groupby("CustomerID").agg(
	        TotalSpent=("TotalSpent", "sum"),
	        NumTransactions=("Quantity", "sum"),
	        AvgUnitPrice=("UnitPrice", "mean"),
	    )

	    st.write("### Processed Customer Data")
	    st.write(customer_data.head())

	    # Standardize the data so each feature has zero mean and unit variance
	    scaler = StandardScaler()
	    customer_scaled = pd.DataFrame(
	        scaler.fit_transform(customer_data),
	        columns=customer_data.columns,
	        index=customer_data.index,
	    )

	    # Elbow Method to determine optimal clusters
	    st.write("### Elbow Method for Optimal Cluster Selection")
	    K = tuple(range(1, 11))
	    distortions = elbow_distortions(customer_scaled, K)

	    fig, ax = plt.subplots()
	    ax.plot(K, distortions, marker='o')
	    ax.set_xlabel("Number of Clusters")
	    ax.set_ylabel("Distortion")
	    ax.set_title("Elbow Method for Optimal k")
	    st.pyplot(fig)
	    # pyplot keeps every figure it created alive until it is closed; close it
	    # so figures do not pile up across reruns of the script.
	    plt.close(fig)

	# Customer Segmentation Tab
	# NOTE(review): the original indentation of this section is not recoverable;
	# the remainder of the script is assumed to render inside this tab - confirm.
	with tab3:
	    # User selects the number of clusters
	    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)

	    # Apply K-Means clustering on the standardized features. n_init is pinned
	    # because the library default differs between scikit-learn releases.
	    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
	    customer_data["Cluster"] = model.fit_predict(customer_scaled)

	    # Visualize the clusters on the unscaled values
	    # NOTE(review): "NumTransactions" holds the summed Quantity per customer,
	    # so the y-axis label below may not match the plotted values - confirm intent.
	    st.write("### Clusters Visualization")
	    fig, ax = plt.subplots()
	    scatter = ax.scatter(
	        customer_data["TotalSpent"],
	        customer_data["NumTransactions"],
	        c=customer_data["Cluster"],
	        cmap='viridis',
	    )
	    ax.set_xlabel("Total Spent")
	    ax.set_ylabel("Number of Transactions")
	    ax.set_title("Customer Segments")
	    # Attach the colorbar to this figure explicitly rather than relying on
	    # pyplot's global "current figure".
	    fig.colorbar(scatter, ax=ax, label="Cluster")
	    st.pyplot(fig)
	    # Release the figure so figures do not pile up across reruns.
	    plt.close(fig)

	    # Show the segmented customer data: per-cluster mean of each feature
	    st.write("### Customer Segments Data")
	    feature_columns = ["TotalSpent", "NumTransactions", "AvgUnitPrice"]
	    st.write(customer_data.groupby("Cluster")[feature_columns].mean())