allantacuelwvsu committed on
Commit
5b3cd15
·
1 Parent(s): 322abb6

update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -41
app.py CHANGED
@@ -3,83 +3,108 @@ import pandas as pd
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
- from sklearn.cluster import KMeans
7
  from sklearn.preprocessing import StandardScaler, LabelEncoder
 
8
  from sklearn.metrics import silhouette_score
9
 
10
  # Load dataset
11
- df = pd.read_csv("datasets/Mall_Customers.csv")
12
- df.drop(columns=['CustomerID'], inplace=True)
 
 
13
 
14
- # Encode Gender
15
- le = LabelEncoder()
16
- df['Genre'] = le.fit_transform(df['Genre'])
17
 
18
- # Standardize numerical columns
 
 
19
  scaler = StandardScaler()
20
  df_scaled = scaler.fit_transform(df)
21
 
22
- # Optimal K estimation
 
 
 
 
 
 
 
 
23
  k_optimal = 5
24
- kmeans = KMeans(n_clusters=k_optimal, init='k-means++', random_state=42)
25
  kmeans.fit(df_scaled)
26
  df['Cluster'] = kmeans.labels_
27
 
28
  sil_score = silhouette_score(df_scaled, kmeans.labels_)
29
 
30
  # Streamlit App
31
- st.title("Customer Segmentation using K-Means")
32
- st.caption("Dataset: Mall Customers")
33
 
34
- tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Customer Predictor"])
35
 
36
  with tab1:
37
  st.header("Model Performance")
38
  st.write(f"**Silhouette Score:** {sil_score:.4f}")
39
 
40
- wcss = []
41
- k_values = range(1, 11)
42
- for k in k_values:
43
- kmeans_temp = KMeans(n_clusters=k, init='k-means++', random_state=42)
44
- kmeans_temp.fit(df_scaled)
45
- wcss.append(kmeans_temp.inertia_)
46
-
47
  fig, ax = plt.subplots()
48
- ax.plot(k_values, wcss, marker='o', linestyle='--')
49
- ax.set_xlabel('Number of Clusters (K)')
50
- ax.set_ylabel('WCSS')
51
- ax.set_title('Elbow Method for Optimal K')
52
  st.pyplot(fig)
53
 
 
 
 
 
 
 
 
 
 
54
  with tab2:
55
  st.header("Dataset")
56
- st.dataframe(df.head())
 
 
 
 
 
57
 
58
- fig, ax = plt.subplots(figsize=(8, 6))
59
- sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
60
- ax.set_title("Correlation Matrix")
61
- st.pyplot(fig)
62
 
 
 
 
 
 
 
63
  with tab3:
64
- st.header("Customer Segment Prediction")
65
 
66
- annual_income = st.slider("Annual Income (k$)", min_value=int(df['Annual Income (k$)'].min()), max_value=int(df['Annual Income (k$)'].max()), value=50)
67
- spending_score = st.slider("Spending Score (1-100)", min_value=int(df['Spending Score (1-100)'].min()), max_value=int(df['Spending Score (1-100)'].max()), value=50)
68
- gender = st.radio("Gender", ["Male", "Female"], index=0)
69
- gender_encoded = 1 if gender == "Female" else 0
70
 
71
- input_data = np.array([[gender_encoded, annual_income, spending_score]])
72
- input_scaled = scaler.transform(np.hstack((input_data, np.zeros((input_data.shape[0], 1))))) # Ensure same feature shape
73
- cluster_prediction = kmeans.predict(input_scaled)[0]
 
74
 
75
  st.subheader("Predicted Customer Segment")
76
- st.markdown(f"<h1 style='color:blue;'>{cluster_prediction}</h1>", unsafe_allow_html=True)
77
 
 
78
  fig, ax = plt.subplots()
79
  sns.scatterplot(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'], hue=df['Cluster'], palette='viridis', alpha=0.6)
80
- ax.scatter(annual_income, spending_score, color='red', s=100, edgecolors='black', label='Input Customer')
81
- ax.set_xlabel('Annual Income (k$)')
82
- ax.set_ylabel('Spending Score')
83
- ax.legend()
 
84
  st.pyplot(fig)
 
85
  st.divider()
 
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
 
6
  from sklearn.preprocessing import StandardScaler, LabelEncoder
7
+ from sklearn.cluster import KMeans
8
  from sklearn.metrics import silhouette_score
9
 
10
  # Load dataset
11
+ @st.cache_data()
12
+ def load_data():
13
+ df = pd.read_csv("datasets/Mall_Customers.csv")
14
+ return df
15
 
16
+ df = load_data()
 
 
17
 
18
+ df.drop(columns=["CustomerID"], inplace=True) # Drop non-essential column
19
+ le = LabelEncoder()
20
+ df["Genre"] = le.fit_transform(df["Genre"]) # Encode Gender (LabelEncoder sorts classes: Female=0, Male=1)
21
  scaler = StandardScaler()
22
  df_scaled = scaler.fit_transform(df)
23
 
24
+ # Find optimal K
25
+ wcss = []
26
+ k_values = range(1, 11)
27
+ for k in k_values:
28
+ kmeans = KMeans(n_clusters=k, init='k-means++', random_state=1, n_init=10)
29
+ kmeans.fit(df_scaled)
30
+ wcss.append(kmeans.inertia_)
31
+
32
+ # Choose optimal K (assumed 5 based on elbow curve)
33
  k_optimal = 5
34
+ kmeans = KMeans(n_clusters=k_optimal, init='k-means++', random_state=1, n_init=10)
35
  kmeans.fit(df_scaled)
36
  df['Cluster'] = kmeans.labels_
37
 
38
  sil_score = silhouette_score(df_scaled, kmeans.labels_)
39
 
40
  # Streamlit App
41
+ st.title("Clustering: Mall Customers Segmentation")
42
+ st.caption("Dataset: Mall_Customers.csv")
43
 
44
+ tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Customer Segment Predictor"])
45
 
46
  with tab1:
47
  st.header("Model Performance")
48
  st.write(f"**Silhouette Score:** {sil_score:.4f}")
49
 
 
 
 
 
 
 
 
50
  fig, ax = plt.subplots()
51
+ plt.plot(k_values, wcss, marker='o', linestyle='--')
52
+ plt.xlabel('Number of Clusters (K)')
53
+ plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
54
+ plt.title('Elbow Method for Optimal K')
55
  st.pyplot(fig)
56
 
57
+ st.subheader("Customer Segments Visualization")
58
+ fig, ax = plt.subplots()
59
+ sns.scatterplot(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'], hue=df['Cluster'], palette='viridis')
60
+ plt.xlabel('Annual Income (k$)')
61
+ plt.ylabel('Spending Score')
62
+ plt.title('Customer Segments')
63
+ st.pyplot(fig)
64
+ st.divider()
65
+
66
  with tab2:
67
  st.header("Dataset")
68
+ def corr_matrix(data, title):
69
+ data = data.select_dtypes(include=["number"])
70
+ fig, ax = plt.subplots(figsize=(8, 6))
71
+ sns.heatmap(data.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
72
+ ax.set_title(title)
73
+ st.pyplot(fig)
74
 
75
+ corr_matrix(df, "Correlation Matrix")
76
+ view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
 
 
77
 
78
+ if view_type == "Top -> Bottom":
79
+ st.dataframe(df.head(len(df)))
80
+ else:
81
+ st.dataframe(df.tail(len(df)).iloc[::-1])
82
+ st.divider()
83
+
84
  with tab3:
85
+ st.header("Customer Segment Predictor")
86
 
87
+ income = st.slider("Annual Income (k$)", int(df['Annual Income (k$)'].min()), int(df['Annual Income (k$)'].max()), int(df['Annual Income (k$)'].median()))
88
+ spending = st.slider("Spending Score (1-100)", int(df['Spending Score (1-100)'].min()), int(df['Spending Score (1-100)'].max()), int(df['Spending Score (1-100)'].median()))
89
+ age = st.slider("Age", int(df['Age'].min()), int(df['Age'].max()), int(df['Age'].median()))
90
+ gender = st.radio("Gender", ["Male", "Female"])
91
 
92
+ input_data = pd.DataFrame([[gender, age, income, spending]], columns=["Genre", "Age", "Annual Income (k$)", "Spending Score (1-100)"])
93
+ input_data["Genre"] = le.transform([gender])[0] # Encode gender
94
+ input_scaled = scaler.transform(input_data)
95
+ predicted_cluster = kmeans.predict(input_scaled)[0]
96
 
97
  st.subheader("Predicted Customer Segment")
98
+ st.markdown(f"<h1 style='color:green;'>Cluster {predicted_cluster}</h1>", unsafe_allow_html=True)
99
 
100
+ # Graph to visualize input placement
101
  fig, ax = plt.subplots()
102
  sns.scatterplot(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'], hue=df['Cluster'], palette='viridis', alpha=0.6)
103
+ plt.scatter(income, spending, color='red', label='Your Input', edgecolors='black', s=100)
104
+ plt.xlabel('Annual Income (k$)')
105
+ plt.ylabel('Spending Score')
106
+ plt.title('Customer Segments with Your Input')
107
+ plt.legend()
108
  st.pyplot(fig)
109
+
110
  st.divider()