import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Dataset loader (wrapped in Streamlit's data cache)
@st.cache_data
def load_data():
    """Read ``Mall_Customers.csv`` (relative path) into a pandas DataFrame.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv("Mall_Customers.csv")

# Load the customer data once per script run.
df = load_data()

# Navigation lives in the sidebar; `section` holds the page the user picked.
with st.sidebar:
    st.title("K-Means Clustering App")
    section = st.radio(
        "Go to",
        ["Introduction", "Data Exploration", "K-Means Model", "Test Model"],
    )

if section == "Introduction":
    # Static landing page: describes the model, the dataset columns, how to
    # navigate the app, and the takeaways. Each bullet is rendered with its
    # own st.markdown call, in the order listed.
    st.title("Introduction to K-Means Clustering")

    st.write("### About the Model")
    st.write("K-Means Clustering is an unsupervised machine learning algorithm used for customer segmentation. It helps identify different groups of customers based on their spending behavior and income.")

    st.write("### About the Dataset")
    st.write("The dataset consists of customer information, including:")
    for column_note in (
        "- **CustomerID**: Unique identifier for each customer.",
        "- **Gender**: Male or Female.",
        "- **Age**: Age of the customer.",
        "- **Annual Income (k$)**: Customer's yearly income.",
        "- **Spending Score (1-100)**: A score assigned based on spending behavior.",
    ):
        st.markdown(column_note)

    st.write("### How to Use the App")
    for usage_step in (
        "1. **Go to 'Data Exploration'**: Understand the dataset using statistics and visualizations.",
        "2. **Go to 'K-Means Model'**: Train the model and visualize clusters.",
        "3. **Go to 'Test Model'**: Input values to predict customer cluster.",
    ):
        st.markdown(usage_step)

    st.write("### Insights")
    for insight in (
        "- Customers can be grouped into different segments based on their income and spending habits.",
        "- The Elbow Method helps determine the optimal number of clusters.",
        "- Businesses can use these insights to tailor marketing strategies and improve customer engagement.",
    ):
        st.markdown(insight)

elif section == "Data Exploration":
    st.title("Data Exploration")
    st.write("### First 5 rows of dataset")
    st.dataframe(df.head())
    
    st.write("### Summary Statistics")
    st.write(df.describe())
    
    st.write("### Pairplot")
    sns.pairplot(df.drop(columns=["CustomerID", "Gender"]), diag_kind="kde")
    st.pyplot()
    
    st.write("### Correlation Heatmap")
    plt.figure(figsize=(8, 6))
    sns.heatmap(df.drop(columns=["CustomerID", "Gender"]).corr(), annot=True, cmap="coolwarm")
    st.pyplot()

elif section == "K-Means Model":
    st.title("K-Means Clustering")
    
    # Selecting features for clustering
    features = df[["Annual Income (k$)", "Spending Score (1-100)"]]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)
    
    # Finding the optimal number of clusters using Elbow Method
    st.write("### Elbow Method")
    inertia = []
    for k in range(1, 11):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(scaled_features)
        inertia.append(kmeans.inertia_)
    
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, 11), inertia, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    st.pyplot()
    
    # Train K-Means Model
    k = st.slider("Select Number of Clusters", 2, 10, 5)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    df['Cluster'] = kmeans.fit_predict(scaled_features)
    
    st.write("### Clustered Data")
    st.dataframe(df)
    
    # Visualization of clusters
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=df["Annual Income (k$)"], y=df["Spending Score (1-100)"], hue=df['Cluster'], palette='viridis')
    plt.xlabel("Annual Income (k$)")
    plt.ylabel("Spending Score (1-100)")
    plt.title("Customer Segmentation using K-Means")
    st.pyplot()

    # Store the model and scaler globally
    st.session_state['scaler'] = scaler
    st.session_state['kmeans'] = kmeans

elif section == "Test Model":
    st.title("Test K-Means Model")
    
    income = st.number_input("Enter Annual Income (k$)", min_value=0, max_value=200, value=50)
    score = st.number_input("Enter Spending Score (1-100)", min_value=1, max_value=100, value=50)
    
    if 'scaler' in st.session_state and 'kmeans' in st.session_state:
        input_data = st.session_state['scaler'].transform([[income, score]])
        prediction = st.session_state['kmeans'].predict(input_data)
        st.write(f"### Predicted Cluster: {prediction[0]}")
    else:
        st.write("### Please run the K-Means Model section first.")