Ci-Dave's picture
Modified the summary table
68f459d
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.mixture import GaussianMixture
from datasets import load_dataset
import os
# Load dataset
@st.cache_data
def load_data():
    """Fetch the stock-market dataset and return its "train" split as a DataFrame.

    The result is wrapped in ``st.cache_data`` so the download is not repeated
    on every script rerun.

    Returns:
        pandas.DataFrame built from the "train" split of
        "Ci-Dave/SPY500MW_StockMarket".
    """
    train_split = load_dataset("Ci-Dave/SPY500MW_StockMarket")["train"]
    return pd.DataFrame(train_split)
# Preprocessing function
def preprocess_data(df):
    """Reduce *df* to complete numeric rows and standardize them.

    Args:
        df: Input DataFrame; non-numeric columns are ignored.

    Returns:
        A ``(df_numeric, df_scaled, scaler)`` tuple: the numeric-only frame
        with rows containing missing values removed, the array produced by
        ``StandardScaler.fit_transform`` on that frame, and the fitted scaler.
    """
    scaler = StandardScaler()
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    # Rows with any missing numeric value are discarded before scaling.
    df_numeric = df[numeric_columns].dropna()
    return df_numeric, scaler.fit_transform(df_numeric), scaler
# Load the raw dataset, then derive the numeric-only frame (df_numeric), its
# standardized array (df_scaled) and the fitted scaler. The page code further
# down reads df, df_numeric and df_scaled; scaler is not referenced again in
# the visible code.
df = load_data()
df_numeric, df_scaled, scaler = preprocess_data(df)
# Load trained models with error handling
def load_model(filename):
    """Load a persisted model from *filename* with joblib.

    Any failure is reported in the Streamlit page instead of aborting the
    script, so the remaining pages stay usable.

    Args:
        filename: Path of the joblib/pickle file to load.

    Returns:
        The deserialized object, or ``None`` when the file is missing or
        cannot be deserialized.
    """
    # NOTE: joblib.load unpickles arbitrary objects, which can execute code.
    # Only point this at model files from a trusted source.
    try:
        return joblib.load(filename)
    except FileNotFoundError:
        st.error(f"Model file {filename} not found. Make sure you uploaded the trained model.")
    except Exception as exc:  # UI boundary: corrupt or incompatible pickles must not crash the app
        st.error(f"Model file {filename} could not be loaded: {exc}")
    return None
# Load the four persisted models in a fixed order; each name is None when its
# file could not be loaded.
kmeans, hierarchical, dbscan, gmm = map(
    load_model,
    (
        "kmeans_stock_model.pkl",
        "hierarchical_stock_model.pkl",
        "dbscan_stock_model.pkl",
        "gmm_stock_model.pkl",
    ),
)

# Sidebar navigation: `page` holds the label of the selected entry.
with st.sidebar:
    st.title("Stock Market Clustering")
    page = st.radio(
        "Choose a model:",
        ["Home", "Dataset", "K-Means", "Hierarchical", "DBSCAN", "GMM"],
    )
def _render_figure(fig):
    """Draw *fig* on the page, then close it so figures do not pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


def _show_cluster_tables(column):
    """Show the first rows of label column *column* and the row count per cluster."""
    st.write("Cluster Assignments:")
    st.dataframe(df_numeric[[column]].head())
    cluster_counts = df_numeric[column].value_counts().reset_index()
    cluster_counts.columns = ["Cluster", "Count"]
    st.write("Cluster Distribution:")
    st.dataframe(cluster_counts)


def _show_silhouette(features, labels):
    """Show the silhouette score of *labels* over *features* when it is defined."""
    n_labels = np.unique(labels).size
    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1.
    if 2 <= n_labels < len(labels):
        st.write(f"Silhouette Score: {silhouette_score(features, labels):.4f}")
    else:
        st.warning("Silhouette Score is undefined for this labelling (it needs at least 2 clusters).")


def _show_cluster_scatter(labels, palette="viridis"):
    """Scatter the first two columns of df_numeric, coloured by *labels*."""
    fig, ax = plt.subplots()
    sns.scatterplot(x=df_numeric.iloc[:, 0], y=df_numeric.iloc[:, 1], hue=labels, palette=palette, ax=ax)
    _render_figure(fig)


# Render the page selected in the sidebar.
if page == "Home":
    st.title("Stock Market Clustering Analysis")
    # The text below is kept flush-left: indenting Markdown by four or more
    # spaces would turn it into a code block.
    st.write("""
This application analyzes stock market data using clustering techniques.
Explore different clustering models (K-Means, Hierarchical, DBSCAN, and GMM) to identify patterns in stock prices.
**Dataset:**
The dataset includes stock market price data with key attributes such as Open, High, Low, and Close values.
**Clustering Models:**
- **K-Means:** Groups stocks into clusters based on price similarities.
- **Hierarchical Clustering:** Forms a tree-like structure to identify stock relationships.
- **DBSCAN:** Detects core clusters and outliers in stock prices.
- **Gaussian Mixture Model (GMM):** Uses probabilistic clustering for stock analysis.
**Instructions:**
- Select a clustering model from the sidebar.
- View the clustering results, including cluster assignments, distributions, and visualizations.
""")
elif page == "Dataset":
    st.title("Stock Market Dataset")
    st.write("### Dataset Preview:")
    st.dataframe(df[["Open", "High", "Low", "Close"]].head(), use_container_width=True)  # Show only key features
    st.write("### Full Dataset:")
    st.dataframe(df, use_container_width=True)
    st.write("### Dataset Summary:")
    # st.dataframe (not st.write) is the call that honours use_container_width.
    st.dataframe(df_numeric.describe(), use_container_width=True)
    st.write("### Correlation Heatmap:")
    fig, ax = plt.subplots()
    sns.heatmap(df_numeric.corr(), annot=True, cmap="coolwarm", ax=ax)
    _render_figure(fig)
elif page == "K-Means":
    st.title("K-Means Clustering")
    if kmeans is not None:
        clusters = kmeans.predict(df_scaled)
        df_numeric["KMeans Cluster"] = clusters
        _show_cluster_tables("KMeans Cluster")
        _show_silhouette(df_scaled, clusters)
        _show_cluster_scatter(clusters)
elif page == "Hierarchical":
    st.title("Hierarchical Clustering")
    # The linkage is recomputed here on every visit; the loaded `hierarchical`
    # model object is not used by this page.
    linked = linkage(df_scaled, method='ward')  # Ward linkage matrix
    clusters = fcluster(linked, 4, criterion='maxclust')  # Cut the tree into at most 4 flat clusters
    df_numeric["Hierarchical Cluster"] = clusters
    _show_cluster_tables("Hierarchical Cluster")
    # Dendrogram, truncated to the top 5 levels to keep it readable.
    fig, ax = plt.subplots(figsize=(10, 5))
    dendrogram(linked, truncate_mode='level', p=5, ax=ax)
    _render_figure(fig)
    _show_cluster_scatter(clusters)
elif page == "DBSCAN":
    st.title("DBSCAN Clustering")
    if dbscan is not None:
        # fit_predict refits the estimator on the current data.
        clusters = dbscan.fit_predict(df_scaled)
        df_numeric["DBSCAN Cluster"] = clusters
        _show_cluster_tables("DBSCAN Cluster")
        # Spread the real clusters over the whole colormap regardless of how
        # many there are; label -1 (outliers) is always drawn in red.
        cluster_ids = [label for label in np.unique(clusters) if label != -1]
        colormap = sns.color_palette("viridis", as_cmap=True)
        span = max(len(cluster_ids) - 1, 1)
        palette = {label: colormap(position / span) for position, label in enumerate(cluster_ids)}
        palette[-1] = "red"
        _show_cluster_scatter(clusters, palette=palette)
elif page == "GMM":
    st.title("Gaussian Mixture Model (GMM)")
    if gmm is not None:
        # Keep only as many leading feature columns as the fitted model has.
        expected_features = gmm.means_.shape[1]
        df_gmm = df_scaled[:, :expected_features]
        clusters = gmm.predict(df_gmm)
        df_numeric["GMM Cluster"] = clusters
        _show_cluster_tables("GMM Cluster")
        _show_silhouette(df_gmm, clusters)
        _show_cluster_scatter(clusters)