|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import joblib |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.cluster import KMeans, DBSCAN |
|
from sklearn.metrics import silhouette_score |
|
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster |
|
from sklearn.mixture import GaussianMixture |
|
from datasets import load_dataset |
|
import os |
|
|
|
|
|
@st.cache_data
def load_data():
    """Return the "train" split of the stock-market dataset as a DataFrame.

    The dataset is fetched with ``load_dataset`` and the result is wrapped in
    ``st.cache_data`` so the conversion is not repeated on every call.
    """
    train_split = load_dataset("Ci-Dave/SPY500MW_StockMarket")["train"]
    return pd.DataFrame(train_split)
|
|
|
|
|
def preprocess_data(df):
    """Select the numeric columns of *df*, drop incomplete rows and scale them.

    Args:
        df: Input DataFrame; non-numeric columns are ignored.

    Returns:
        A 3-tuple ``(numeric_frame, scaled_values, scaler)``:
        the numeric columns of *df* with rows containing missing values
        removed, the output of ``StandardScaler.fit_transform`` on that
        frame, and the fitted ``StandardScaler`` instance.
    """
    numeric_frame = df[df.select_dtypes(include=[np.number]).columns].dropna()

    standardizer = StandardScaler()
    scaled_values = standardizer.fit_transform(numeric_frame)

    return numeric_frame, scaled_values, standardizer
|
|
|
|
|
# Module-level data shared by the pages rendered further down:
#   df          - dataset as returned by load_data()
#   df_numeric  - numeric columns of df without incomplete rows
#   df_scaled   - standardized values of df_numeric
#   scaler      - the fitted scaler returned by preprocess_data()
df = load_data()
df_numeric, df_scaled, scaler = preprocess_data(df)
|
|
|
|
|
def load_model(filename):
    """Load a persisted model from *filename* with joblib.

    Args:
        filename: Path of the serialized model file.

    Returns:
        The deserialized object, or ``None`` when *filename* is not an
        existing regular file. In that case an error message is shown in
        the Streamlit app instead of raising.
    """
    # isfile() rather than exists(): a directory with that name must be
    # reported as a missing model instead of failing inside joblib.load().
    if not os.path.isfile(filename):
        st.error(f"Model file {filename} not found. Make sure you uploaded the trained model.")
        return None

    # NOTE(security): joblib.load() unpickles the file, which can execute
    # arbitrary code. Only load model files from a trusted source.
    return joblib.load(filename)
|
|
|
# Load the four persisted models in a fixed order; each name is None when
# its file could not be found (load_model reports that in the app).
kmeans, hierarchical, dbscan, gmm = [
    load_model(model_file)
    for model_file in (
        "kmeans_stock_model.pkl",
        "hierarchical_stock_model.pkl",
        "dbscan_stock_model.pkl",
        "gmm_stock_model.pkl",
    )
]
|
|
|
|
|
# Sidebar navigation: the selected entry decides which page is rendered.
st.sidebar.title("Stock Market Clustering")
page_options = ["Home", "Dataset", "K-Means", "Hierarchical", "DBSCAN", "GMM"]
page = st.sidebar.radio("Choose a model:", page_options)
|
|
|
def _show_figure(fig):
    """Render *fig* in the app and then close it.

    pyplot keeps every figure it creates registered until it is closed.
    Streamlit re-runs this script on user interaction, so figures that are
    never closed would pile up in the server process.
    """
    st.pyplot(fig)
    plt.close(fig)


def _show_cluster_tables(column, labels):
    """Store *labels* in ``df_numeric[column]`` and show the summary tables.

    Shows the first rows of the assignment column followed by the number of
    rows per cluster.
    """
    df_numeric[column] = labels

    st.write("Cluster Assignments:")
    st.dataframe(df_numeric[[column]].head())

    cluster_counts = df_numeric[column].value_counts().reset_index()
    cluster_counts.columns = ["Cluster", "Count"]
    st.write("Cluster Distribution:")
    st.dataframe(cluster_counts)


def _show_cluster_scatter(labels, palette="viridis"):
    """Scatter the first two columns of ``df_numeric`` coloured by *labels*."""
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df_numeric.iloc[:, 0],
        y=df_numeric.iloc[:, 1],
        hue=labels,
        palette=palette,
        ax=ax,
    )
    _show_figure(fig)


if page == "Home":
    st.title("Stock Market Clustering Analysis")
    st.write("""
    This application analyzes stock market data using clustering techniques.
    Explore different clustering models (K-Means, Hierarchical, DBSCAN, and GMM) to identify patterns in stock prices.

    **Dataset:**
    The dataset includes stock market price data with key attributes such as Open, High, Low, and Close values.

    **Clustering Models:**
    - **K-Means:** Groups stocks into clusters based on price similarities.
    - **Hierarchical Clustering:** Forms a tree-like structure to identify stock relationships.
    - **DBSCAN:** Detects core clusters and outliers in stock prices.
    - **Gaussian Mixture Model (GMM):** Uses probabilistic clustering for stock analysis.

    **Instructions:**
    - Select a clustering model from the sidebar.
    - View the clustering results, including cluster assignments, distributions, and visualizations.
    """)

elif page == "Dataset":
    st.title("Stock Market Dataset")

    st.write("### Dataset Preview:")
    st.dataframe(df[["Open", "High", "Low", "Close"]].head(), use_container_width=True)

    st.write("### Full Dataset:")
    st.dataframe(df, use_container_width=True)

    st.write("### Dataset Summary:")
    # st.dataframe, not st.write: use_container_width is a st.dataframe
    # option, while extra keyword arguments to st.write are unused/deprecated.
    st.dataframe(df_numeric.describe(), use_container_width=True)

    st.write("### Correlation Heatmap:")
    fig, ax = plt.subplots()
    sns.heatmap(df_numeric.corr(), annot=True, cmap="coolwarm", ax=ax)
    _show_figure(fig)

elif page == "K-Means":
    st.title("K-Means Clustering")
    # Explicit None check: load_model returns None for a missing file, and
    # truthiness of an arbitrary model object is not a reliable signal.
    if kmeans is not None:
        clusters = kmeans.predict(df_scaled)
        _show_cluster_tables("KMeans Cluster", clusters)

        silhouette = silhouette_score(df_scaled, clusters)
        st.write(f"Silhouette Score: {silhouette:.4f}")

        _show_cluster_scatter(clusters)

elif page == "Hierarchical":
    st.title("Hierarchical Clustering")

    # The linkage is computed here on every render (Ward linkage, cut into
    # at most 4 flat clusters); no persisted model is used on this page.
    linked = linkage(df_scaled, method='ward')
    clusters = fcluster(linked, 4, criterion='maxclust')
    _show_cluster_tables("Hierarchical Cluster", clusters)

    # Dendrogram truncated to the top 5 levels to keep it readable.
    fig, ax = plt.subplots(figsize=(10, 5))
    dendrogram(linked, truncate_mode='level', p=5, ax=ax)
    _show_figure(fig)

    _show_cluster_scatter(clusters)

elif page == "DBSCAN":
    st.title("DBSCAN Clustering")
    if dbscan is not None:
        clusters = dbscan.fit_predict(df_scaled)
        _show_cluster_tables("DBSCAN Cluster", clusters)

        # Spread the real clusters over the whole colormap. Dividing by the
        # actual cluster count (instead of a fixed 4) keeps colours distinct
        # when more than five clusters are found. Label -1 is drawn in red.
        viridis = sns.color_palette("viridis", as_cmap=True)
        cluster_ids = sorted(int(label) for label in set(clusters) if label != -1)
        span = max(len(cluster_ids) - 1, 1)
        palette = {
            label: viridis(rank / span) for rank, label in enumerate(cluster_ids)
        }
        palette[-1] = "red"

        _show_cluster_scatter(clusters, palette=palette)

elif page == "GMM":
    st.title("Gaussian Mixture Model (GMM)")
    if gmm is not None:
        # Use only as many leading scaled columns as the fitted mixture has
        # dimensions (taken from the shape of its component means).
        expected_features = gmm.means_.shape[1]
        df_gmm = df_scaled[:, :expected_features]
        clusters = gmm.predict(df_gmm)
        _show_cluster_tables("GMM Cluster", clusters)

        silhouette = silhouette_score(df_gmm, clusters)
        st.write(f"Silhouette Score: {silhouette:.4f}")

        _show_cluster_scatter(clusters)
|
|