# allantacuelwvsu's picture
# update app.py
# c17612a
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Load
@st.cache_data()
def load_data():
    """Read the diabetes CSV and return the cleaned modelling frame.

    Missing values are replaced by the mean of their column, after which the
    "SkinThickness" and "BloodPressure" columns are removed.

    Returns:
        pandas.DataFrame: the cleaned data set.
    """
    raw = pd.read_csv("datasets/diabetes.csv")
    filled = raw.fillna(raw.mean())
    # Cull negative importance valued columns. (plz see notebook)
    return filled.drop(columns=["SkinThickness", "BloodPressure"])
df = load_data()


# Preprocess + Models + Train
@st.cache_resource()
def train_models(data):
    """Split, scale and fit every classifier once per distinct data set.

    Streamlit re-executes the script from the top on each widget interaction;
    without caching, the train/test split, the scaler fit and all four model
    fits would be repeated on every slider move. The cached objects are
    returned by reference, so callers must treat them as read-only.

    Args:
        data: cleaned frame holding the feature columns plus "Outcome".

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test, scaler, models)``:
        the unscaled feature frame and target, the scaled train/test feature
        arrays with their targets, the fitted ``StandardScaler`` and a dict
        mapping display name -> fitted classifier.
    """
    features = data.drop(columns=["Outcome"])
    target = data["Outcome"]
    train_x, test_x, train_y, test_y = train_test_split(
        features, target, test_size=0.2, random_state=1
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split so both are scaled with the same statistics.
    fitted_scaler = StandardScaler()
    train_x = fitted_scaler.fit_transform(train_x)
    test_x = fitted_scaler.transform(test_x)

    classifiers = {
        "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=1),
        "Decision Tree": DecisionTreeClassifier(random_state=1),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=1),
        # probability=True is required for predict_proba in the predictor tab.
        "Support Vector Machine": SVC(probability=True, random_state=1),
    }
    for classifier in classifiers.values():
        classifier.fit(train_x, train_y)

    return features, target, train_x, test_x, train_y, test_y, fitted_scaler, classifiers


# Same module-level names as before, so the rest of the script is unaffected.
X, y, X_train, X_test, y_train, y_test, scaler, models = train_models(df)
# App: page header, model picker and the three tabs
st.title("Classification: Diabetes Prediction")
st.caption("dataset: https://www.kaggle.com/code/mvanshika/diabetes-prediction/input -> diabetes.csv")

# The first entry of `models` is pre-selected.
model_names = list(models)
selected_model = st.radio("Model:", model_names, index=0)
model = models[selected_model]

# Predictions of the chosen model on the scaled test split; scored in tab1.
y_pred = model.predict(X_test)

tab_labels = ["Model Performance", "Dataset", "Diabetes Predictor"]
tab1, tab2, tab3 = st.tabs(tab_labels)
with tab1:
    # Model Assessment: test-split metrics, confusion matrix, feature importances
    st.header("Model Performance")
    st.write(f"**Accuracy:** {accuracy_score(y_test, y_pred):.2f}")
    st.write(f"**Precision:** {precision_score(y_test, y_pred):.2f}")
    st.write(f"**Recall:** {recall_score(y_test, y_pred):.2f}")
    st.write(f"**F1 Score:** {f1_score(y_test, y_pred):.2f}")

    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues", ax=ax)
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed;
    # the script runs again on each interaction, so close it once rendered.
    plt.close(fig)

    st.subheader("Feature Importances")
    # Pick whichever importance attribute the selected model exposes.
    if hasattr(model, "coef_"):
        # Linear models: rank by absolute coefficient, since some are negative.
        # (plz see notebook)
        importances = np.abs(model.coef_[0])
    elif hasattr(model, "feature_importances_"):
        # Tree-based models: use the attribute as is.
        importances = model.feature_importances_
    else:
        importances = None

    if importances is None:
        feature_importance = None
        st.write("Feature importance not available for this model.")
    else:
        feature_importance = pd.DataFrame({
            "Feature": X.columns,
            "Importance": importances,
        }).sort_values(by="Importance", ascending=False)

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(x="Importance", y="Feature", data=feature_importance, palette="coolwarm", ax=ax)
        ax.set_xlabel("Importance")
        ax.set_title("Feature Importance in Diabetes Prediction")
        st.pyplot(fig)
        plt.close(fig)  # same reason as above: do not accumulate figures

    st.divider()
with tab2:
    # Dataset: correlation heatmap plus the raw and processed tables
    st.header("Dataset")

    @st.cache_data()
    def load():
        """Return the CSV as read from disk, without any cleaning."""
        return pd.read_csv("datasets/diabetes.csv")

    dataset = load()
    dataset_processed = df

    def corr(data, title):
        """Render a correlation heatmap of the numeric columns of `data`.

        Args:
            data: DataFrame to correlate; non-numeric columns are ignored.
            title: Title drawn above the heatmap.
        """
        data = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(data.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
        ax.set_title(title)
        st.pyplot(fig)
        # pyplot keeps figures registered until closed; release this one so
        # repeated script runs do not pile them up.
        plt.close(fig)

    corr(dataset, "Correlation Matrix")

    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
    reverse_rows = view_type == "Bottom -> Top"

    # Show the raw file first, then the processed frame, in the chosen order.
    for label, frame in (("datasets/diabetes.csv", dataset), ("df", dataset_processed)):
        st.caption(label)
        st.dataframe(frame.iloc[::-1] if reverse_rows else frame)

    st.divider()
with tab3:
    # Predictor: collect one person's values and show the predicted probability
    st.header("Diabetes Prediction")

    def set_character(gender, age, glucose, bmi, pregnancies, insulin, diabetespedigreefunction):
        """Store a preset person's values in session state.

        Used as the ``on_click`` callback of the preset buttons; the input
        widgets below read these entries as their default values.
        """
        st.session_state.gender = gender
        st.session_state.age = age
        st.session_state.glucose = glucose
        st.session_state.bmi = bmi
        st.session_state.pregnancies = pregnancies
        st.session_state.insulin = insulin
        st.session_state.diabetespedigreefunction = diabetespedigreefunction

    # Celebrity / Made-up Characters Selection
    with st.expander("For reference: try a preset person. **click here**"):
        col1, col2 = st.columns(2)
        with col1:
            st.button("Tom Hanks (Diabetic)", on_click=set_character, args=("Male", 65, 150, 28.0, 0, 200, 0.6))
            st.button("Halle Berry (Diabetic)", on_click=set_character, args=("Female", 50, 160, 26.5, 2, 180, 0.8))
        with col2:
            st.button("Chris Hemsworth (Healthy)", on_click=set_character, args=("Male", 40, 90, 24.0, 0, 80, 0.4))
            st.button("Emma Watson (Healthy)", on_click=set_character, args=("Female", 33, 85, 22.0, 0, 75, 0.3))
        st.caption("(These aren't real people, just coincidental names)")

    # Input
    gender_options = ["Male", "Female"]
    gender = st.radio("Gender:", gender_options, index=gender_options.index(st.session_state.get("gender", "Male")))
    # Hide Pregnancies slider if male
    if gender == "Female":
        pregnancies = st.slider("Pregnancies", 0, 20, st.session_state.get("pregnancies", 0))  # Default 0
    else:
        pregnancies = 0  # Males can't get pregnant
    glucose = st.slider("Glucose Level", 50, 200, st.session_state.get("glucose", 90))
    # NOTE(review): "mm Hg" is a pressure unit and looks wrong for insulin;
    # confirm the dataset's unit and correct this label if so.
    insulin = st.slider("Insulin (mm Hg)", 5, 400, st.session_state.get("insulin", 20))
    bmi = st.slider("BMI", 10.0, 50.0, st.session_state.get("bmi", 23.5))
    diabetespedigreefunction = st.slider("Diabetes Pedigree Function", 0.0, 2.5, st.session_state.get("diabetespedigreefunction", 0.5))
    age = st.slider("Age", 10, 100, st.session_state.get("age", 25))

    # The scaler was fitted on a DataFrame, so pass it a DataFrame carrying
    # the same column labels. A bare ndarray makes scikit-learn warn about
    # missing feature names on every prediction. Values are positional and
    # must stay in the order of X.columns.
    input_data = pd.DataFrame(
        [[pregnancies, glucose, insulin, bmi, diabetespedigreefunction, age]],
        columns=X.columns,
    )

    # Predict and Output
    # Have to scale data cos logreg is very sensitive to data scale
    diabetes_probability = model.predict_proba(scaler.transform(input_data))[0][1]

    # Dynamic color coding based on probability: first band whose upper bound
    # exceeds the probability wins; anything at or above 0.90 is red.
    color = "red"
    for upper_bound, band_color in ((0.25, "green"), (0.50, "#FFFF99"), (0.75, "yellow"), (0.90, "orange")):
        if diabetes_probability < upper_bound:
            color = band_color
            break

    # Display result with color
    st.subheader("Diabetes Chance")
    st.markdown(f"<h1 style='font-size:50px; color:{color};'>{diabetes_probability:.2%}</h1>", unsafe_allow_html=True)
    st.divider()