|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.tree import DecisionTreeClassifier |
|
from sklearn.ensemble import RandomForestClassifier |
|
from sklearn.svm import SVC |
|
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix |
|
|
|
|
|
@st.cache_data()
def load_data():
    """Read the diabetes CSV and return a cleaned frame for modelling.

    Missing values are replaced by the mean of their column, and the
    ``SkinThickness`` and ``BloodPressure`` columns are removed.

    Returns:
        pandas.DataFrame: the cleaned dataset.
    """
    raw = pd.read_csv("datasets/diabetes.csv")
    filled = raw.fillna(raw.mean())
    return filled.drop(columns=["SkinThickness", "BloodPressure"])
|
|
|
# Fetch the cleaned dataset and separate the target column from the predictors.
df = load_data()
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
|
|
|
|
|
@st.cache_resource()
def _fit_models(features, labels):
    """Build and fit every classifier offered in the app.

    Streamlit re-executes the whole script on each widget interaction, so
    fitting at module level would retrain all four models on every rerun.
    Caching the fitted estimators as a resource means they are trained once
    and reused afterwards.

    Args:
        features: Scaled training feature matrix.
        labels: Training target values aligned with ``features``.

    Returns:
        dict: display name -> fitted estimator, in presentation order.
    """
    estimators = {
        "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=1),
        "Decision Tree": DecisionTreeClassifier(random_state=1),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=1),
        # probability=True is required so predict_proba is available on the SVM.
        "Support Vector Machine": SVC(probability=True, random_state=1),
    }
    for estimator in estimators.values():
        estimator.fit(features, labels)
    return estimators


models = _fit_models(X_train, y_train)
|
|
|
|
|
# Page heading and data-source attribution.
st.title("Classification: Diabetes Prediction")
st.caption("dataset: https://www.kaggle.com/code/mvanshika/diabetes-prediction/input -> diabetes.csv")

# Let the user choose one of the fitted classifiers; the first entry is preselected.
selected_model = st.radio("Model:", list(models), index=0)
model = models[selected_model]

# Predictions of the chosen model on the held-out rows.
y_pred = model.predict(X_test)

# One tab per section of the app.
tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Diabetes Predictor"])
|
|
|
with tab1:
    st.header("Model Performance")

    # Headline classification metrics for the selected model on the test split.
    metric_functions = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1 Score": f1_score,
    }
    for metric_name, metric_function in metric_functions.items():
        st.write(f"**{metric_name}:** {metric_function(y_test, y_pred):.2f}")

    # Confusion matrix heatmap.
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues", ax=ax)
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; without this the
    # script would accumulate one more open figure on each rerun.
    plt.close(fig)

    st.subheader("Feature Importances")

    # Linear models expose coefficients, tree-based models expose
    # feature_importances_; anything else has no importance to show.
    if hasattr(model, "coef_"):
        importances = np.abs(model.coef_[0])
    elif hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
    else:
        importances = None

    if importances is None:
        feature_importance = None
        st.write("Feature importance not available for this model.")
    else:
        feature_importance = pd.DataFrame(
            {"Feature": X.columns, "Importance": importances}
        ).sort_values(by="Importance", ascending=False)

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(x="Importance", y="Feature", data=feature_importance, palette="coolwarm", ax=ax)
        ax.set_xlabel("Importance")
        ax.set_title("Feature Importance in Diabetes Prediction")
        st.pyplot(fig)
        plt.close(fig)

    st.divider()
|
|
|
with tab2:
    st.header("Dataset")

    @st.cache_data()
    def load():
        """Return the raw, unprocessed contents of the diabetes CSV."""
        return pd.read_csv("datasets/diabetes.csv")

    dataset = load()
    dataset_processed = df

    def corr(data, title):
        """Render a correlation heatmap for the numeric columns of ``data``.

        Args:
            data: DataFrame whose numeric columns are correlated.
            title: Text shown above the heatmap.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
        ax.set_title(title)
        st.pyplot(fig)
        # Release the figure so open figures do not pile up across reruns.
        plt.close(fig)

    corr(dataset, "Correlation Matrix")

    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

    # Show the raw file followed by the processed frame, in the chosen row order.
    if view_type == "Top -> Bottom":
        st.caption("datasets/diabetes.csv")
        st.dataframe(dataset)
        st.caption("df")
        st.dataframe(dataset_processed)
    elif view_type == "Bottom -> Top":
        st.caption("datasets/diabetes.csv")
        st.dataframe(dataset.iloc[::-1])
        st.caption("df")
        st.dataframe(dataset_processed.iloc[::-1])

    st.divider()
|
|
|
with tab3:
    st.header("Diabetes Prediction")

    def set_character(gender, age, glucose, bmi, pregnancies, insulin, diabetespedigreefunction):
        """Store a preset profile in session state.

        Used as a button callback; the input widgets below read these
        session-state entries as their initial values.
        """
        preset = {
            "gender": gender,
            "age": age,
            "glucose": glucose,
            "bmi": bmi,
            "pregnancies": pregnancies,
            "insulin": insulin,
            "diabetespedigreefunction": diabetespedigreefunction,
        }
        for key, value in preset.items():
            st.session_state[key] = value

    with st.expander("For reference: try a preset person. **click here**"):
        col1, col2 = st.columns(2)
        with col1:
            st.button("Tom Hanks (Diabetic)", on_click=set_character, args=("Male", 65, 150, 28.0, 0, 200, 0.6))
            st.button("Halle Berry (Diabetic)", on_click=set_character, args=("Female", 50, 160, 26.5, 2, 180, 0.8))
        with col2:
            st.button("Chris Hemsworth (Healthy)", on_click=set_character, args=("Male", 40, 90, 24.0, 0, 80, 0.4))
            st.button("Emma Watson (Healthy)", on_click=set_character, args=("Female", 33, 85, 22.0, 0, 75, 0.3))
        st.caption("(These aren't real people, just coincidental names)")

    gender_options = ["Male", "Female"]
    gender = st.radio(
        "Gender:",
        gender_options,
        index=gender_options.index(st.session_state.get("gender", "Male")),
    )

    # The pregnancies slider is only offered for the "Female" choice; otherwise the feature is 0.
    if gender == "Female":
        pregnancies = st.slider("Pregnancies", 0, 20, st.session_state.get("pregnancies", 0))
    else:
        pregnancies = 0

    glucose = st.slider("Glucose Level", 50, 200, st.session_state.get("glucose", 90))
    # Label fix: the original said "mm Hg", which is a blood-pressure unit;
    # serum insulin is reported in mu U/ml.
    insulin = st.slider("Insulin (mu U/ml)", 5, 400, st.session_state.get("insulin", 20))
    bmi = st.slider("BMI", 10.0, 50.0, st.session_state.get("bmi", 23.5))
    diabetespedigreefunction = st.slider(
        "Diabetes Pedigree Function", 0.0, 2.5, st.session_state.get("diabetespedigreefunction", 0.5)
    )
    age = st.slider("Age", 10, 100, st.session_state.get("age", 25))

    # Build the single-row input with the training column names. The scaler was
    # fitted on a DataFrame, so a bare ndarray makes scikit-learn warn about
    # missing feature names on every prediction.
    input_data = pd.DataFrame(
        [[pregnancies, glucose, insulin, bmi, diabetespedigreefunction, age]],
        columns=X.columns,
    )

    # Probability of the positive class (second column of predict_proba).
    diabetes_probability = model.predict_proba(scaler.transform(input_data))[0][1]

    # Upper bound (exclusive) -> display colour; anything at or above the last bound is red.
    risk_colors = (
        (0.25, "green"),
        (0.50, "#FFFF99"),
        (0.75, "yellow"),
        (0.90, "orange"),
    )
    color = next(
        (shade for upper_bound, shade in risk_colors if diabetes_probability < upper_bound),
        "red",
    )

    st.subheader("Diabetes Chance")
    st.markdown(f"<h1 style='font-size:50px; color:{color};'>{diabetes_probability:.2%}</h1>", unsafe_allow_html=True)
    st.divider()