# Streamlit app: trains several classifiers on datasets/diabetes.csv and shows
# (1) test-set metrics for the selected model, (2) the raw and processed data,
# and (3) an interactive predictor driven by sliders.
#
# NOTE: Streamlit re-executes this whole script on every widget interaction,
# so anything expensive is wrapped in a cache decorator.

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Load
@st.cache_data()
def load_data():
    """Read the diabetes CSV and return the frame used for modelling.

    NaN cells are replaced by their column mean, then the "SkinThickness"
    and "BloodPressure" columns are removed.
    """
    df = pd.read_csv("datasets/diabetes.csv")
    df.fillna(df.mean(), inplace=True)
    # Cull negative importance valued columns. (plz see notebook)
    df.drop(columns=["SkinThickness", "BloodPressure"], inplace=True)
    return df


@st.cache_resource()
def train_models(X_train, y_train):
    """Fit every classifier offered in the UI and return them keyed by display name.

    Cached as a resource so the four fits (including the probability-calibrated
    SVC) run once per distinct training set instead of on every script rerun.
    The returned estimators are shared between reruns and must only be used
    for prediction, never refitted in place.
    """
    models = {
        "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=1),
        "Decision Tree": DecisionTreeClassifier(random_state=1),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=1),
        "Support Vector Machine": SVC(probability=True, random_state=1),
    }
    for estimator in models.values():
        estimator.fit(X_train, y_train)
    return models


def show_figure(fig):
    """Render a matplotlib figure in the app, then release it.

    Figures created through pyplot stay registered until closed; without the
    close call every rerun would leave more figures alive.
    """
    st.pyplot(fig)
    plt.close(fig)


def probability_color(probability):
    """Map a predicted probability (0..1) to the CSS colour used for display."""
    if probability < 0.25:
        return "green"
    if probability < 0.50:
        return "#FFFF99"
    if probability < 0.75:
        return "yellow"
    if probability < 0.90:
        return "orange"
    return "red"


df = load_data()

# Preprocess
X = df.drop(columns=["Outcome"])
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Models + Train (cached, see train_models)
models = train_models(X_train, y_train)

# App
st.title("Classification: Diabetes Prediction")
st.caption("dataset: https://www.kaggle.com/code/mvanshika/diabetes-prediction/input -> diabetes.csv")
selected_model = st.radio("Model:", list(models.keys()), index=0)
model = models[selected_model]
y_pred = model.predict(X_test)
tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Diabetes Predictor"])

with tab1:
    # Model Assessment
    st.header("Model Performance")
    st.write(f"**Accuracy:** {accuracy_score(y_test, y_pred):.2f}")
    st.write(f"**Precision:** {precision_score(y_test, y_pred):.2f}")
    st.write(f"**Recall:** {recall_score(y_test, y_pred):.2f}")
    st.write(f"**F1 Score:** {f1_score(y_test, y_pred):.2f}")

    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues", ax=ax)
    show_figure(fig)

    st.subheader("Feature Importances")
    # Plot feature importance when the selected model exposes one.
    if hasattr(model, "coef_"):
        # Linear models expose 'coef_'. Coefficients can be negative, so rank
        # them by absolute value. (plz see notebook)
        importance_values = np.abs(model.coef_[0])
    elif hasattr(model, "feature_importances_"):
        # Tree-based models expose 'feature_importances_'; use it as is.
        importance_values = model.feature_importances_
    else:
        importance_values = None

    if importance_values is not None:
        feature_importance = pd.DataFrame({
            "Feature": X.columns,
            "Importance": importance_values,
        }).sort_values(by="Importance", ascending=False)
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(x="Importance", y="Feature", data=feature_importance, palette="coolwarm", ax=ax)
        ax.set_xlabel("Importance")
        ax.set_title("Feature Importance in Diabetes Prediction")
        show_figure(fig)
    else:
        st.write("Feature importance not available for this model.")
    st.divider()

with tab2:
    # Dataset
    st.header("Dataset")

    @st.cache_data()
    def load():
        """Return the CSV exactly as stored on disk (no cleaning applied)."""
        return pd.read_csv("datasets/diabetes.csv")

    dataset = load()
    dataset_processed = df

    def corr(data, title):
        """Draw an annotated correlation heatmap of the numeric columns of `data`."""
        data = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(data.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
        ax.set_title(title)
        show_figure(fig)

    corr(dataset, "Correlation Matrix")

    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

    # Show both frames in file order or reversed.
    if view_type == "Top -> Bottom":
        st.caption("datasets/diabetes.csv")
        st.dataframe(dataset)
        st.caption("df")
        st.dataframe(dataset_processed)
    elif view_type == "Bottom -> Top":
        st.caption("datasets/diabetes.csv")
        st.dataframe(dataset.iloc[::-1])
        st.caption("df")
        st.dataframe(dataset_processed.iloc[::-1])
    st.divider()

with tab3:
    # Predictor
    st.header("Diabetes Prediction")

    def set_character(gender, age, glucose, bmi, pregnancies, insulin, diabetespedigreefunction):
        """Button callback: store a preset's values in session state.

        The input widgets below read these keys as their default values, so
        the next rerun shows the preset.
        """
        st.session_state.gender = gender
        st.session_state.age = age
        st.session_state.glucose = glucose
        st.session_state.bmi = bmi
        st.session_state.pregnancies = pregnancies
        st.session_state.insulin = insulin
        st.session_state.diabetespedigreefunction = diabetespedigreefunction

    # Celebrity / Made-up Characters Selection
    with st.expander("For reference: try a preset person. **click here**"):
        col1, col2 = st.columns(2)
        with col1:
            st.button("Tom Hanks (Diabetic)", on_click=set_character, args=("Male", 65, 150, 28.0, 0, 200, 0.6))
            st.button("Halle Berry (Diabetic)", on_click=set_character, args=("Female", 50, 160, 26.5, 2, 180, 0.8))
        with col2:
            st.button("Chris Hemsworth (Healthy)", on_click=set_character, args=("Male", 40, 90, 24.0, 0, 80, 0.4))
            st.button("Emma Watson (Healthy)", on_click=set_character, args=("Female", 33, 85, 22.0, 0, 75, 0.3))
        st.caption("(These aren't real people, just coincidental names)")

    # Input
    # Hide Pregnancies slider if male
    gender_options = ["Male", "Female"]
    gender = st.radio(
        "Gender:",
        gender_options,
        index=gender_options.index(st.session_state.get("gender", "Male")),
    )
    if gender == "Female":
        pregnancies = st.slider("Pregnancies", 0, 20, st.session_state.get("pregnancies", 0))  # Default 0
    else:
        pregnancies = 0  # Males can't get pregnant
    glucose = st.slider("Glucose Level", 50, 200, st.session_state.get("glucose", 90))
    # Unit label corrected: "mm Hg" is a pressure unit.
    # NOTE(review): assumes the Insulin column is serum insulin in mu U/ml — confirm against the dataset description.
    insulin = st.slider("Insulin (mu U/ml)", 5, 400, st.session_state.get("insulin", 20))
    bmi = st.slider("BMI", 10.0, 50.0, st.session_state.get("bmi", 23.5))
    diabetespedigreefunction = st.slider(
        "Diabetes Pedigree Function", 0.0, 2.5, st.session_state.get("diabetespedigreefunction", 0.5)
    )
    age = st.slider("Age", 10, 100, st.session_state.get("age", 25))

    # The scaler was fitted on a DataFrame, so hand it a frame with the same
    # column labels; a bare array makes scikit-learn warn about missing
    # feature names. Values are listed in the order the original code assumed
    # for the training columns.
    input_data = pd.DataFrame(
        [[pregnancies, glucose, insulin, bmi, diabetespedigreefunction, age]],
        columns=X.columns,
    )

    # Predict and Output
    # Have to scale data cos logreg is very sensitive to data scale
    diabetes_probability = model.predict_proba(scaler.transform(input_data))[0][1]

    # Dynamic color coding based on probability
    color = probability_color(diabetes_probability)

    # Display result with color. `color` only ever comes from the fixed
    # palette in probability_color, so no user input reaches the HTML.
    st.subheader("Diabetes Chance")
    st.markdown(
        f"<h1 style='text-align: center; color: {color};'>{diabetes_probability:.2%}</h1>",
        unsafe_allow_html=True,
    )
    st.divider()