File size: 5,084 Bytes
b797e49 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
# Page heading rendered at the top of the Streamlit app.
app_title = "๐ฉบ Diabetes Prediction App"
st.title(app_title)
# Dataset loading (wrapped in Streamlit's data cache).
@st.cache_data
def load_data():
    """Read ``diabetes_prediction_dataset.csv`` and return it as a DataFrame.

    The path is relative, so the CSV is looked up from the process's current
    working directory.
    """
    return pd.read_csv("diabetes_prediction_dataset.csv")
df = load_data()

# Encode the categorical columns as integers. The fitted encoders are kept so
# the prediction form can list the same categories and encode them identically.
label_encoders = {}
for col in ["gender", "smoking_history"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Human-readable copy for the preview tab: 0/1 flags are shown as "No"/"Yes".
binary_columns = ["hypertension", "heart_disease", "diabetes"]
df_display = df.copy()
for col in binary_columns:
    df_display[col] = df_display[col].map({0: "No", 1: "Yes"})

# Feature/target split; the target keeps its original 0/1 encoding.
X = df.drop(columns=["diabetes"])
y = df["diabetes"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


@st.cache_resource
def _fit_scaler_and_forest(train_features, train_target):
    """Fit the StandardScaler and the random forest on the training split.

    Streamlit re-executes the whole script on every widget interaction, so
    fitting at module level would retrain the 100-tree forest each time.
    Caching the fitted objects means the fit only runs when the training data
    changes.

    Args:
        train_features: Training feature frame (``X_train``).
        train_target: Training labels (``y_train``).

    Returns:
        A ``(scaler, forest)`` tuple: the fitted ``StandardScaler`` and the
        ``RandomForestClassifier`` trained on the scaled features.
    """
    fitted_scaler = StandardScaler()
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(fitted_scaler.fit_transform(train_features), train_target)
    return fitted_scaler, forest


# Standardised views of both splits, produced by the (cached) fitted scaler.
scaler, rf = _fit_scaler_and_forest(X_train, y_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Tabs
tab1, tab2, tab3 = st.tabs(["๐ Dataset Preview", "๐ Model Performance", "๐ฉบ Prediction"])

# Tab 1: dataset preview
with tab1:
    st.subheader("๐ Complete Dataset Preview")
    st.write(df_display)  # Yes/No copy of the data for readability

    st.subheader("๐ Correlation Heatmap")
    # Draw on an explicit figure instead of pyplot's global state, and close
    # it once rendered: the script re-runs on every interaction, so figures
    # left open would pile up in memory.
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=heatmap_ax)
    st.pyplot(heatmap_fig)
    plt.close(heatmap_fig)
# Tab 2: model performance
with tab2:
    st.subheader("๐ Model Performance")

    # Score the trained forest on the held-out split.
    y_pred = rf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"### โก Random Forest Accuracy: **{accuracy:.2f}**")

    # Confusion matrix. Each chart gets its own explicit figure that is closed
    # after rendering, so repeated script runs do not accumulate open figures.
    st.write("### ๐ Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred)
    class_names = ["No Diabetes", "Diabetes"]
    cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=cm_ax,
    )
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    st.pyplot(cm_fig)
    plt.close(cm_fig)

    # ROC curve, built from the predicted probability of the positive class.
    st.write("### ๐ ROC Curve")
    fpr, tpr, _ = roc_curve(y_test, rf.predict_proba(X_test_scaled)[:, 1])
    roc_auc = auc(fpr, tpr)
    roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
    roc_ax.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    # Diagonal reference line from (0, 0) to (1, 1).
    roc_ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    roc_ax.legend(loc="lower right")
    st.pyplot(roc_fig)
    plt.close(roc_fig)
# Tab 3: single-patient prediction
with tab3:
    st.subheader("๐ฉบ Make a Prediction")

    # Free-text name, only used in the result message.
    user_name = st.text_input("Patient Name", value="John Doe")

    # Categorical inputs offer exactly the categories the encoders were fit on.
    user_gender = st.selectbox("Gender", label_encoders["gender"].classes_, key="gender_input")
    user_smoking = st.selectbox("Smoking History", label_encoders["smoking_history"].classes_, key="smoking_input")

    # Collect every feature under its column name. The widgets are rendered in
    # a UI-friendly order that is not tied to the column order the scaler and
    # model were trained on, so values must not be passed by position.
    feature_values = {
        "gender": label_encoders["gender"].transform([user_gender])[0],
        "smoking_history": label_encoders["smoking_history"].transform([user_smoking])[0],
    }

    # Numeric inputs are bounded by the dataset's range and default to its mean.
    for col in ["age", "bmi", "HbA1c_level", "blood_glucose_level"]:
        feature_values[col] = st.number_input(
            f"Enter {col}",
            float(df[col].min()),
            float(df[col].max()),
            float(df[col].mean()),
        )

    # Binary inputs are asked as Yes/No and stored as 1/0, matching the data.
    user_binary_data = {}
    for col in ["hypertension", "heart_disease"]:
        user_binary_data[col] = st.radio(f"{col.replace('_', ' ').title()} (Yes/No)", ["No", "Yes"])
        feature_values[col] = 1 if user_binary_data[col] == "Yes" else 0

    # One-row frame with columns in exactly the training order (X.columns).
    # A missing feature raises KeyError here instead of silently shifting values.
    user_data = pd.DataFrame([feature_values])[list(X.columns)]

    # Predict button
    if st.button("๐ฎ Predict"):
        user_data_scaled = scaler.transform(user_data)

        # Predicted class, plus the model's probability for the positive class.
        prediction = rf.predict(user_data_scaled)
        probability = rf.predict_proba(user_data_scaled)[:, 1][0]

        # Display result with patient name
        st.subheader(f"๐ค Prediction for {user_name}")
        if prediction[0] == 1:
            st.error(f"๐จ **{user_name} is likely to have diabetes.** (Probability: {probability:.2f})")
        else:
            # This literal was split across two lines in the source (unterminated
            # string); rejoined here with its leading check-mark emoji.
            st.success(f"✅ **{user_name} is not likely to have diabetes.** (Probability: {probability:.2f})")
|