Spaces:

allantacuelwvsu
/

diabetes_prediction

Sleeping

App Files Files Community

diabetes_prediction / app.py

allantacuelwvsu

update app.py

c17612a 18 days ago

raw

history blame contribute delete

7.56 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.linear_model import LogisticRegression
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.svm import SVC
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

	# Load
	@st.cache_data()
	def load_data():
	    """Read the diabetes CSV, impute missing values and drop unused columns.

	    Returns:
	        The DataFrame read from ``datasets/diabetes.csv`` with NaNs replaced
	        by the per-column mean and without the ``SkinThickness`` and
	        ``BloodPressure`` columns.
	    """
	    raw = pd.read_csv("datasets/diabetes.csv")
	    imputed = raw.fillna(raw.mean())
	    # These two columns are culled because of their negative importance values (see the notebook).
	    return imputed.drop(columns=["SkinThickness", "BloodPressure"])

	df = load_data()

	# Preprocess: separate the features from the "Outcome" label and hold out 20% for testing.
	X = df.drop(columns=["Outcome"])
	y = df["Outcome"]
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

	# Fit the scaler on the training split only, then apply the same scaling to the test split.
	scaler = StandardScaler()
	X_train = scaler.fit_transform(X_train)
	X_test = scaler.transform(X_test)


	@st.cache_resource()
	def train_models(features, labels):
	    """Fit every classifier offered by the app and return them keyed by display name.

	    Streamlit re-executes this script on every widget interaction; caching the
	    fitted estimators avoids refitting all four models on each rerun.

	    Args:
	        features: Scaled training features.
	        labels: Training labels aligned with ``features``.

	    Returns:
	        Dict mapping each model's display name to its fitted estimator.
	    """
	    fitted = {
	        "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=1),
	        "Decision Tree": DecisionTreeClassifier(random_state=1),
	        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=1),
	        # probability=True is required so predict_proba is available for this model.
	        "Support Vector Machine": SVC(probability=True, random_state=1),
	    }
	    for estimator in fitted.values():
	        estimator.fit(features, labels)
	    return fitted


	# Models (trained once, then served from Streamlit's resource cache)
	models = train_models(X_train, y_train)

	# App: page title, dataset credit and model picker
	st.title("Classification: Diabetes Prediction")
	st.caption("dataset: https://www.kaggle.com/code/mvanshika/diabetes-prediction/input -> diabetes.csv")
	model_names = list(models)
	selected_model = st.radio("Model:", model_names, index=0)
	model = models[selected_model]

	# Test-set predictions of the selected model, used by the performance tab.
	y_pred = model.predict(X_test)

	tab_labels = ["Model Performance", "Dataset", "Diabetes Predictor"]
	tab1, tab2, tab3 = st.tabs(tab_labels)

	with tab1:
	    # Model Assessment: metrics and confusion matrix of the selected model on the test split.
	    st.header("Model Performance")

	    st.write(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
	    st.write(f"Precision: {precision_score(y_test, y_pred):.2f}")
	    st.write(f"Recall: {recall_score(y_test, y_pred):.2f}")
	    st.write(f"F1 Score: {f1_score(y_test, y_pred):.2f}")

	    fig, ax = plt.subplots()
	    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues", ax=ax)
	    st.pyplot(fig)
	    # pyplot keeps every figure alive until it is closed, and this script
	    # reruns on each interaction, so close the figure once it is rendered.
	    plt.close(fig)

	    st.subheader("Feature Importances")

	    # Pick whichever importance measure the selected model exposes, if any.
	    importance_values = None
	    if hasattr(model, "coef_"):
	        # Linear models expose 'coef_'; coefficients can be negative, so rank
	        # them by absolute value (see the notebook).
	        importance_values = np.abs(model.coef_[0])
	    elif hasattr(model, "feature_importances_"):
	        # Tree-based models expose 'feature_importances_'; use it as is.
	        importance_values = model.feature_importances_

	    if importance_values is None:
	        feature_importance = None
	        st.write("Feature importance not available for this model.")
	    else:
	        feature_importance = pd.DataFrame({
	            "Feature": X.columns,
	            "Importance": importance_values,
	        }).sort_values(by="Importance", ascending=False)

	        fig, ax = plt.subplots(figsize=(6, 4))
	        sns.barplot(x="Importance", y="Feature", data=feature_importance, palette="coolwarm", ax=ax)
	        ax.set_xlabel("Importance")
	        ax.set_title("Feature Importance in Diabetes Prediction")
	        st.pyplot(fig)
	        plt.close(fig)

	    st.divider()

	with tab2:
	    # Dataset: correlation heatmap plus the raw and processed tables.
	    st.header("Dataset")

	    @st.cache_data()
	    def load():
	        """Return the CSV as read from disk, without the processing applied to ``df``."""
	        return pd.read_csv("datasets/diabetes.csv")

	    def corr(data, title):
	        """Render a correlation heatmap of the numeric columns of ``data``.

	        Args:
	            data: DataFrame whose numeric columns are correlated.
	            title: Title drawn above the heatmap.
	        """
	        numeric = data.select_dtypes(include=["number"])
	        fig, ax = plt.subplots(figsize=(8, 6))
	        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
	        ax.set_title(title)
	        st.pyplot(fig)
	        # Release the figure; otherwise pyplot retains one more per script rerun.
	        plt.close(fig)

	    dataset = load()
	    dataset_processed = df
	    corr(dataset, "Correlation Matrix")

	    # Toggle order
	    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

	    # Show both tables in full, reversed when "Bottom -> Top" is selected.
	    if view_type == "Bottom -> Top":
	        raw_view = dataset.iloc[::-1]
	        processed_view = dataset_processed.iloc[::-1]
	    else:
	        raw_view = dataset
	        processed_view = dataset_processed

	    st.caption("datasets/diabetes.csv")
	    st.dataframe(raw_view)
	    st.caption("df")
	    st.dataframe(processed_view)

	    st.divider()

	with tab3:
	    # Predictor: collect one person's values and show the selected model's diabetes probability.
	    st.header("Diabetes Prediction")

	    def set_character(gender, age, glucose, bmi, pregnancies, insulin, diabetespedigreefunction):
	        """Store a preset person's values in session state.

	        The input widgets below read these session-state entries as their
	        default values, so clicking a preset button repositions them.
	        """
	        st.session_state.gender = gender
	        st.session_state.age = age
	        st.session_state.glucose = glucose
	        st.session_state.bmi = bmi
	        st.session_state.pregnancies = pregnancies
	        st.session_state.insulin = insulin
	        st.session_state.diabetespedigreefunction = diabetespedigreefunction

	    # Celebrity / Made-up Characters Selection
	    with st.expander("For reference: try a preset person. click here"):
	        col1, col2 = st.columns(2)
	        with col1:
	            st.button("Tom Hanks (Diabetic)", on_click=set_character, args=("Male", 65, 150, 28.0, 0, 200, 0.6))
	            st.button("Halle Berry (Diabetic)", on_click=set_character, args=("Female", 50, 160, 26.5, 2, 180, 0.8))
	        with col2:
	            st.button("Chris Hemsworth (Healthy)", on_click=set_character, args=("Male", 40, 90, 24.0, 0, 80, 0.4))
	            st.button("Emma Watson (Healthy)", on_click=set_character, args=("Female", 33, 85, 22.0, 0, 75, 0.3))
	        st.caption("(These aren't real people, just coincidental names)")

	    # Input
	    gender_options = ["Male", "Female"]
	    gender = st.radio("Gender:", gender_options, index=gender_options.index(st.session_state.get("gender", "Male")))
	    # The Pregnancies slider is only shown for "Female"; otherwise the value is fixed at 0.
	    if gender == "Female":
	        pregnancies = st.slider("Pregnancies", 0, 20, st.session_state.get("pregnancies", 0))  # Default 0
	    else:
	        pregnancies = 0
	    glucose = st.slider("Glucose Level", 50, 200, st.session_state.get("glucose", 90))
	    # NOTE(review): "mm Hg" is a pressure unit; the insulin column is presumably
	    # measured in mu U/ml. Confirm against the dataset description and fix the label.
	    insulin = st.slider("Insulin (mm Hg)", 5, 400, st.session_state.get("insulin", 20))
	    bmi = st.slider("BMI", 10.0, 50.0, st.session_state.get("bmi", 23.5))
	    diabetespedigreefunction = st.slider("Diabetes Pedigree Function", 0.0, 2.5, st.session_state.get("diabetespedigreefunction", 0.5))
	    age = st.slider("Age", 10, 100, st.session_state.get("age", 25))

	    # The scaler was fitted on a DataFrame, so give it a DataFrame with the same
	    # column names rather than a bare array; this avoids scikit-learn's
	    # feature-name mismatch warning. Values are listed in the training column order.
	    input_data = pd.DataFrame(
	        [[pregnancies, glucose, insulin, bmi, diabetespedigreefunction, age]],
	        columns=X.columns,
	    )

	    # Predict and Output
	    # The models were trained on scaled features, so the input is scaled the same way.
	    diabetes_probability = model.predict_proba(scaler.transform(input_data))[0][1]

	    # Dynamic color coding: the first upper bound the probability falls below wins; red otherwise.
	    color_bands = ((0.25, "green"), (0.50, "#FFFF99"), (0.75, "yellow"), (0.90, "orange"))
	    color = next((shade for upper_bound, shade in color_bands if diabetes_probability < upper_bound), "red")

	    # Display result with color
	    st.subheader("Diabetes Chance")
	    st.markdown(f"<h1 style='font-size:50px; color:{color};'>{diabetes_probability:.2%}</h1>", unsafe_allow_html=True)
	    st.divider()