"""Streamlit app for exploring the Colelithiasis dataset and running pre-trained models.

Pages: introduction, descriptive statistics, interactive charts, model
evaluation on a held-out test file, a single-patient prediction form, and a
conclusion. All data files and pickled artifacts are read from the
``Model Training`` directory relative to the working directory.
"""

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from plotly import graph_objects as go
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

st.set_page_config(layout="wide")

# Directory holding the dataset, the test split and every pickled artifact.
# Paths are built with pathlib: the previous hand-written Windows paths such as
# 'Model Training\rf_model.pkl' embedded a carriage return ("\r") and could
# never resolve, and backslash separators do not work outside Windows.
ARTIFACT_DIR = Path("Model Training")

TARGET_COLUMN = "Health_status"
TARGET_ENCODING = {"healthy": 0, "patient": 1}

CATEGORICAL_COLUMNS = ["Gender", "Family history", "Obese/non obese"]

# Feature order used when building the single-row frame on the prediction page.
FEATURE_COLUMNS = [
    "Gender",
    "Family history",
    "Height",
    "Weight",
    "BMI",
    "Obese/non obese",
    "Cholesterol",
    "Triglycerides level",
    "LDL level",
    "VLDL level",
]

# Display name shown in the select box -> pickle file inside ARTIFACT_DIR.
MODEL_FILES = {
    "SVM - Linear": "svm_model_linear.pkl",
    "SVM - Polynomial": "svm_model_poly.pkl",
    "SVM - RBF": "svm_model_rbf.pkl",
    "Random Forest": "rf_model.pkl",
    "Random Forest Boosted": "rf_boosted.pkl",
    "Logistic Regression": "lr_model.pkl",
    "GDA": "gda.pkl",
}

# classification_report keys are str(label); labels may be ints or floats
# depending on the dtypes of y and the predictions, so map both spellings.
CLASS_LABELS = {
    "0": "Negative",
    "1": "Positive",
    "0.0": "Negative",
    "1.0": "Positive",
}


def _load_artifact(filename):
    """Load a joblib-pickled object from ARTIFACT_DIR.

    NOTE: joblib/pickle files execute code on load; only ship trusted artifacts.
    """
    return joblib.load(ARTIFACT_DIR / filename)


def _select_model():
    """Render the model select box and return the chosen pre-trained model."""
    model_choice = st.selectbox("Choose a Pre-trained Model", list(MODEL_FILES))
    return _load_artifact(MODEL_FILES[model_choice])


def _encode_target(frame):
    """Return the target column with 'healthy'/'patient' replaced by 0/1.

    The result must be assigned back by the caller: calling
    ``frame[col].replace(..., inplace=True)`` operates on a temporary under
    pandas Copy-on-Write and leaves the frame unchanged.
    """
    return frame[TARGET_COLUMN].replace(TARGET_ENCODING)


# Load Dataset
def load_data():
    """Read the dataset from Excel and drop the 'Patient No.' identifier column."""
    data = pd.read_excel(ARTIFACT_DIR / "colelithiasis_dataset.xlsx")  # Update with your dataset file path
    return data.drop(columns="Patient No.")


# Initialize Session State (the dataset is read once per browser session)
if "data" not in st.session_state:
    st.session_state.data = load_data()


def introduction_page():
    """Render the project overview and objectives."""
    st.title("Introduction")
    st.markdown("""
    ## Project Overview
    This project analyzes the Colelithiasis dataset to perform exploratory data analysis (EDA) and prediction using pre-trained machine learning models. The goal is to provide insights into the data and make predictions efficiently.

    ## Objectives
    - Perform EDA to uncover patterns and insights.
    - Use pre-trained machine learning models for predictions.
    - Create an interactive Streamlit application.
    """)


def stats_page():
    """Render the dataset preview, summary statistics and correlation heatmap."""
    st.title("Exploratory Data Analysis")

    # Dataset Overview
    st.subheader("Dataset Overview")
    st.dataframe(st.session_state.data.head())

    # Summary Statistics
    st.subheader("Summary Statistics")
    st.write(st.session_state.data.describe())

    # Correlation Matrix
    st.subheader("Correlation Analysis")

    # Work on a copy so the session dataset keeps its original labels.
    data = st.session_state.data.copy()
    # Encode the target variable.
    data[TARGET_COLUMN] = _encode_target(data)
    # Apply the saved ordinal encoder to the categorical columns.
    encoder = _load_artifact("encoder.pkl")
    data[CATEGORICAL_COLUMNS] = encoder.transform(data[CATEGORICAL_COLUMNS])

    correlation = data.corr()
    fig = plt.figure(figsize=(5, 3))
    # Reduce the font size of the heatmap.
    sns.set(font_scale=0.5)
    sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f")
    st.pyplot(fig, use_container_width=False)
    # Streamlit re-runs the script on every interaction; close the figure so
    # matplotlib does not accumulate open figures.
    plt.close(fig)


def eda_page():
    """Render an interactive histogram, scatter plot or box plot of the dataset."""
    st.title("Exploratory Data Analysis")

    # Interactive Visualizations
    st.subheader("Visualizations")
    chart_type = st.selectbox("Choose Chart Type", ["Histogram", "Scatter Plot", "Box Plot"])
    data = st.session_state.data
    centered_legend = dict(title="Legend", orientation="h", x=0.5, xanchor="center")

    if chart_type == "Histogram":
        column = st.selectbox("Choose Column for Visualization", data.columns)
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=data[column], name=column, marker_color="indigo"))
        fig.update_layout(
            title=dict(text="Histogram Analysis", x=0.5, font=dict(size=22)),
            xaxis_title=column,
            yaxis_title="Count",
            legend=centered_legend,
            bargap=0.2,
            hovermode="x unified",
            template="plotly_dark",
        )
        st.plotly_chart(fig)

    elif chart_type == "Scatter Plot":
        x_col = st.selectbox("Choose X-axis Column", data.columns)
        y_col = st.selectbox("Choose Y-axis Column", data.columns)
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=data[x_col],
            y=data[y_col],
            mode="markers",
            marker=dict(size=10, color="purple", line=dict(width=1, color="white")),
            name=f"{y_col} vs {x_col}",
        ))
        fig.update_layout(
            title=dict(text="Scatter Plot Analysis", x=0.5, font=dict(size=22)),
            xaxis_title=x_col,
            yaxis_title=y_col,
            legend=centered_legend,
            hovermode="closest",
            template="plotly_dark",
        )
        st.plotly_chart(fig)

    elif chart_type == "Box Plot":
        column = st.selectbox("Choose Column for Visualization", data.columns)
        fig = go.Figure()
        fig.add_trace(go.Box(
            y=data[column],
            name=column,
            boxmean="sd",
            marker_color="teal",
        ))
        fig.update_layout(
            title=dict(text="Boxplot Analysis", x=0.5, font=dict(size=22)),
            yaxis_title=column,
            legend=centered_legend,
            hovermode="y",
            template="plotly_dark",
        )
        st.plotly_chart(fig)


def model_page():
    """Evaluate a selected pre-trained model on the held-out test file.

    Shows the actual vs. predicted labels, a classification report and a
    confusion matrix.
    """
    st.title("Model Evaluation")
    test_data = pd.read_excel(ARTIFACT_DIR / "test_data.xlsx")

    # Encode the target variable.
    test_data[TARGET_COLUMN] = _encode_target(test_data)

    # Apply the saved ordinal encoder to the categorical columns.
    encoder = _load_artifact("encoder.pkl")
    X = test_data.drop(columns=[TARGET_COLUMN])
    X[CATEGORICAL_COLUMNS] = encoder.transform(X[CATEGORICAL_COLUMNS])
    y = test_data[TARGET_COLUMN]

    # Apply the saved standard scaler to the numerical features in X.
    numerical_columns = [
        col_name for col_name in X.columns if col_name not in CATEGORICAL_COLUMNS
    ]
    scaler = _load_artifact("scaler.pkl")
    X[numerical_columns] = scaler.transform(X[numerical_columns])

    # Model Selection
    st.text("Model Selection")
    model = _select_model()

    # Make Predictions
    y_pred = model.predict(X)

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("### Predictions on the Test Data:")
        st.dataframe(pd.DataFrame({"Actual": y, "Predicted": y_pred}))

    with col2:
        st.subheader("Classification Report")
        report = classification_report(y, y_pred, output_dict=True)
        report_df = (
            pd.DataFrame(report)
            .transpose()
            .drop(columns="support")
            .rename_axis("index")
            .rename(index=CLASS_LABELS)
            # Cast to object before writing '' below: assigning a string into
            # a float column is deprecated (and later an error) in pandas.
            .astype(object)
        )
        # The 'accuracy' row repeats one number in every column; blank the
        # first two so it is shown only once (under f1-score).
        report_df.loc["accuracy", ["precision", "recall"]] = ""
        st.table(report_df)

    st.subheader("Confusion Matrix")
    conf_matrix = confusion_matrix(y, y_pred)

    # Generate text annotations for the confusion matrix.
    text_annotations = np.array([[str(value) for value in row] for row in conf_matrix])

    col1, col2 = st.columns(2)
    with col1:
        # Create the heatmap using seaborn.
        fig = plt.figure(figsize=(3, 3))
        sns.heatmap(
            conf_matrix,
            annot=text_annotations,
            fmt="",
            cmap="Blues",
            cbar=False,
            square=True,
        )
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix")
        st.pyplot(fig)
        plt.close(fig)


def prediction_page():
    """Collect one patient's measurements in a form and show the model's diagnosis."""
    st.title("Get Your Diagnosis")
    st.subheader("Symptoms Entry Form")

    # Model Selection
    model = _select_model()

    with st.form(key="health_data_form"):
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            # Categorical features use dropdown selection.
            gender = st.selectbox("Gender", ["Male", "Female"], key="gender")
            weight = st.number_input("Weight (kg)", min_value=0, step=1, key="weight")
            cholesterol = st.number_input("Cholesterol (mg/dL)", min_value=0, step=1, key="cholesterol")

        with col2:
            family_history = st.selectbox("Family History of Illness", ["Yes", "No"], key="family_history")
            bmi = st.number_input("BMI", min_value=0.0, step=0.1, key="bmi")
            triglycerides = st.number_input("Triglycerides Level (mg/dL)", min_value=0, step=1, key="triglycerides")

        with col3:
            height = st.number_input("Height (cm)", min_value=0.0, step=0.1, key="height")
            obese_status = st.selectbox("Obese/Non Obese", ["Obese", "Non-Obese"], key="obese_status")
            ldl = st.number_input("LDL Level (mg/dL)", min_value=0.0, step=0.1, key="ldl")

        with col4:
            vldl = st.number_input("VLDL Level (mg/dL)", min_value=0.0, step=0.1, key="vldl")

        # Submit button
        submit_button = st.form_submit_button(label="Submit")

    if submit_button:
        # Build a single-row frame in the expected feature order. The key for
        # triglycerides must be "Triglycerides level": the earlier
        # "Triglycerides" key was discarded by the reindex to the feature
        # list and replaced with 0, so the entered value never reached the model.
        data = pd.DataFrame(
            {
                "Gender": [gender],
                "Family history": [family_history],
                "Height": [height],
                "Weight": [weight],
                "BMI": [bmi],
                "Obese/non obese": [obese_status],
                "Cholesterol": [cholesterol],
                "Triglycerides level": [triglycerides],
                "LDL level": [ldl],
                "VLDL level": [vldl],
            },
            columns=FEATURE_COLUMNS,
        )

        numerical_columns = [
            col_name for col_name in data.columns if col_name not in CATEGORICAL_COLUMNS
        ]

        # Encode the categorical data with the saved encoder.
        encoder = _load_artifact("encoder.pkl")
        data[CATEGORICAL_COLUMNS] = encoder.transform(data[CATEGORICAL_COLUMNS])

        # Scale the numeric features with the saved scaler.
        scaler = _load_artifact("scaler.pkl")
        data[numerical_columns] = scaler.transform(data[numerical_columns])

        prediction = int(model.predict(data)[0])
        st.write(f"### Predicted Diagnosis: {'Positive' if prediction == 1 else 'Negative'}")


def conclusion_page():
    """Render the closing summary."""
    st.title("Conclusion")
    st.markdown("""
    ## Key Takeaways
    - Comprehensive EDA provides actionable insights into the data.
    - Pre-trained machine learning models allow efficient predictions.
    - The interactive app makes the analysis accessible and engaging.

    Thank you for exploring this project!
    """)


# Sidebar navigation: radio label -> page renderer.
PAGES = {
    "Introduction": introduction_page,
    "Descriptive Statistics": stats_page,
    "Data Analytics": eda_page,
    "Model Evaluation": model_page,
    "Get Your Diagnosis": prediction_page,
    "Conclusion": conclusion_page,
}

page = st.sidebar.radio("Navigation Menu", list(PAGES))
PAGES[page]()