# Spaces: mr-usman / colelithiasis (Sleeping)
# File size: 12,963 Bytes
# 461222e

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from plotly import graph_objects as go

# Configure the page to use the "wide" layout.
# NOTE(review): Streamlit has historically required this to be the first
# Streamlit command executed in the script — keep it ahead of other st.* calls.
st.set_page_config(layout="wide")

# Load Dataset
def load_data():
    """Read the dataset workbook and return it without the 'Patient No.' column.

    Returns:
        pandas.DataFrame: contents of the Excel file, minus 'Patient No.'.

    Raises:
        FileNotFoundError: if the workbook is not at the expected relative path.
        KeyError: if the workbook has no 'Patient No.' column.
    """
    dataset_path = r'Model Training/colelithiasis_dataset.xlsx'  # Update with your dataset file path
    frame = pd.read_excel(dataset_path)
    return frame.drop(columns='Patient No.')

# Initialize Session State
# Load the dataset only when this session does not hold it yet, so reruns
# reuse the frame stored in session state.
if "data" not in st.session_state:
    st.session_state["data"] = load_data()

def introduction_page():
    """Render the "Introduction" page: a title followed by the overview/objectives text."""
    overview_markdown = """

    ## Project Overview

    This project analyzes the Colelithiasis dataset to perform exploratory data analysis (EDA) and prediction using pre-trained machine learning models. The goal is to provide insights into the data and make predictions efficiently.



    ## Objectives

    - Perform EDA to uncover patterns and insights.

    - Use pre-trained machine learning models for predictions.

    - Create an interactive Streamlit application.

    """
    st.title("Introduction")
    st.markdown(overview_markdown)

def stats_page():
    """Render the descriptive-statistics page.

    Shows the first rows of the session dataset, its ``describe()`` summary,
    and a correlation heatmap computed on an encoded copy of the data (the
    frame kept in session state is not modified).

    Raises:
        FileNotFoundError: if the saved encoder file is missing.
    """
    st.title("Exploratory Data Analysis")
    raw = st.session_state.data

    # Dataset Overview
    st.subheader("Dataset Overview")
    st.dataframe(raw.head())

    # Summary Statistics
    st.subheader("Summary Statistics")
    st.write(raw.describe())

    # Correlation Matrix
    st.subheader("Correlation Analysis")

    # Encode the target variable on a copy. Assign the result back instead of
    # calling replace(..., inplace=True) on the column: the chained in-place
    # form does not update the frame under pandas Copy-on-Write.
    data = raw.copy()
    data['Health_status'] = pd.to_numeric(
        data['Health_status'].replace({'healthy': 0, 'patient': 1})
    )

    # Apply the saved ordinal encoder to the categorical columns.
    # Forward slashes keep the path portable (matching load_data) and avoid
    # the invalid "\e" escape the backslash spelling produced.
    categorical_columns = ['Gender', 'Family history', 'Obese/non obese']
    encoder = joblib.load('Model Training/encoder.pkl')
    data[categorical_columns] = encoder.transform(data[categorical_columns])

    correlation = data.corr()

    # Reduce the font size of the heatmap (this is a global seaborn setting).
    sns.set(font_scale=0.5)
    # Draw on an explicit figure and close it afterwards; passing the pyplot
    # module to st.pyplot leaves one open figure behind on every rerun.
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig, use_container_width=False)
    plt.close(fig)

def eda_page():
    """Render the interactive chart page.

    The user picks a chart type (histogram, scatter plot or box plot) and the
    column(s) of the session dataset to plot; the matching Plotly figure is
    then displayed.
    """
    st.title("Exploratory Data Analysis")

    # Interactive Visualizations
    st.subheader("Visualizations")
    chart_type = st.selectbox("Choose Chart Type", ["Histogram", "Scatter Plot", "Box Plot"])

    df = st.session_state.data
    # Layout pieces shared by all three chart types.
    legend_style = dict(title="Legend", orientation="h", x=0.5, xanchor="center")

    def centered_title(text):
        # Title dict centred horizontally with the common font size.
        return dict(text=text, x=0.5, font=dict(size=22))

    if chart_type == "Histogram":
        column = st.selectbox("Choose Column for Visualization", df.columns)
        histogram = go.Histogram(x=df[column], name=column, marker_color="indigo")
        fig = go.Figure(data=[histogram])
        fig.update_layout(
            title=centered_title("Histogram Analysis"),
            xaxis_title=column,
            yaxis_title="Count",
            legend=legend_style,
            bargap=0.2,
            hovermode="x unified",
            template="plotly_dark",
        )
        st.plotly_chart(fig)

    elif chart_type == "Scatter Plot":
        x_col = st.selectbox("Choose X-axis Column", df.columns)
        y_col = st.selectbox("Choose Y-axis Column", df.columns)
        marker_style = dict(size=10, color="purple", line=dict(width=1, color="white"))
        points = go.Scatter(
            x=df[x_col],
            y=df[y_col],
            mode="markers",
            marker=marker_style,
            name=f"{y_col} vs {x_col}",
        )
        fig = go.Figure(data=[points])
        fig.update_layout(
            title=centered_title("Scatter Plot Analysis"),
            xaxis_title=x_col,
            yaxis_title=y_col,
            legend=legend_style,
            hovermode="closest",
            template="plotly_dark",
        )
        st.plotly_chart(fig)

    elif chart_type == "Box Plot":
        column = st.selectbox("Choose Column for Visualization", df.columns)
        box = go.Box(y=df[column], name=column, boxmean="sd", marker_color="teal")
        fig = go.Figure(data=[box])
        fig.update_layout(
            title=centered_title("Boxplot Analysis"),
            yaxis_title=column,
            legend=legend_style,
            hovermode="y",
            template="plotly_dark",
        )
        st.plotly_chart(fig)


def model_page():
    """Render the model-evaluation page.

    Loads the test workbook, applies the saved encoder and scaler, lets the
    user choose one of the pre-trained models, and shows its predictions, a
    classification report and a confusion matrix for the test data.

    Raises:
        FileNotFoundError: if the test workbook or any saved artefact is missing.
    """
    st.title("Model Evaluation")
    # Forward slashes keep every path portable (matching load_data). The
    # backslash spellings were Windows-only, and in the non-raw literals
    # "\r" (rf_model / rf_boosted) was a carriage return, corrupting the path.
    test_data = pd.read_excel('Model Training/test_data.xlsx')

    # Encode the target variable. Assign the result back instead of using
    # replace(..., inplace=True) on the column, which does not update the
    # frame under pandas Copy-on-Write.
    test_data['Health_status'] = pd.to_numeric(
        test_data['Health_status'].replace({'healthy': 0, 'patient': 1})
    )

    # Apply the saved ordinal encoder to the categorical columns.
    categorical_columns = ['Gender', 'Family history', 'Obese/non obese']
    encoder = joblib.load('Model Training/encoder.pkl')

    X = test_data.drop(columns=['Health_status'])
    X[categorical_columns] = encoder.transform(X[categorical_columns])
    y = test_data['Health_status']

    # Apply the saved scaler to every non-categorical feature in X.
    numerical_columns = [col_name for col_name in X.columns if col_name not in categorical_columns]
    scaler = joblib.load('Model Training/scaler.pkl')
    X[numerical_columns] = scaler.transform(X[numerical_columns])

    # Model Selection: display name -> saved model file.
    st.text("Model Selection")
    model_files = {
        "SVM - Linear": 'Model Training/svm_model_linear.pkl',
        "SVM - Polynomial": 'Model Training/svm_model_poly.pkl',
        "SVM - RBF": 'Model Training/svm_model_rbf.pkl',
        "Random Forest": 'Model Training/rf_model.pkl',
        "Random Forest Boosted": 'Model Training/rf_boosted.pkl',
        "Logistic Regression": 'Model Training/lr_model.pkl',
        "GDA": 'Model Training/gda.pkl',
    }
    model_choice = st.selectbox("Choose a Pre-trained Model", list(model_files))

    # Load pre-trained model
    model_path = model_files.get(model_choice)
    model = joblib.load(model_path) if model_path is not None else None

    # Compare against None explicitly: truthiness of an estimator object is
    # not a reliable "was a model loaded" check.
    if model is None:
        return

    # Make Predictions
    y_pred = model.predict(X)
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("### Predictions on the Test Data:")
        st.dataframe(pd.DataFrame({"Actual": y, "Predicted": y_pred}))

    with col2:
        st.subheader("Classification Report")
        report = classification_report(y, y_pred, output_dict=True)
        report_df = pd.DataFrame(report).transpose().drop(columns='support')
        report_df = report_df.rename_axis('index')
        # The class labels appear as '0'/'1' or '0.0'/'1.0' depending on the
        # dtypes of y and y_pred, so map both spellings.
        report_df = report_df.rename(index={
            '0': 'Negative', '1': 'Positive',
            '0.0': 'Negative', '1.0': 'Positive',
        })
        # Blank the first two cells of the 'accuracy' row. Cast to object
        # first: writing '' into float columns is a dtype-incompatible
        # assignment that newer pandas rejects.
        report_df = report_df.astype(object)
        report_df.loc['accuracy', report_df.columns[:2]] = ''
        st.table(report_df)

    st.subheader("Confusion Matrix")
    conf_matrix = confusion_matrix(y, y_pred)
    # Generate text annotations for the confusion matrix
    text_annotations = np.array([[str(value) for value in row] for row in conf_matrix])

    col1, _ = st.columns(2)
    with col1:
        # Draw on an explicit figure and close it afterwards; passing the
        # pyplot module to st.pyplot leaves a figure open on every rerun.
        fig, ax = plt.subplots(figsize=(3, 3))
        sns.heatmap(conf_matrix, annot=text_annotations, fmt="", cmap="Blues",
                    cbar=False, square=True, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title("Confusion Matrix")
        st.pyplot(fig)
        plt.close(fig)

 
def prediction_page():
    """Render the diagnosis form and show the selected model's prediction.

    The user picks a pre-trained model and fills in the feature form. On
    submit, the inputs are encoded and scaled with the saved encoder and
    scaler, and the model's prediction is shown as Positive (1) or Negative.

    Raises:
        FileNotFoundError: if a saved model, encoder or scaler file is missing.
    """
    st.title("Get Your Diagnosis")
    st.subheader("Symptoms Entry Form")

    # Model Selection: display name -> saved model file.
    # Forward slashes keep the paths portable; with backslashes in non-raw
    # literals, "\r" (rf_model / rf_boosted) became a carriage return and
    # the remaining paths only resolved on Windows.
    model_files = {
        "SVM - Linear": 'Model Training/svm_model_linear.pkl',
        "SVM - Polynomial": 'Model Training/svm_model_poly.pkl',
        "SVM - RBF": 'Model Training/svm_model_rbf.pkl',
        "Random Forest": 'Model Training/rf_model.pkl',
        "Random Forest Boosted": 'Model Training/rf_boosted.pkl',
        "Logistic Regression": 'Model Training/lr_model.pkl',
        "GDA": 'Model Training/gda.pkl',
    }
    model_choice = st.selectbox("Choose a Pre-trained Model", list(model_files))

    # Load pre-trained model
    model = joblib.load(model_files[model_choice])

    with st.form(key="health_data_form"):
        col1, col2, col3, col4 = st.columns(4)

        # NOTE(review): the selectbox labels below must match the categories
        # the saved encoder was fitted on — confirm against the training data.
        with col1:
            gender = st.selectbox("Gender", ["Male", "Female"], key="gender")
            weight = st.number_input("Weight (kg)", min_value=0, step=1, key="weight")
            cholesterol = st.number_input("Cholesterol (mg/dL)", min_value=0, step=1, key="cholesterol")
        with col2:
            family_history = st.selectbox("Family History of Illness", ["Yes", "No"], key="family_history")
            bmi = st.number_input("BMI", min_value=0.0, step=0.1, key="bmi")
            triglycerides = st.number_input("Triglycerides Level (mg/dL)", min_value=0, step=1, key="triglycerides")

        with col3:
            height = st.number_input("Height (cm)", min_value=0.0, step=0.1, key="height")
            obese_status = st.selectbox("Obese/Non Obese", ["Obese", "Non-Obese"], key="obese_status")
            ldl = st.number_input("LDL Level (mg/dL)", min_value=0.0, step=0.1, key="ldl")

        with col4:
            vldl = st.number_input("VLDL Level (mg/dL)", min_value=0.0, step=0.1, key="vldl")

        # Submit button
        submit_button = st.form_submit_button(label="Submit")

    if not submit_button:
        return

    # Feature order passed to the encoder, scaler and model.
    feature_columns = ['Gender', 'Family history', 'Height', 'Weight', 'BMI', 'Obese/non obese',
                       'Cholesterol', 'Triglycerides level', 'LDL level', 'VLDL level']

    # The triglycerides entry is stored under 'Triglycerides level' so it
    # matches feature_columns. It was previously keyed 'Triglycerides', and the
    # reindex(..., fill_value=0) that followed silently replaced the user's
    # value with 0.
    data = pd.DataFrame({
        "Gender": [gender],
        "Family history": [family_history],
        "Height": [height],
        "Weight": [weight],
        "BMI": [bmi],
        "Obese/non obese": [obese_status],
        "Cholesterol": [cholesterol],
        "Triglycerides level": [triglycerides],
        "LDL level": [ldl],
        "VLDL level": [vldl],
    })
    # Plain column selection raises KeyError on a name mismatch instead of
    # hiding it behind a zero-filled column.
    data = data[feature_columns]

    categorical_columns = ['Gender', 'Family history', 'Obese/non obese']
    numerical_columns = [col_name for col_name in data.columns if col_name not in categorical_columns]

    # Encoding categorical data
    encoder = joblib.load('Model Training/encoder.pkl')
    data[categorical_columns] = encoder.transform(data[categorical_columns])

    # Scaling the numeric features
    scaler = joblib.load('Model Training/scaler.pkl')
    data[numerical_columns] = scaler.transform(data[numerical_columns])

    prediction = int(model.predict(data)[0])
    st.write(f"### Predicted Diagnosis: {'Positive' if prediction == 1 else 'Negative'}")
        

def conclusion_page():
    """Render the "Conclusion" page: a title followed by the key-takeaways text."""
    takeaways_markdown = """

    ## Key Takeaways

    - Comprehensive EDA provides actionable insights into the data.

    - Pre-trained machine learning models allow efficient predictions.

    - The interactive app makes the analysis accessible and engaging.



    Thank you for exploring this project!

    """
    st.title("Conclusion")
    st.markdown(takeaways_markdown)

# Sidebar navigation: each menu label maps to the function that renders it.
# The dict's insertion order is the order of the radio options.
_PAGES = {
    "Introduction": introduction_page,
    "Descriptive Statistics": stats_page,
    "Data Analytics": eda_page,
    "Model Evaluation": model_page,
    "Get Your Diagnosis": prediction_page,
    "Conclusion": conclusion_page,
}

page = st.sidebar.radio("Navigation Menu", list(_PAGES))

# Render the selected page; an unrecognised selection renders nothing.
_render_page = _PAGES.get(page)
if _render_page is not None:
    _render_page()