import os
import streamlit as st
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# Streamlit UI
dataset_name = "louiecerv/student-adaptivity-dataset"
# Retrieve Hugging Face token from environment variable
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
st.stop()
# Login to Hugging Face Hub
login(token=hf_token)
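# NOTE: login() registers the token for this session so that subsequent Hub
# calls (including load_dataset below) can access private or gated repositories.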
# Load dataset
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    st.error(f"Unexpected error: {e}")
    st.stop()
st.title("Online Adaptability Analysis Using ML Approaches")
# Display image
image = Image.open("adaptability.jpg")
st.image(image, caption="Adaptability", use_container_width=True)
# About this app
with st.expander("About this App"):
st.markdown("""
### Overview
This app is designed to analyze and predict student adaptability based on various factors such as age, education level, institution type, and more. The app utilizes machine learning models to provide insights and predictions.
### Features
- **Data Preprocessing**: Handles missing values, outliers, and ensures consistent data types.
- **Model Training**: Trains multiple machine learning models including Logistic Regression, Naive Bayes, SVM, Decision Tree, Random Forest, Gradient Boosting, KNN, and MLP Neural Network.
- **Model Evaluation**: Provides confusion matrices and classification reports for each model to evaluate performance.
- **Interactive Visualization**: Uses Matplotlib and Seaborn to render each model's confusion matrix as a heatmap and presents the classification report as a table.
### How to Use
1. The app automatically loads the student adaptivity dataset from the Hugging Face Hub.
2. The data is preprocessed and all models are trained when the app starts.
3. Navigate through the tabs to view the performance of each model.
### References
For more information, please refer to the [IEEE Xplore document](https://ieeexplore.ieee.org/document/9579741).
""")
df = dataset["train"].to_pandas()
# Convert all columns to pandas 'string' dtype
df = df.astype('string')
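# NOTE: after this cast every column carries the pandas 'string' dtype, so the
# object-dtype conversion and the numeric-mean imputation below act only as
# defensive fallbacks and will normally not change anything.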
# Ensure consistent data types: Convert object columns to string dtype
for col in df.columns:
    if df[col].dtype == 'object' or pd.api.types.is_object_dtype(df[col]):
        df[col] = df[col].astype('string')
# Check for null values and replace with mean for numeric columns
if df.isnull().values.any():
    for col in df.columns:
        if df[col].isnull().sum() > 0 and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        elif df[col].isnull().sum() > 0:
            # Fill NaN in string columns with 'Unknown' or any placeholder
            df[col] = df[col].fillna('Unknown')
# Display the DataFrame
st.write("### DataFrame:")
st.write(df)
# Show statistics
st.write("### Statistics:")
st.write(df.describe(include='all'))
# List numeric and categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['string']).columns.tolist()
# Display column types
st.write("### Numeric Columns:")
st.write(numeric_cols)
st.write("### Categorical Columns:")
st.write(categorical_cols)
# One-hot encode every categorical feature except the target column 'Adaptivity'
feature_cols = [col for col in categorical_cols if col != 'Adaptivity']
df = pd.get_dummies(df, columns=feature_cols)
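# Illustration (hypothetical column name): a categorical column such as
# "Gender" with values "Boy"/"Girl" is expanded by get_dummies into indicator
# columns "Gender_Boy" and "Gender_Girl" holding 0/1 values.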
# Label encode the target column 'Adaptivity'
label_encoder = LabelEncoder()
df['Adaptivity'] = label_encoder.fit_transform(df['Adaptivity'])
# Apply standard scaling for all the columns except the target column
scaler = StandardScaler()
df[df.columns.difference(['Adaptivity'])] = scaler.fit_transform(df[df.columns.difference(['Adaptivity'])])
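# StandardScaler standardizes each feature column as z = (x - mean) / std,
# where mean and std are computed from the data passed to fit_transform.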
# Split the dataset into training and testing sets (80% training, 20% testing)
X = df.drop('Adaptivity', axis=1)
y = df['Adaptivity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
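# random_state=42 makes the split reproducible across reruns; passing
# stratify=y here would additionally keep the class balance of 'Adaptivity'
# the same in the training and test sets (optional, not done in this app).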
# Define models to be used
models = {
"Logistic Regression": LogisticRegression(),
"Naive Bayes": GaussianNB(),
"SVM": SVC(),
"Decision Tree": DecisionTreeClassifier(),
"Random Forest": RandomForestClassifier(),
"Gradient Boosting": GradientBoostingClassifier(),
"KNN": KNeighborsClassifier(),
"MLP Neural Network": MLPClassifier(max_iter=500) # Increased max_iter to 500
}
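# All models above use scikit-learn's default hyperparameters; only the MLP's
# max_iter is raised so training is less likely to stop before convergence.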
# Create tabs for each model to display results
tabs = st.tabs(list(models.keys()))
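# zip() pairs each model with a tab in insertion order, so the results shown
# in each tab match the tab label taken from the models dictionary above.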
for (model_name, model), tab in zip(models.items(), tabs):
    with tab:
        # Train the model
        model.fit(X_train, y_train)
        # Make predictions
        y_pred = model.predict(X_test)
        # Reverse label encoding for predicted values and true values
        y_pred_labels = label_encoder.inverse_transform(y_pred)
        y_test_labels = label_encoder.inverse_transform(y_test)
        # Confusion matrix and classification report
        cm = confusion_matrix(y_test_labels, y_pred_labels)
        cr = classification_report(y_test_labels, y_pred_labels, output_dict=True)
        # Plot confusion matrix using seaborn heatmap
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title(f'Confusion Matrix - {model_name}')
        # Display confusion matrix and classification report in the tab
        st.pyplot(fig)
        # Display classification report as dataframe
        cr_df = pd.DataFrame(cr).transpose()
        st.write(f"Classification Report - {model_name}")
        st.write(cr_df)
        # Short remark on model performance
        st.write(f"Model Performance Remark for {model_name}:")
        st.write(f"Use the confusion matrix and classification report above to judge how well {model_name} separates the adaptivity levels; precision, recall, and F1-score are reported per class.")