|
import os |
|
import streamlit as st |
|
from datasets import load_dataset |
|
import pandas as pd |
|
from huggingface_hub import login |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.naive_bayes import GaussianNB |
|
from sklearn.svm import SVC |
|
from sklearn.tree import DecisionTreeClassifier |
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier |
|
from sklearn.neighbors import KNeighborsClassifier |
|
from sklearn.neural_network import MLPClassifier |
|
from sklearn.metrics import confusion_matrix, classification_report |
|
from sklearn.preprocessing import LabelEncoder, StandardScaler |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
from PIL import Image |
|
|
|
|
|
# Hugging Face Hub identifier of the dataset this app analyses.
dataset_name = "louiecerv/student-adaptivity-dataset"

# The access token comes from the environment so it never lives in the source.
hf_token = os.getenv("HF_TOKEN")

# Without a token there is nothing useful to do: report and halt the script run.
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Authenticate against the Hub. login() is documented to raise ValueError for
# an invalid token; surface that (and any other failure, e.g. network errors)
# through the same st.error + st.stop pattern the rest of the script uses,
# instead of letting a raw traceback reach the user.
try:
    login(token=hf_token)
except ValueError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    st.error(f"Unexpected error: {e}")
    st.stop()
|
|
|
|
|
# Download (or read from the local cache) the dataset named by `dataset_name`.
# Every failure path reports a message in the UI and halts the script run.
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except (FileNotFoundError, ValueError):
    # NOTE(review): recent `datasets` releases signal a missing (or
    # inaccessible) repository with DatasetNotFoundError, which derives from
    # FileNotFoundError rather than ValueError - confirm against the pinned
    # version. ValueError is kept for malformed identifiers/configurations.
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    # Top-level boundary: show whatever went wrong rather than a traceback.
    st.error(f"Unexpected error: {e}")
    st.stop()
|
|
|
st.title("Online Adaptability Analysis Using ML Approaches")

# Resolve the banner next to this script so the app also works when Streamlit
# is launched from a different working directory.
image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "adaptability.jpg")

# The banner is purely decorative: if it is missing or unreadable, warn and
# carry on instead of aborting the whole analysis. The context manager makes
# sure the underlying file handle is released once the image has been rendered.
try:
    with Image.open(image_path) as image:
        st.image(image, caption="Adaptability", use_container_width=True)
except OSError as exc:
    st.warning(f"Banner image could not be loaded: {exc}")
|
|
|
|
|
|
|
# Collapsible help panel. The text is kept at column 0 inside the literal so
# the Markdown renders the same whether or not the renderer dedents it.
# The "How to Use" steps describe what the script actually does: it pulls a
# fixed dataset from the Hugging Face Hub and has no upload widget.
with st.expander("About this App"):
    st.markdown("""
### Overview
This app is designed to analyze and predict student adaptability based on various factors such as age, education level, institution type, and more. The app utilizes machine learning models to provide insights and predictions.

### Features
- **Data Preprocessing**: Handles missing values and ensures consistent data types.
- **Model Training**: Trains multiple machine learning models including Logistic Regression, Naive Bayes, SVM, Decision Tree, Random Forest, Gradient Boosting, KNN, and MLP Neural Network.
- **Model Evaluation**: Provides confusion matrices and classification reports for each model to evaluate performance.
- **Interactive Visualization**: Uses Matplotlib and Seaborn to enhance the appearance of the confusion matrix and classification report.

### How to Use
1. Set the `HF_TOKEN` environment variable; the app loads the `louiecerv/student-adaptivity-dataset` dataset from the Hugging Face Hub.
2. The app will automatically preprocess the data and train the models.
3. Navigate through the tabs to view the performance of each model.

### References
For more information, please refer to the [IEEE Xplore document](https://ieeexplore.ieee.org/document/9579741).
""")
|
|
|
# Materialise the "train" split as a DataFrame and treat every column as
# categorical text (pandas' nullable string dtype). Casting everything up
# front gives uniform column types for display and one-hot encoding.
df = dataset["train"].to_pandas().astype('string')

# Because every column is now a string column, there is nothing numeric to
# mean-impute and no object column left to convert; the only remaining
# clean-up is to turn missing entries into an explicit 'Unknown' category.
df = df.fillna('Unknown')
|
|
|
|
|
# Split the column names by dtype; `categorical_cols` is consumed by the
# encoding step further down the script.
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(df.select_dtypes(include=['string']).columns)

# Render each overview section as a Markdown heading followed by its payload,
# in the same order as before: raw data, summary statistics, column lists.
overview_sections = (
    ("### DataFrame:", df),
    ("### Statistics:", df.describe(include='all')),
    ("### Numeric Columns:", numeric_cols),
    ("### Categorical Columns:", categorical_cols),
)
for heading, payload in overview_sections:
    st.write(heading)
    st.write(payload)
|
|
|
|
|
# Name of the label column; everything else is a feature.
target_col = 'Adaptivity'

# One-hot encode every categorical feature. The target is excluded by name
# rather than by assuming it is the last categorical column.
feature_cols = [col for col in categorical_cols if col != target_col]
df = pd.get_dummies(df, columns=feature_cols)

# Map the class names to integer codes; the fitted encoder is reused later to
# translate predictions back into class names.
label_encoder = LabelEncoder()
df[target_col] = label_encoder.fit_transform(df[target_col])

# Separate features from the label and hold out 20% of the rows for testing.
X = df.drop(target_col, axis=1)
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise AFTER splitting: the scaler learns mean/variance from the
# training fold only and is merely applied to the test fold, so no statistic
# of the held-out rows leaks into training. Results are wrapped back into
# DataFrames to keep the original column names and row indices.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)
|
|
|
|
|
# Classifiers to compare, keyed by the display name used for their tab.
# Estimators with internal randomness are seeded (same seed as the train/test
# split) so the reported metrics stay stable across Streamlit reruns.
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "MLP Neural Network": MLPClassifier(max_iter=500, random_state=42),
}
|
|
|
|
|
# One tab per classifier, titled with the keys of `models` so the tab strip
# can never drift out of sync with the dictionary.
tabs = st.tabs(list(models))

# Class names in encoder order; used both to pin the confusion-matrix layout
# (so a class missing from the test fold still gets its row/column) and to
# label the heatmap axes.
class_names = list(label_encoder.classes_)

for (model_name, model), tab in zip(models.items(), tabs):
    with tab:
        # Train on the training fold and predict the held-out fold.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Translate integer codes back to the original class names so the
        # report and the plot are readable.
        y_pred_labels = label_encoder.inverse_transform(y_pred)
        y_test_labels = label_encoder.inverse_transform(y_test)

        cm = confusion_matrix(y_test_labels, y_pred_labels, labels=class_names)
        cr = classification_report(y_test_labels, y_pred_labels, output_dict=True)

        # Confusion matrix as an annotated heatmap.
        fig, ax = plt.subplots()
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title(f'Confusion Matrix - {model_name}')

        st.pyplot(fig)
        # Release the figure: pyplot keeps every figure alive until closed,
        # and the script is re-executed on each Streamlit rerun.
        plt.close(fig)

        # Per-class precision/recall/F1 as a table (rows = classes/averages).
        cr_df = pd.DataFrame(cr).transpose()
        st.write(f"Classification Report - {model_name}")
        st.write(cr_df)

        st.write(f"Model Performance Remark for {model_name}:")
        st.write(f"The {model_name} model shows the following performance metrics based on the classification report above.")