import os
import streamlit as st
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# Streamlit UI
dataset_name = "louiecerv/student-adaptivity-dataset"
# Retrieve Hugging Face token from environment variable
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
st.stop()
# Login to Hugging Face Hub
login(token=hf_token)
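# NOTE: login() registers the token for this session so that subsequent Hub
# calls (including load_dataset below) can access private or gated repositories.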
# Load dataset
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    st.error(f"Unexpected error: {e}")
    st.stop()
st.title("Online Adaptability Analysis Using ML Approaches")
# Display image
image = Image.open("adaptability.jpg")
st.image(image, caption="Adaptability", use_container_width=True)
# About this app
with st.expander("About this App"):
st.markdown("""
### Overview
This app is designed to analyze and predict student adaptability based on various factors such as age, education level, institution type, and more. The app utilizes machine learning models to provide insights and predictions.
### Features
- **Data Preprocessing**: Handles missing values, outliers, and ensures consistent data types.
- **Model Training**: Trains multiple machine learning models including Logistic Regression, Naive Bayes, SVM, Decision Tree, Random Forest, Gradient Boosting, KNN, and MLP Neural Network.
- **Model Evaluation**: Provides confusion matrices and classification reports for each model to evaluate performance.
- **Interactive Visualization**: Uses Matplotlib and Seaborn to render each model's confusion matrix as a heatmap and presents the classification report as a table.
### How to Use
1. The app automatically loads the student adaptivity dataset from the Hugging Face Hub.
2. The data is preprocessed and all models are trained when the app starts.
3. Navigate through the tabs to view the performance of each model.
### References
For more information, please refer to the [IEEE Xplore document](https://ieeexplore.ieee.org/document/9579741).
""")
df = dataset["train"].to_pandas()
# Convert all columns to pandas 'string' dtype
df = df.astype('string')
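# NOTE: after this cast every column carries the pandas 'string' dtype, so the
# object-dtype conversion and the numeric-mean imputation below act only as
# defensive fallbacks and will normally not change anything.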
# Ensure consistent data types: Convert object columns to string dtype
for col in df.columns:
    if df[col].dtype == 'object' or pd.api.types.is_object_dtype(df[col]):
        df[col] = df[col].astype('string')
# Check for null values and replace with mean for numeric columns
if df.isnull().values.any():
    for col in df.columns:
        if df[col].isnull().sum() > 0 and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        elif df[col].isnull().sum() > 0:
            # Fill NaN in string columns with 'Unknown' or any placeholder
            df[col] = df[col].fillna('Unknown')
# Display the DataFrame
st.write("### DataFrame:")
st.write(df)
# Show statistics
st.write("### Statistics:")
st.write(df.describe(include='all'))
# List numeric and categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['string']).columns.tolist()
# Display column types
st.write("### Numeric Columns:")
st.write(numeric_cols)
st.write("### Categorical Columns:")
st.write(categorical_cols)
# One-hot encode every categorical feature except the target column 'Adaptivity'
feature_cols = [col for col in categorical_cols if col != 'Adaptivity']
df = pd.get_dummies(df, columns=feature_cols)
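# Illustration (hypothetical column name): a categorical column such as
# "Gender" with values "Boy"/"Girl" is expanded by get_dummies into indicator
# columns "Gender_Boy" and "Gender_Girl" holding 0/1 values.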
# Label encode the target column 'Adaptivity'
label_encoder = LabelEncoder()
df['Adaptivity'] = label_encoder.fit_transform(df['Adaptivity'])
# Apply standard scaling for all the columns except the target column
scaler = StandardScaler()
df[df.columns.difference(['Adaptivity'])] = scaler.fit_transform(df[df.columns.difference(['Adaptivity'])])
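# StandardScaler standardizes each feature column as z = (x - mean) / std,
# where mean and std are computed from the data passed to fit_transform.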
# Split the dataset into training and testing sets (80% training, 20% testing)
X = df.drop('Adaptivity', axis=1)
y = df['Adaptivity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
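# random_state=42 makes the split reproducible across reruns; passing
# stratify=y here would additionally keep the class balance of 'Adaptivity'
# the same in the training and test sets (optional, not done in this app).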
# Define models to be used
models = {
"Logistic Regression": LogisticRegression(),
"Naive Bayes": GaussianNB(),
"SVM": SVC(),
"Decision Tree": DecisionTreeClassifier(),
"Random Forest": RandomForestClassifier(),
"Gradient Boosting": GradientBoostingClassifier(),
"KNN": KNeighborsClassifier(),
"MLP Neural Network": MLPClassifier(max_iter=500) # Increased max_iter to 500
}
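# All models above use scikit-learn's default hyperparameters; only the MLP's
# max_iter is raised so training is less likely to stop before convergence.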
# Create tabs for each model to display results
tabs = st.tabs(list(models.keys()))
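# zip() pairs each model with a tab in insertion order, so the results shown
# in each tab match the tab label taken from the models dictionary above.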
for (model_name, model), tab in zip(models.items(), tabs):
    with tab:
        # Train the model
        model.fit(X_train, y_train)
        # Make predictions
        y_pred = model.predict(X_test)
        # Reverse label encoding for predicted values and true values
        y_pred_labels = label_encoder.inverse_transform(y_pred)
        y_test_labels = label_encoder.inverse_transform(y_test)
        # Confusion matrix and classification report
        cm = confusion_matrix(y_test_labels, y_pred_labels)
        cr = classification_report(y_test_labels, y_pred_labels, output_dict=True)
        # Plot confusion matrix using seaborn heatmap
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title(f'Confusion Matrix - {model_name}')
        # Display confusion matrix and classification report in the tab
        st.pyplot(fig)
        # Display classification report as dataframe
        cr_df = pd.DataFrame(cr).transpose()
        st.write(f"Classification Report - {model_name}")
        st.write(cr_df)
        # Short remark on model performance
        st.write(f"Model Performance Remark for {model_name}:")
        st.write(f"Use the confusion matrix and classification report above to judge how well {model_name} separates the adaptivity levels; precision, recall, and F1-score are reported per class.")