import os
import sys
import tempfile
import warnings

import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import classification_report, accuracy_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Ignore every FutureWarning, whatever module or message it comes from.
warnings.simplefilter("ignore", FutureWarning)

# Apply seaborn's "whitegrid" style globally.
sns.set(style="whitegrid")
def enhanced_preprocessing(df, text_column='interventions', cardinality_threshold=20):
    """Impute missing values and encode object columns as numeric features.

    Steps, in order:

    1. Missing values in numeric columns are replaced by the column median
       (0 when the whole column is missing); missing values in object
       columns become the literal category ``'Unknown'``.
    2. ``text_column``, when present, is vectorised with TF-IDF. One column
       per vocabulary term is appended, named ``<text_column>_<term>``, and
       the raw text column is dropped.
    3. Every remaining object column with fewer than
       ``cardinality_threshold`` distinct values is label encoded in place;
       the others are one-hot encoded, appended, and the original dropped.

    Args:
        df: Raw input frame. It is not modified.
        text_column: Name of the free-text column to vectorise. Ignored when
            the frame has no such column.
        cardinality_threshold: Object columns with fewer distinct values than
            this are label encoded; all others are one-hot encoded.

    Returns:
        A new DataFrame sharing the input's index.
    """
    df = df.copy()

    numeric_cols = df.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 0:
        # Filling numeric gaps with the string 'Unknown' would turn the whole
        # column into mixed-type objects and send it down the categorical
        # path, so numeric columns get a numeric fill instead.
        medians = df[numeric_cols].median()
        df[numeric_cols] = df[numeric_cols].fillna(medians).fillna(0)

    object_cols = df.select_dtypes(include=['object']).columns
    if len(object_cols) > 0:
        df[object_cols] = df[object_cols].fillna('Unknown')

    # The text column has to be vectorised *before* the categorical loop:
    # it is an object column too, so the loop would otherwise label encode
    # or one-hot encode it and the TF-IDF step could never run on raw text.
    text_features = None
    if text_column in df.columns:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(df[text_column].astype(str))
        text_features = pd.DataFrame(
            tfidf_matrix.toarray(),
            # Prefixing keeps term columns from colliding with real columns.
            columns=[
                f'{text_column}_{term}'
                for term in vectorizer.get_feature_names_out()
            ],
            # Reuse the source index so the concat below aligns row by row.
            index=df.index,
        )
        df = df.drop(columns=text_column)

    one_hot_frames = []
    for col in df.select_dtypes(include=['object']).columns.tolist():
        if df[col].nunique() < cardinality_threshold:
            # astype(str) lets mixed-type object columns be sorted/encoded.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
        else:
            one_hot_frames.append(pd.get_dummies(df[col], prefix=col))
            df = df.drop(columns=col)

    # A single concat at the end avoids re-copying the frame per column.
    pieces = [df, *one_hot_frames]
    if text_features is not None:
        pieces.append(text_features)
    return pd.concat(pieces, axis=1)
def calculate_correlations(df, threshold=0.3):
    """List every pair of distinct features whose correlation is significant.

    A pair is reported when the absolute value of its correlation is strictly
    greater than ``threshold``. Each pair appears in both directions
    (``A, B`` and ``B, A``), in row-major order of the correlation matrix.
    Undefined (NaN) correlations, e.g. from constant columns, are never
    reported.

    Args:
        df: Input frame. Only numeric and boolean columns are correlated.
        threshold: Lower bound (exclusive) on the absolute correlation.

    Returns:
        A DataFrame with columns ``'Feature 1'``, ``'Feature 2'`` and
        ``'Correlation'``; empty (but with those columns) when nothing
        exceeds the threshold.
    """
    # Restrict to numeric data so stray text columns do not make corr() raise.
    correlations = df.select_dtypes(include=['number', 'bool']).corr()
    values = correlations.to_numpy()

    # NaN compares False, so undefined correlations drop out here without
    # depending on how DataFrame.stack() treats missing values.
    rows, cols = np.nonzero(np.abs(values) > threshold)

    # Skip the diagonal: a feature is trivially correlated with itself.
    off_diagonal = rows != cols
    rows, cols = rows[off_diagonal], cols[off_diagonal]

    return pd.DataFrame({
        'Feature 1': correlations.index[rows].to_numpy(),
        'Feature 2': correlations.columns[cols].to_numpy(),
        'Correlation': values[rows, cols],
    })
def _silhouette_or_floor(data, labels, floor=-1.0):
    """Return the silhouette score, or ``floor`` when it is undefined."""
    n_labels = len(set(labels))
    # silhouette_score needs between 2 and n_samples - 1 distinct labels.
    if n_labels < 2 or n_labels >= len(labels):
        return floor
    return silhouette_score(data, labels)


def perform_clustering(df, n_clusters=4, eps=0.5, min_samples=5, random_state=42):
    """Cluster the rows with K-Means and DBSCAN and keep the better labelling.

    All columns are standardised, both algorithms are fitted, and the one
    with the strictly higher silhouette score wins (DBSCAN wins ties). The
    winning labels are written to ``df['Cluster']`` and the first two
    principal components of the standardised data to ``df['PCA1']`` and
    ``df['PCA2']``.

    Args:
        df: Fully numeric feature frame. It is modified in place.
        n_clusters: Number of K-Means clusters (capped at the row count).
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN core-point threshold.
        random_state: Seed for K-Means.

    Returns:
        A ``(df, best_model)`` tuple where ``df`` is the same object that was
        passed in and ``best_model`` is ``'K-Means'`` or ``'DBSCAN'``.
    """
    df_scaled = StandardScaler().fit_transform(df)

    # n_init is pinned so the result does not depend on the installed
    # scikit-learn's default; the cap keeps tiny inputs from raising.
    kmeans = KMeans(
        n_clusters=min(n_clusters, len(df_scaled)),
        n_init=10,
        random_state=random_state,
    )
    kmeans_labels = kmeans.fit_predict(df_scaled)
    kmeans_score = _silhouette_or_floor(df_scaled, kmeans_labels)

    dbscan_labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(df_scaled)
    # Label -1 marks noise, not a cluster: score DBSCAN on the points it
    # actually assigned so noise is neither counted nor rated as a cluster.
    assigned = dbscan_labels != -1
    dbscan_score = _silhouette_or_floor(df_scaled[assigned], dbscan_labels[assigned])

    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'

    # Two principal components give the 2-D coordinates used for plotting.
    pca_components = PCA(n_components=2).fit_transform(df_scaled)
    df['PCA1'] = pca_components[:, 0]
    df['PCA2'] = pca_components[:, 1]
    return df, best_model
def perform_predictions(df, targets=('skip_class', 'final_grade')):
    """Fit two classifiers per target column and report the more accurate one.

    For every name in ``targets`` that is a column of ``df``, the remaining
    columns are split 80/20 into train and test sets, a random forest and a
    logistic regression are fitted on the training part, and the model with
    the higher test accuracy is recorded (logistic regression wins ties).
    Both models are classifiers, so each target is treated as class labels.

    Args:
        df: Fully numeric frame containing features and target columns.
        targets: Candidate target column names; missing ones are skipped.

    Returns:
        A list with one dict per evaluated target, with keys ``'Target'``,
        ``'Model'`` and ``'Accuracy'``. Empty when no target is present.
    """
    results = []
    target_cols = [col for col in df.columns if col in targets]

    # The script's clustering step derives these columns from data that
    # still contains the targets, so using them as predictors would leak
    # the answer into the features.
    derived = [col for col in ('Cluster', 'PCA1', 'PCA2') if col in df.columns]
    features = df.drop(columns=derived)

    for target in target_cols:
        X = features.drop(columns=target)
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Model 1: Random Forest
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train, y_train)
        rf_accuracy = accuracy_score(y_test, rf_model.predict(X_test))

        # Model 2: Logistic Regression
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, y_train)
        lr_accuracy = accuracy_score(y_test, lr_model.predict(X_test))

        if rf_accuracy > lr_accuracy:
            best = {'Target': target, 'Model': 'Random Forest', 'Accuracy': rf_accuracy}
        else:
            best = {'Target': target, 'Model': 'Logistic Regression', 'Accuracy': lr_accuracy}
        results.append(best)
    return results
def create_dashboard(df, correlation_data, clustering_data, prediction_results):
    """Assemble the three-section Dash page and serve it in debug mode.

    The page shows the correlation table, a scatter plot of ``PCA1`` against
    ``PCA2`` coloured by ``Cluster`` (all read from ``df``), and the
    prediction-results table. ``clustering_data['best_model']`` is shown as
    the name of the chosen clustering algorithm. The call does not return
    until the server started by ``app.run_server`` stops.
    """
    def _as_table(table_id, frame):
        # One DataTable column per frame column, one row per record.
        return dash_table.DataTable(
            id=table_id,
            columns=[{'name': name, 'id': name} for name in frame.columns],
            data=frame.to_dict('records')
        )

    app = dash.Dash(__name__)

    correlation_section = html.Div([
        html.H2('Correlation Analysis'),
        _as_table('correlation_table', correlation_data),
    ])

    cluster_ids = df['Cluster']
    scatter_trace = {
        'x': df['PCA1'],
        'y': df['PCA2'],
        'mode': 'markers',
        'marker': {'color': cluster_ids, 'colorscale': 'Viridis', 'size': 10},
        'text': cluster_ids,
        'type': 'scatter',
    }
    scatter_layout = {
        'title': 'Cluster Visualization using PCA',
        'xaxis': {'title': 'PCA Component 1'},
        'yaxis': {'title': 'PCA Component 2'},
    }
    clustering_section = html.Div([
        html.H2('Clustering Analysis'),
        html.P(f'Best Clustering Algorithm: {clustering_data["best_model"]}'),
        dcc.Graph(
            id='clustering_scatter',
            figure={'data': [scatter_trace], 'layout': scatter_layout},
        ),
    ])

    prediction_section = html.Div([
        html.H2('Prediction Models'),
        _as_table('prediction_table', prediction_results),
    ])

    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        correlation_section,
        clustering_section,
        prediction_section,
    ])
    app.run_server(debug=True)
def main(csv_path='student_data.csv'):
    """Run the whole pipeline on ``csv_path`` and launch the dashboard.

    The CSV is loaded, preprocessed, analysed for correlations, clustered and
    used for prediction, in that order; the results are then handed to
    ``create_dashboard``.

    Args:
        csv_path: Path of the CSV file to analyse.
    """
    # Load and preprocess the dataset.
    df = enhanced_preprocessing(pd.read_csv(csv_path))

    # Correlations are computed before clustering adds its derived columns.
    correlation_data = calculate_correlations(df)

    df, best_model = perform_clustering(df)
    clustering_data = {'best_model': best_model}

    # Fixed columns keep the results table well-formed even when no target
    # column was found and the prediction list is empty.
    prediction_results = pd.DataFrame(
        perform_predictions(df), columns=['Target', 'Model', 'Accuracy']
    )

    create_dashboard(df, correlation_data, clustering_data, prediction_results)


if __name__ == "__main__":
    # An optional first command-line argument overrides the default CSV path.
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main()