import warnings

import numpy as np
import pandas as pd
from dash import Dash, dcc, html, dash_table
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppress specific FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


def enhanced_preprocessing(df):
    # Handle missing values: medians for numeric columns, a sentinel for text
    # (a blanket fillna('Unknown') would silently turn numeric columns into objects)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    df = df.fillna('Unknown')

    # Vectorize the free-text column first (example: interventions column),
    # before the categorical encoders below can turn it into integers
    if 'interventions' in df.columns:
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(df['interventions'].astype(str))
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                                columns=tfidf.get_feature_names_out(),
                                index=df.index)
        df = pd.concat([df, tfidf_df], axis=1).drop('interventions', axis=1)

    # Encode the remaining categorical features
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df[col].nunique() < 20:
            # Label encoding for columns with low cardinality
            df[col] = LabelEncoder().fit_transform(df[col])
        else:
            # One-hot encoding for high-cardinality features
            one_hot = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, one_hot], axis=1).drop(col, axis=1)
    return df


def calculate_correlations(df, threshold=0.3):
    correlations = df.corr(numeric_only=True)
    significant_corr = correlations[abs(correlations) > threshold].stack().dropna().reset_index()
    significant_corr = significant_corr[significant_corr['level_0'] != significant_corr['level_1']]
    significant_corr.columns = ['Feature 1', 'Feature 2', 'Correlation']
    return significant_corr


def perform_clustering(df):
    # Normalize the data for clustering
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # Run both algorithms and keep whichever scores better on silhouette
    kmeans_labels = KMeans(n_clusters=4, random_state=42, n_init=10).fit_predict(df_scaled)
    dbscan_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(df_scaled)

    kmeans_score = silhouette_score(df_scaled, kmeans_labels)
    # Silhouette is undefined when DBSCAN yields a single cluster
    dbscan_score = silhouette_score(df_scaled, dbscan_labels) if len(set(dbscan_labels)) > 1 else -1

    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'

    # Project onto two principal components for visualization
    pca_components = PCA(n_components=2).fit_transform(df_scaled)
    df['PCA1'] = pca_components[:, 0]
    df['PCA2'] = pca_components[:, 1]
    return df, best_model
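
# A minimal sketch, not part of the original pipeline: the fixed n_clusters=4
# above is an assumption. If it fits the data poorly, one common refinement is
# to sweep k and keep the best silhouette score. The helper name and the k
# range are illustrative only.
def suggest_kmeans_k(df_scaled, k_range=range(2, 11)):
    best_k, best_score = None, -1.0
    for k in k_range:
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(df_scaled)
        score = silhouette_score(df_scaled, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k, best_score
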
def perform_predictions(df):
    results = []
    target_cols = [col for col in df.columns if col in ['skip_class', 'final_grade']]
    # Drop every target (plus the derived cluster/PCA columns) from the
    # feature matrix so one target cannot leak into the model for another
    feature_df = df.drop(columns=target_cols + ['Cluster', 'PCA1', 'PCA2'], errors='ignore')
    for target in target_cols:
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(feature_df, y, test_size=0.2, random_state=42)

        # Model 1: Random Forest
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train, y_train)
        rf_accuracy = accuracy_score(y_test, rf_model.predict(X_test))

        # Model 2: Logistic Regression
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, y_train)
        lr_accuracy = accuracy_score(y_test, lr_model.predict(X_test))

        # Report whichever model scored higher for this target
        if rf_accuracy > lr_accuracy:
            results.append({'Target': target, 'Model': 'Random Forest', 'Accuracy': rf_accuracy})
        else:
            results.append({'Target': target, 'Model': 'Logistic Regression', 'Accuracy': lr_accuracy})
    return results


def create_dashboard(df, correlation_data, clustering_data, prediction_results):
    app = Dash(__name__)
    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        html.Div([
            html.H2('Correlation Analysis'),
            dash_table.DataTable(
                id='correlation_table',
                columns=[{'name': i, 'id': i} for i in correlation_data.columns],
                data=correlation_data.to_dict('records')
            )
        ]),
        html.Div([
            html.H2('Clustering Analysis'),
            html.P(f'Best Clustering Algorithm: {clustering_data["best_model"]}'),
            dcc.Graph(
                id='clustering_scatter',
                figure={
                    'data': [{
                        'x': df['PCA1'],
                        'y': df['PCA2'],
                        'mode': 'markers',
                        'marker': {'color': df['Cluster'], 'colorscale': 'Viridis', 'size': 10},
                        'text': df['Cluster'],
                        'type': 'scatter'
                    }],
                    'layout': {
                        'title': 'Cluster Visualization using PCA',
                        'xaxis': {'title': 'PCA Component 1'},
                        'yaxis': {'title': 'PCA Component 2'}
                    }
                }
            )
        ]),
        html.Div([
            html.H2('Prediction Models'),
            dash_table.DataTable(
                id='prediction_table',
                columns=[{'name': i, 'id': i} for i in prediction_results.columns],
                data=prediction_results.to_dict('records')
            )
        ])
    ])
    app.run(debug=True)


# Main execution
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv('student_data.csv')  # Replace with your CSV file

    # Preprocess the data
    df = enhanced_preprocessing(df)

    # Perform correlation analysis
    correlation_data = calculate_correlations(df)

    # Perform clustering analysis
    df, best_model = perform_clustering(df)
    clustering_data = {'best_model': best_model}

    # Perform prediction analysis
    prediction_results = pd.DataFrame(perform_predictions(df))

    # Create and launch the dashboard
    create_dashboard(df, correlation_data, clustering_data, prediction_results)
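

# A minimal smoke-test sketch, not part of the original script: it writes a
# small synthetic student_data.csv so the pipeline can be exercised end to end
# without real data. The columns 'interventions', 'skip_class', and
# 'final_grade' mirror what the code above expects; the remaining columns,
# the function name, and all values are hypothetical. If used, call it once
# (e.g., from a REPL) before running this script.
def write_synthetic_csv(path='student_data.csv', n=200):
    rng = np.random.default_rng(42)
    pd.DataFrame({
        'age': rng.integers(15, 19, size=n),
        'attendance_rate': rng.uniform(0.5, 1.0, size=n).round(2),
        'interventions': rng.choice(['tutoring', 'counseling mentoring', 'none'], size=n),
        'skip_class': rng.integers(0, 2, size=n),
        'final_grade': rng.choice(['A', 'B', 'C', 'D'], size=n),
    }).to_csv(path, index=False)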