import os
import warnings
import numpy as np
import pandas as pd
from dash import Dash, dash_table, dcc, html
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Suppress noisy FutureWarnings from pandas and scikit-learn
warnings.filterwarnings("ignore", category=FutureWarning)
def enhanced_preprocessing(df):
    # Fill numeric NaNs with the column median and everything else with a
    # placeholder (a blanket fillna('Unknown') would silently convert numeric
    # columns to object dtype)
    num_cols = df.select_dtypes(include='number').columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    df = df.fillna('Unknown')
    # Vectorize free-text columns first (example: interventions column) so
    # they are not swept up in the categorical encoding below
    if 'interventions' in df.columns:
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(df['interventions'].astype(str))
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                                columns=tfidf.get_feature_names_out(),
                                index=df.index)
        df = pd.concat([df, tfidf_df], axis=1).drop('interventions', axis=1)
    # Encode the remaining categorical features
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df[col].nunique() < 20:  # one-hot encode low-cardinality columns
            one_hot = pd.get_dummies(df[col], prefix=col, dtype=int)
            df = pd.concat([df, one_hot], axis=1).drop(col, axis=1)
        else:  # label-encode high-cardinality columns to avoid a column explosion
            df[col] = LabelEncoder().fit_transform(df[col])
    return df
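# Illustrative usage, a minimal sketch (the toy frame below is an assumption,
# not the real student_data.csv schema):
# >>> toy = pd.DataFrame({'gender': ['M', 'F', None],
# ...                     'interventions': ['tutoring', 'none', 'tutoring mentoring'],
# ...                     'score': [80, None, 95]})
# >>> enhanced_preprocessing(toy).dtypes  # every remaining column is numeric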
def calculate_correlations(df, threshold=0.3):
    # Keep feature pairs whose correlation exceeds the threshold, dropping
    # self-correlations and mirrored duplicates (B-A when A-B is present)
    correlations = df.corr(numeric_only=True)
    significant_corr = correlations[abs(correlations) > threshold].stack().reset_index()
    significant_corr.columns = ['Feature 1', 'Feature 2', 'Correlation']
    significant_corr = significant_corr[significant_corr['Feature 1'] < significant_corr['Feature 2']]
    return significant_corr
def perform_clustering(df):
    # Standardize features so distance-based clustering is not dominated by
    # large-scale columns
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    # Fit both candidates and keep whichever scores higher on silhouette
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    kmeans_labels = kmeans.fit_predict(df_scaled)
    dbscan_labels = dbscan.fit_predict(df_scaled)
    kmeans_score = silhouette_score(df_scaled, kmeans_labels)
    # Silhouette needs at least two distinct labels; DBSCAN can return one
    dbscan_score = silhouette_score(df_scaled, dbscan_labels) if len(set(dbscan_labels)) > 1 else -1
    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'
    # Project to two dimensions with PCA purely for plotting
    pca = PCA(n_components=2)
    pca_components = pca.fit_transform(df_scaled)
    df['PCA1'] = pca_components[:, 0]
    df['PCA2'] = pca_components[:, 1]
    return df, best_model
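# Silhouette scores range from -1 to 1; higher values mean tighter,
# better-separated clusters, which is why the larger score wins above.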
def perform_predictions(df):
    results = []
    target_cols = [col for col in df.columns if col in ['skip_class', 'final_grade']]
    # PCA1/PCA2 are plot coordinates derived from the full frame, so keep
    # them out of the feature matrix
    helper_cols = [c for c in ['PCA1', 'PCA2'] if c in df.columns]
    for target in target_cols:
        # Drop every target column, not just the current one, so one target
        # cannot leak into the features used to predict the other
        X = df.drop(columns=target_cols + helper_cols)
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Model 1: Random Forest
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train, y_train)
        rf_accuracy = accuracy_score(y_test, rf_model.predict(X_test))
        # Model 2: Logistic Regression
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, y_train)
        lr_accuracy = accuracy_score(y_test, lr_model.predict(X_test))
        # Keep whichever model scores better on the held-out split
        if rf_accuracy > lr_accuracy:
            results.append({'Target': target, 'Model': 'Random Forest', 'Accuracy': rf_accuracy})
        else:
            results.append({'Target': target, 'Model': 'Logistic Regression', 'Accuracy': lr_accuracy})
    return results
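# The returned structure feeds straight into pd.DataFrame; for example
# (values hypothetical):
#   [{'Target': 'skip_class', 'Model': 'Random Forest', 'Accuracy': 0.87},
#    {'Target': 'final_grade', 'Model': 'Logistic Regression', 'Accuracy': 0.74}]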
def create_dashboard(df, correlation_data, clustering_data, prediction_results):
    app = Dash(__name__)
    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        html.Div([
            html.H2('Correlation Analysis'),
            dash_table.DataTable(
                id='correlation_table',
                columns=[{'name': i, 'id': i} for i in correlation_data.columns],
                data=correlation_data.to_dict('records')
            )
        ]),
        html.Div([
            html.H2('Clustering Analysis'),
            html.P(f'Best Clustering Algorithm: {clustering_data["best_model"]}'),
            dcc.Graph(
                id='clustering_scatter',
                figure={
                    'data': [
                        {
                            'x': df['PCA1'],
                            'y': df['PCA2'],
                            'mode': 'markers',
                            'marker': {'color': df['Cluster'], 'colorscale': 'Viridis', 'size': 10},
                            'text': df['Cluster'],
                            'type': 'scatter'
                        }
                    ],
                    'layout': {
                        'title': 'Cluster Visualization using PCA',
                        'xaxis': {'title': 'PCA Component 1'},
                        'yaxis': {'title': 'PCA Component 2'}
                    }
                }
            )
        ]),
        html.Div([
            html.H2('Prediction Models'),
            dash_table.DataTable(
                id='prediction_table',
                columns=[{'name': i, 'id': i} for i in prediction_results.columns],
                data=prediction_results.to_dict('records')
            )
        ])
    ])
    # app.run replaced the deprecated app.run_server in Dash 2.x
    app.run(debug=True)
# Main execution
if __name__ == "__main__":
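    # A minimal fallback (an assumption: the CSV may be absent in a fresh
    # checkout): write a small synthetic frame first so the pipeline can be
    # smoke-tested end to end. These column names are illustrative only.
    if not os.path.exists('student_data.csv'):
        rng = np.random.default_rng(42)
        pd.DataFrame({
            'gender': rng.choice(['M', 'F'], size=200),
            'interventions': rng.choice(['tutoring', 'mentoring', 'none'], size=200),
            'attendance': rng.integers(50, 100, size=200),
            'skip_class': rng.integers(0, 2, size=200),
            'final_grade': rng.integers(0, 2, size=200),
        }).to_csv('student_data.csv', index=False)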
    # Load dataset
    df = pd.read_csv('student_data.csv')  # replace with your CSV file
    # Preprocess the data
    df = enhanced_preprocessing(df)
    # Correlation analysis
    correlation_data = calculate_correlations(df)
    # Clustering analysis
    df, best_model = perform_clustering(df)
    clustering_data = {'best_model': best_model}
    # Prediction analysis
    prediction_results = pd.DataFrame(perform_predictions(df))
    # Build and launch the dashboard
    create_dashboard(df, correlation_data, clustering_data, prediction_results)