import pandas as pd
import seaborn as sns
import warnings
import dash
from dash import dcc, html, dash_table
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.decomposition import PCA

# Suppress FutureWarnings from the libraries above
warnings.filterwarnings("ignore", category=FutureWarning)

# Set the seaborn theme for better aesthetics
sns.set_theme(style="whitegrid")


def enhanced_preprocessing(df):
    df = df.copy()

    # Handle missing values: 'Unknown' for text columns, the column median
    # for numeric ones (filling numbers with a string would break their dtype)
    object_cols = df.select_dtypes(include=['object']).columns
    df[object_cols] = df[object_cols].fillna('Unknown')
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    # Vectorize the free-text 'interventions' column first so the categorical
    # encoder below does not consume it as an ordinary category
    if 'interventions' in df.columns:
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(df['interventions'].astype(str))
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                                columns=tfidf.get_feature_names_out(),
                                index=df.index)
        df = pd.concat([df, tfidf_df], axis=1).drop('interventions', axis=1)

    # Encode the remaining categorical features
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df[col].nunique() < 20:
            # One-hot encode low-cardinality columns
            one_hot = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, one_hot], axis=1).drop(col, axis=1)
        else:
            # Label-encode high-cardinality columns to avoid a column explosion
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])

    return df


def calculate_correlations(df, threshold=0.3):
    correlations = df.corr(numeric_only=True)
    significant_corr = correlations[abs(correlations) > threshold].stack().reset_index()
    significant_corr = significant_corr[significant_corr['level_0'] != significant_corr['level_1']]
    significant_corr.columns = ['Feature 1', 'Feature 2', 'Correlation']
    return significant_corr


def perform_clustering(df):
    # Standardize the data so distance-based clustering is not dominated by scale
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # Fit both candidate algorithms and keep whichever scores better
    kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    kmeans_labels = kmeans.fit_predict(df_scaled)
    dbscan_labels = dbscan.fit_predict(df_scaled)

    kmeans_score = silhouette_score(df_scaled, kmeans_labels)
    # silhouette_score needs at least two labels; DBSCAN can return only noise
    dbscan_score = silhouette_score(df_scaled, dbscan_labels) if len(set(dbscan_labels)) > 1 else -1

    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'

    # Project onto two principal components for visualization
    pca = PCA(n_components=2)
    pca_components = pca.fit_transform(df_scaled)
    df['PCA1'] = pca_components[:, 0]
    df['PCA2'] = pca_components[:, 1]

    return df, best_model


def perform_predictions(df):
    results = []
    target_cols = [col for col in df.columns if col in ['skip_class', 'final_grade']]
    for target in target_cols:
        # Drop every target column from the features so one target cannot leak into another
        X = df.drop(columns=target_cols)
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Model 1: Random Forest
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train, y_train)
        rf_pred = rf_model.predict(X_test)
        rf_accuracy = accuracy_score(y_test, rf_pred)

        # Model 2: Logistic Regression
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, y_train)
        lr_pred = lr_model.predict(X_test)
        lr_accuracy = accuracy_score(y_test, lr_pred)

        # Keep whichever model scored higher on the held-out set
        if rf_accuracy > lr_accuracy:
            results.append({'Target': target, 'Model': 'Random Forest', 'Accuracy': rf_accuracy})
        else:
            results.append({'Target': target, 'Model': 'Logistic Regression', 'Accuracy': lr_accuracy})
    return results


def create_dashboard(df, correlation_data, clustering_data, prediction_results):
    # clustering_data is a dict with the winning algorithm's name under 'best_model';
    # perform_predictions returns a list of dicts, so convert it for the DataTable
    prediction_df = pd.DataFrame(prediction_results)

    app = dash.Dash(__name__)
    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        html.Div([
            html.H2('Correlation Analysis'),
            dash_table.DataTable(
                id='correlation_table',
                columns=[{'name': i, 'id': i} for i in correlation_data.columns],
                data=correlation_data.to_dict('records')
            )
        ]),
        html.Div([
            html.H2('Clustering Analysis'),
            html.P(f"Best Clustering Algorithm: {clustering_data['best_model']}"),
            dcc.Graph(
                id='clustering_scatter',
                figure={
                    'data': [{
                        'x': df['PCA1'],
                        'y': df['PCA2'],
                        'mode': 'markers',
                        'marker': {'color': df['Cluster'], 'colorscale': 'Viridis', 'size': 10},
                        'text': df['Cluster'],
                        'type': 'scatter'
                    }],
                    'layout': {
                        'title': 'Cluster Visualization using PCA',
                        'xaxis': {'title': 'PCA Component 1'},
                        'yaxis': {'title': 'PCA Component 2'}
                    }
                }
            )
        ]),
        html.Div([
            html.H2('Prediction Models'),
            dash_table.DataTable(
                id='prediction_table',
                columns=[{'name': i, 'id': i} for i in prediction_df.columns],
                data=prediction_df.to_dict('records')
            )
        ])
    ])
    app.run(debug=True)


def load_csv(file):
    # Older Gradio versions pass a tempfile wrapper (use .name); newer ones pass the path itself
    path = file if isinstance(file, str) else file.name
    df = pd.read_csv(path)
    return enhanced_preprocessing(df)


# Main execution: the data is loaded through the Gradio upload widget,
# so no additional file parameter is needed
iface = gr.Interface(
    fn=load_csv,
    inputs=gr.File(label="Upload CSV File"),
    outputs=gr.Dataframe(label="Preview of Uploaded Data"),
    description="Upload a CSV file to perform comprehensive student data analysis."
)

if __name__ == "__main__":
    iface.launch()
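

# A minimal end-to-end sketch of how the pieces above could be chained without the
# Gradio UI, e.g. for batch runs. Everything here is illustrative: 'students.csv'
# is a hypothetical file name, and feeding the clustered frame straight into
# perform_predictions (so 'Cluster', 'PCA1' and 'PCA2' become extra features) is
# one possible choice, not something the original script pins down.
def run_full_analysis(csv_path):
    df = enhanced_preprocessing(pd.read_csv(csv_path))
    correlation_data = calculate_correlations(df)
    clustered_df, best_model = perform_clustering(df)
    prediction_results = perform_predictions(clustered_df)
    create_dashboard(clustered_df, correlation_data,
                     {'best_model': best_model}, prediction_results)

# Usage (hypothetical path): run_full_analysis('students.csv')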