import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import tempfile
import os
import dash
from dash import dcc
from dash import html
from dash import dash_table
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import classification_report, accuracy_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Suppress specific FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)
# Set seaborn style for better aesthetics
sns.set(style="whitegrid")
def enhanced_preprocessing(df):
    # Handle missing values: fill numeric columns with their median so the
    # string placeholder below does not corrupt their dtype
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    df = df.fillna('Unknown')
    # Vectorize free-text columns (example: interventions column) first, so the
    # text is not label- or one-hot encoded away by the categorical loop below
    if 'interventions' in df.columns:
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(df['interventions'].astype(str))
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                                columns=tfidf.get_feature_names_out(),
                                index=df.index)
        df = pd.concat([df, tfidf_df], axis=1).drop('interventions', axis=1)
    # Encode the remaining categorical features
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df[col].nunique() < 20:  # Label Encoding for columns with low cardinality
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])
        else:  # One-Hot Encoding for high-cardinality features
            one_hot = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, one_hot], axis=1).drop(col, axis=1)
    return df
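# Illustrative sketch (defined but never called by the app): how
# enhanced_preprocessing behaves on a tiny hypothetical frame. The column
# names and values are invented for the example.
def _demo_enhanced_preprocessing():
    sample = pd.DataFrame({
        'grade_level': ['9th', '10th', '9th', None],      # low cardinality -> label encoded
        'attendance_rate': [0.95, 0.80, None, 0.99],      # numeric NaN -> median fill
        'interventions': ['tutoring', 'none', 'counseling tutoring', 'none'],
    })
    # 'grade_level' becomes integer codes; 'interventions' becomes TF-IDF columns
    return enhanced_preprocessing(sample)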
def calculate_correlations(df, threshold=0.3):
    # Pairwise correlations over the numeric columns only
    correlations = df.corr(numeric_only=True)
    significant_corr = correlations[abs(correlations) > threshold].stack().reset_index()
    # Drop the trivial self-correlations on the diagonal
    significant_corr = significant_corr[significant_corr['level_0'] != significant_corr['level_1']]
    significant_corr.columns = ['Feature 1', 'Feature 2', 'Correlation']
    return significant_corr
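# Usage sketch: on a preprocessed frame,
#   calculate_correlations(df, threshold=0.5)
# returns a long-format table of feature pairs whose absolute correlation
# exceeds 0.5; lower thresholds surface weaker relationships.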
def perform_clustering(df):
    # Normalize the data for clustering
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    # Fit both candidate algorithms and keep whichever silhouette score is higher
    kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    kmeans_labels = kmeans.fit_predict(df_scaled)
    dbscan_labels = dbscan.fit_predict(df_scaled)
    # Silhouette score needs at least two clusters; DBSCAN may flag everything as noise
    kmeans_score = silhouette_score(df_scaled, kmeans_labels)
    dbscan_score = silhouette_score(df_scaled, dbscan_labels) if len(set(dbscan_labels)) > 1 else -1
    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'
    # Project onto two principal components for visualization
    pca = PCA(n_components=2)
    pca_components = pca.fit_transform(df_scaled)
    df['PCA1'] = pca_components[:, 0]
    df['PCA2'] = pca_components[:, 1]
    return df, best_model
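# Illustrative sketch (not called anywhere): clustering synthetic blobs to show
# the expected return shape. make_blobs and the parameters are example choices.
def _demo_perform_clustering():
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=200, centers=4, random_state=42)
    demo_df = pd.DataFrame(X, columns=['feature_a', 'feature_b'])
    clustered, best_model = perform_clustering(demo_df)
    # clustered now carries 'Cluster', 'PCA1' and 'PCA2' columns
    return clustered, best_model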
def perform_predictions(df):
    results = []
    target_cols = [col for col in df.columns if col in ['skip_class', 'final_grade']]
    for target in target_cols:
        # Drop every target column from the features so one target cannot leak into the other
        X = df.drop(columns=target_cols)
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Model 1: Random Forest
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train, y_train)
        rf_pred = rf_model.predict(X_test)
        rf_accuracy = accuracy_score(y_test, rf_pred)
        # Model 2: Logistic Regression
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, y_train)
        lr_pred = lr_model.predict(X_test)
        lr_accuracy = accuracy_score(y_test, lr_pred)
        # Report whichever model generalized better on the held-out split
        if rf_accuracy > lr_accuracy:
            results.append({'Target': target, 'Model': 'Random Forest', 'Accuracy': rf_accuracy})
        else:
            results.append({'Target': target, 'Model': 'Logistic Regression', 'Accuracy': lr_accuracy})
    return results
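# Illustrative sketch: perform_predictions expects encoded targets named
# 'skip_class' or 'final_grade'; the toy frame below is invented for the example.
def _demo_perform_predictions():
    rng = np.random.default_rng(42)
    demo_df = pd.DataFrame({
        'hours_studied': rng.integers(0, 20, 200),
        'attendance_rate': rng.random(200),
        'skip_class': rng.integers(0, 2, 200),  # binary target
    })
    # Returns e.g. [{'Target': 'skip_class', 'Model': ..., 'Accuracy': ...}]
    return perform_predictions(demo_df)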
def create_dashboard(df, correlation_data, best_model, prediction_results):
    # perform_predictions returns a list of dicts; convert it for the DataTable
    prediction_df = pd.DataFrame(prediction_results)
    app = dash.Dash(__name__)
    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        html.Div([
            html.H2('Correlation Analysis'),
            dash_table.DataTable(
                id='correlation_table',
                columns=[{'name': i, 'id': i} for i in correlation_data.columns],
                data=correlation_data.to_dict('records')
            )
        ]),
        html.Div([
            html.H2('Clustering Analysis'),
            html.P(f"Best Clustering Algorithm: {best_model}"),
            dcc.Graph(
                id='clustering_scatter',
                figure={
                    'data': [
                        {
                            'x': df['PCA1'],
                            'y': df['PCA2'],
                            'mode': 'markers',
                            'marker': {'color': df['Cluster'], 'colorscale': 'Viridis', 'size': 10},
                            'text': df['Cluster'],
                            'type': 'scatter'
                        }
                    ],
                    'layout': {
                        'title': 'Cluster Visualization using PCA',
                        'xaxis': {'title': 'PCA Component 1'},
                        'yaxis': {'title': 'PCA Component 2'}
                    }
                }
            )
        ]),
        html.Div([
            html.H2('Prediction Models'),
            dash_table.DataTable(
                id='prediction_table',
                columns=[{'name': i, 'id': i} for i in prediction_df.columns],
                data=prediction_df.to_dict('records')
            )
        ])
    ])
    app.run(debug=True)  # on Dash < 2.7 use app.run_server(debug=True)
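# Sketch of how the pieces could be wired together end to end. This is not
# invoked by the Gradio app below; the CSV path is a placeholder.
def _run_full_analysis(csv_path):
    df = enhanced_preprocessing(pd.read_csv(csv_path))
    correlation_data = calculate_correlations(df)
    df, best_model = perform_clustering(df)
    prediction_results = perform_predictions(df)
    create_dashboard(df, correlation_data, best_model, prediction_results)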
def load_csv(file):
    # gr.File may pass a tempfile-like object or a plain path string,
    # depending on the Gradio version
    path = file if isinstance(file, str) else file.name
    df = pd.read_csv(path)
    df = enhanced_preprocessing(df)
    return df

# Main execution: the data is loaded through the Gradio upload widget,
# so no additional file parameter is needed.
iface = gr.Interface(
    fn=load_csv,
    inputs=gr.File(label="Upload CSV File"),
    outputs=gr.Dataframe(label="Preview of Uploaded Data"),
    description="Upload a CSV file to perform comprehensive student data analysis."
)

if __name__ == "__main__":
    iface.launch()