import warnings

import numpy as np
import pandas as pd
from dash import Dash, dcc, html, dash_table
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppress specific FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


def enhanced_preprocessing(df):
    # Handle missing values: medians for numeric columns, a sentinel for text
    # (a blanket fillna('Unknown') would silently turn numeric columns into objects)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    df = df.fillna('Unknown')

    # Vectorize the free-text column first (example: interventions column),
    # before the categorical encoders below can turn it into integers
    if 'interventions' in df.columns:
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(df['interventions'].astype(str))
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                                columns=tfidf.get_feature_names_out(),
                                index=df.index)
        df = pd.concat([df, tfidf_df], axis=1).drop('interventions', axis=1)

    # Encode the remaining categorical features
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df[col].nunique() < 20:
            # Label encoding for columns with low cardinality
            df[col] = LabelEncoder().fit_transform(df[col])
        else:
            # One-hot encoding for high-cardinality features
            one_hot = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, one_hot], axis=1).drop(col, axis=1)
    return df


def calculate_correlations(df, threshold=0.3):
    correlations = df.corr(numeric_only=True)
    significant_corr = correlations[abs(correlations) > threshold].stack().dropna().reset_index()
    significant_corr = significant_corr[significant_corr['level_0'] != significant_corr['level_1']]
    significant_corr.columns = ['Feature 1', 'Feature 2', 'Correlation']
    return significant_corr


def perform_clustering(df):
    # Normalize the data for clustering
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # Run both algorithms and keep whichever scores better on silhouette
    kmeans_labels = KMeans(n_clusters=4, random_state=42, n_init=10).fit_predict(df_scaled)
    dbscan_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(df_scaled)

    kmeans_score = silhouette_score(df_scaled, kmeans_labels)
    # Silhouette is undefined when DBSCAN yields a single cluster
    dbscan_score = silhouette_score(df_scaled, dbscan_labels) if len(set(dbscan_labels)) > 1 else -1

    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'

    # Project onto two principal components for visualization
    pca_components = PCA(n_components=2).fit_transform(df_scaled)
    df['PCA1'] = pca_components[:, 0]
    df['PCA2'] = pca_components[:, 1]
    return df, best_model
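
# A minimal sketch, not part of the original pipeline: the fixed n_clusters=4
# above is an assumption. If it fits the data poorly, one common refinement is
# to sweep k and keep the best silhouette score. The helper name and the k
# range are illustrative only.
def suggest_kmeans_k(df_scaled, k_range=range(2, 11)):
    best_k, best_score = None, -1.0
    for k in k_range:
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(df_scaled)
        score = silhouette_score(df_scaled, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k, best_score
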
def perform_predictions(df):
    results = []
    target_cols = [col for col in df.columns if col in ['skip_class', 'final_grade']]
    # Drop every target (plus the derived cluster/PCA columns) from the
    # feature matrix so one target cannot leak into the model for another
    feature_df = df.drop(columns=target_cols + ['Cluster', 'PCA1', 'PCA2'], errors='ignore')
    for target in target_cols:
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(feature_df, y, test_size=0.2, random_state=42)

        # Model 1: Random Forest
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train, y_train)
        rf_accuracy = accuracy_score(y_test, rf_model.predict(X_test))

        # Model 2: Logistic Regression
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, y_train)
        lr_accuracy = accuracy_score(y_test, lr_model.predict(X_test))

        # Report whichever model scored higher for this target
        if rf_accuracy > lr_accuracy:
            results.append({'Target': target, 'Model': 'Random Forest', 'Accuracy': rf_accuracy})
        else:
            results.append({'Target': target, 'Model': 'Logistic Regression', 'Accuracy': lr_accuracy})
    return results


def create_dashboard(df, correlation_data, clustering_data, prediction_results):
    app = Dash(__name__)
    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        html.Div([
            html.H2('Correlation Analysis'),
            dash_table.DataTable(
                id='correlation_table',
                columns=[{'name': i, 'id': i} for i in correlation_data.columns],
                data=correlation_data.to_dict('records')
            )
        ]),
        html.Div([
            html.H2('Clustering Analysis'),
            html.P(f'Best Clustering Algorithm: {clustering_data["best_model"]}'),
            dcc.Graph(
                id='clustering_scatter',
                figure={
                    'data': [{
                        'x': df['PCA1'],
                        'y': df['PCA2'],
                        'mode': 'markers',
                        'marker': {'color': df['Cluster'], 'colorscale': 'Viridis', 'size': 10},
                        'text': df['Cluster'],
                        'type': 'scatter'
                    }],
                    'layout': {
                        'title': 'Cluster Visualization using PCA',
                        'xaxis': {'title': 'PCA Component 1'},
                        'yaxis': {'title': 'PCA Component 2'}
                    }
                }
            )
        ]),
        html.Div([
            html.H2('Prediction Models'),
            dash_table.DataTable(
                id='prediction_table',
                columns=[{'name': i, 'id': i} for i in prediction_results.columns],
                data=prediction_results.to_dict('records')
            )
        ])
    ])
    app.run(debug=True)


# Main execution
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv('student_data.csv')  # Replace with your CSV file

    # Preprocess the data
    df = enhanced_preprocessing(df)

    # Perform correlation analysis
    correlation_data = calculate_correlations(df)

    # Perform clustering analysis
    df, best_model = perform_clustering(df)
    clustering_data = {'best_model': best_model}

    # Perform prediction analysis
    prediction_results = pd.DataFrame(perform_predictions(df))

    # Create and launch the dashboard
    create_dashboard(df, correlation_data, clustering_data, prediction_results)
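

# A minimal smoke-test sketch, not part of the original script: it writes a
# small synthetic student_data.csv so the pipeline can be exercised end to end
# without real data. The columns 'interventions', 'skip_class', and
# 'final_grade' mirror what the code above expects; the remaining columns,
# the function name, and all values are hypothetical. If used, call it once
# (e.g., from a REPL) before running this script.
def write_synthetic_csv(path='student_data.csv', n=200):
    rng = np.random.default_rng(42)
    pd.DataFrame({
        'age': rng.integers(15, 19, size=n),
        'attendance_rate': rng.uniform(0.5, 1.0, size=n).round(2),
        'interventions': rng.choice(['tutoring', 'counseling mentoring', 'none'], size=n),
        'skip_class': rng.integers(0, 2, size=n),
        'final_grade': rng.choice(['A', 'B', 'C', 'D'], size=n),
    }).to_csv(path, index=False)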