import streamlit as st
import pandas as pd
import numpy as np
import joblib
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor

class GEMMPredictor:
    """Predict GEMM performance metrics from a pre-trained, serialized model.

    The loaded model is expected to expose ``predict(DataFrame)`` and return
    one row per input row, with values ordered like ``target_features``
    (runtime, power, Energy, TFlops).
    """

    def __init__(self, model_path='model.joblib'):
        """Load the model from ``model_path`` and set up the feature lists.

        Args:
            model_path: Path of the joblib file holding the trained model.
        """
        # joblib.load unpickles arbitrary objects: only load trusted files.
        self.stacked_model = joblib.load(model_path)
        self.initialize_features()

    def initialize_features(self):
        """Initialize the feature name lists used by the model."""
        # Core matrix features
        self.core_features = [
            'm', 'n', 'k',
            'blocksize1', 'blocksize2', 'blocksize3'
        ]
        # Derived features (computed by calculate_gemm_characteristics)
        self.derived_features = [
            'arithmetic_intensity',
            'bytes_accessed',
            'total_flops'
        ]
        # Categorical features
        self.categorical_features = ['Layout']
        # Target features, in the order the model emits them
        self.target_features = [
            'runtime',
            'power',
            'Energy',
            'TFlops'
        ]
        self.numerical_features = self.core_features + self.derived_features

    def calculate_gemm_characteristics(self, m, n, k, blocksize1, blocksize2, blocksize3):
        """Calculate GEMM-specific characteristics.

        Works both for scalars (one problem) and for array-likes such as
        pandas Series (one entry per problem).

        Args:
            m, n, k: Matrix dimensions (scalars or array-likes).
            blocksize1, blocksize2, blocksize3: Accepted for interface
                compatibility; they do not enter the calculation.

        Returns:
            dict with ``total_flops``, ``bytes_accessed``,
            ``arithmetic_intensity`` and ``bound_type``. ``bound_type`` is the
            string ``'compute'`` or ``'memory'`` for scalar input, and an
            array of those strings for array-like input.
        """
        total_flops = 2 * m * n * k  # 2 operations per FMA
        bytes_accessed = (m * k + k * n + m * n) * 4  # Single precision
        arithmetic_intensity = total_flops / bytes_accessed

        # NOTE(review): 59 is presumably the roofline ridge point of the
        # target GPU (FLOP/byte) - confirm against the hardware figures.
        if np.ndim(arithmetic_intensity) == 0:
            bound_type = 'compute' if arithmetic_intensity > 59 else 'memory'
        else:
            # A plain ``if`` on a Series/array is ambiguous; classify per element.
            bound_type = np.where(arithmetic_intensity > 59, 'compute', 'memory')

        return {
            'total_flops': total_flops,
            'bytes_accessed': bytes_accessed,
            'arithmetic_intensity': arithmetic_intensity,
            'bound_type': bound_type
        }

    def get_default_numeric_values(self):
        """Return default values for missing numeric features."""
        return {
            # Memory-related defaults
            'total_memory': 12288,  # 12GB for RTX 4070
            'free_memory': 10240,   # Assuming 80% free
            'used_memory': 2048,    # Assuming 20% used
            'mem_util': 20.0,       # 20% utilization
            'mem_util2': 20.0,      # Secondary memory utilization

            # GPU state defaults
            'temp': 65.0,           # Default temperature
            'gpu_util': 80.0,       # Default GPU utilization
            'gpu_util1': 80.0,      # Secondary GPU utilization
            'clock_sm': 2475,       # Default SM clock for RTX 4070
            'power_limit': 200.0,   # Default power limit
            # NOTE(review): 'clocks.meme' looks like a typo for 'clocks.mem';
            # left untouched because it must match the model's training columns.
            'clocks.meme': 2000,    # Memory clock speed

            'alpha': 1.0,           # Default scaling factor
            'beta': 0.0,            # Default scaling factor
            'problem_size_m': 1024,
            'problem_size_n': 1024,
            'problem_size_k': 1024
        }

    def get_default_categorical_values(self):
        """Return default values for missing categorical features."""
        return {
            'stage': 'main',
            'kernel_name': 'cutlass_simt_sgemm_128x128_8x2_nn_align1',
            'computation_pattern': 'GEMM',
            'combination_type': 'standard',
            'state': 'active',
            'uses_shared_memory': 'true',
            'gpu_name': 'RTX4070'
        }

    def prepare_input_data(self, input_dict):
        """Build a single-row DataFrame for prediction.

        Defaults are filled in for every feature missing from ``input_dict``;
        values in ``input_dict`` take precedence. The derived GEMM features
        are then computed and added as columns.

        Args:
            input_dict: Mapping with at least ``m``, ``n``, ``k`` and
                ``blocksize1``..``blocksize3``.

        Returns:
            One-row DataFrame with defaults, inputs and derived features.

        Raises:
            KeyError: If a required dimension or block size is missing.
        """
        complete_input = {
            **self.get_default_numeric_values(),
            **self.get_default_categorical_values(),
        }
        complete_input.update(input_dict)

        df = pd.DataFrame([complete_input])

        characteristics = self.calculate_gemm_characteristics(
            df['m'].iloc[0], df['n'].iloc[0], df['k'].iloc[0],
            df['blocksize1'].iloc[0], df['blocksize2'].iloc[0], df['blocksize3'].iloc[0]
        )

        df['total_flops'] = characteristics['total_flops']
        df['bytes_accessed'] = characteristics['bytes_accessed']
        df['arithmetic_intensity'] = characteristics['arithmetic_intensity']

        for col in self.categorical_features:
            if col in df.columns:
                df[col] = df[col].astype(str)

        for col in self.numerical_features:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        return df

    @staticmethod
    def estimate_power(df):
        """Fill missing ``power`` values with an estimate based on ``total_flops``.

        The estimate scales linearly from BASE_POWER towards MAX_POWER with
        ``total_flops / (MAX_TFLOPS * 1e12)``. ``df`` is modified in place
        (an ``estimated_power`` column is added) and also returned.

        Args:
            df: DataFrame with ``total_flops`` and ``power`` columns.

        Returns:
            The same DataFrame with gaps in ``power`` filled.
        """
        BASE_POWER = 30
        MAX_POWER = 200
        MAX_TFLOPS = 40

        df['estimated_power'] = BASE_POWER + (
            (MAX_POWER - BASE_POWER) *
            (df['total_flops'] / (MAX_TFLOPS * 1e12))
        )

        df['power'] = df['power'].fillna(df['estimated_power'])

        return df

    @staticmethod
    def filter_power_bounds(df):
        """Drop rows whose ``power`` lies outside [MIN_POWER, MAX_POWER].

        Rows with a missing ``power`` are kept.

        Args:
            df: DataFrame with a ``power`` column.

        Returns:
            A new DataFrame containing only the retained rows.
        """
        MIN_POWER = 25  # Minimum idle power
        MAX_POWER = 200 # Maximum TDP

        in_bounds = df['power'].between(MIN_POWER, MAX_POWER)
        # Explicit copy so later column assignments do not hit a view of ``df``.
        return df[in_bounds | df['power'].isna()].copy()

    @staticmethod
    def impute_power(df):
        """Impute missing ``power`` values with a random-forest regressor.

        The regressor is trained on rows with a known ``power`` and applied
        to the rows without one. ``df`` is modified in place (a
        ``total_elements`` column is added) and also returned. When nothing
        is missing, or no row has a known ``power`` to train on, the frame is
        returned without fitting a model.

        Args:
            df: DataFrame with ``m``, ``n``, ``k``, ``power``,
                ``total_flops`` and ``arithmetic_intensity`` columns.

        Returns:
            The same DataFrame with gaps in ``power`` imputed where possible.
        """
        df['total_elements'] = df['m'] * df['n'] * df['k']

        missing_mask = df['power'].isna()
        known = df[~missing_mask]
        # fit()/predict() reject empty inputs, so bail out before touching them.
        if not missing_mask.any() or known.empty:
            return df

        features = ['total_elements', 'total_flops', 'arithmetic_intensity']

        model = RandomForestRegressor(n_estimators=100)
        model.fit(known[features], known['power'])

        df.loc[missing_mask, 'power'] = model.predict(df.loc[missing_mask, features])

        return df

    def preprocess_data(self, df):
        """Preprocess a data set, focusing on GEMM characteristics and power.

        Steps: normalize placeholder strings to NaN, coerce dimensions and
        power to numbers, add the derived GEMM features, fill/impute/filter
        power, then clip each numerical feature to its 1st-99th percentile
        range and fill remaining gaps with the median.

        Args:
            df: Raw DataFrame with at least ``m``, ``n``, ``k``, ``Layout``
                and ``power`` columns. It is not modified.

        Returns:
            The processed copy of ``df``.

        Raises:
            Exception: Any error is printed and re-raised unchanged.
        """
        print("\nPreprocessing data...")

        try:
            df_processed = df.copy()
            df_processed = df_processed.replace('[N/A]', np.nan)
            df_processed = df_processed.replace('', np.nan)

            # Arithmetic below needs real numbers, not strings read from a file.
            for col in self.core_features + ['power']:
                if col in df_processed.columns:
                    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

            characteristics = self.calculate_gemm_characteristics(
                df_processed['m'], df_processed['n'], df_processed['k'],
                df_processed.get('blocksize1'),
                df_processed.get('blocksize2'),
                df_processed.get('blocksize3'),
            )
            for name in self.derived_features:
                df_processed[name] = characteristics[name]

            df_processed['Layout'] = df_processed['Layout'].astype(str)

            df_processed = self.estimate_power(df_processed)
            df_processed = self.impute_power(df_processed)
            df_processed = self.filter_power_bounds(df_processed)

            present_features = [
                col for col in self.numerical_features
                if col in df_processed.columns
            ]
            for col in present_features:
                df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
                lower = df_processed[col].quantile(0.01)
                upper = df_processed[col].quantile(0.99)
                df_processed[col] = df_processed[col].clip(lower, upper)
                df_processed[col] = df_processed[col].fillna(df_processed[col].median())

            print("Data preprocessing completed successfully")
            print("Features summary:")
            print(df_processed[present_features].describe())

            return df_processed

        except Exception as e:
            print(f"Error in preprocess_data: {str(e)}")
            raise

    def predict(self, input_data):
        """Make predictions using the stacked model.

        Args:
            input_data: Mapping with ``m``, ``n``, ``k``,
                ``blocksize1``..``blocksize3`` and any optional features.

        Returns:
            dict with one entry per target feature plus a
            ``characteristics`` entry holding the GEMM characteristics of
            the requested problem.
        """
        df = self.prepare_input_data(input_data)
        predictions = self.stacked_model.predict(df)

        # Map the first (only) output row onto the target feature names
        first_row = predictions[0]
        prediction_dict = {
            target: first_row[i]
            for i, target in enumerate(self.target_features)
        }

        prediction_dict['characteristics'] = self.calculate_gemm_characteristics(
            input_data['m'], input_data['n'], input_data['k'],
            input_data['blocksize1'], input_data['blocksize2'], input_data['blocksize3']
        )

        return prediction_dict

def create_comparison_chart(current_metrics, optimal_metrics):
    """Build a grouped Plotly bar chart contrasting two metric sets.

    Both arguments are mappings that provide the keys ``runtime``, ``power``,
    ``Energy`` and ``TFlops``. Returns the resulting ``go.Figure``.
    """
    axis_labels = ['Runtime (ms)', 'Power (W)', 'Energy (J)', 'TFLOPS']
    # Lookup keys, in the same order as the axis labels above.
    metric_keys = ('runtime', 'power', 'Energy', 'TFlops')

    current_bar = go.Bar(
        name='Current',
        x=axis_labels,
        y=[current_metrics[key] for key in metric_keys],
        marker_color='#ff7c43',
    )
    optimal_bar = go.Bar(
        name='Optimal',
        x=axis_labels,
        y=[optimal_metrics[key] for key in metric_keys],
        marker_color='#00ba38',
    )

    chart = go.Figure(data=[current_bar, optimal_bar])
    chart.update_layout(
        barmode='group',
        title='Performance Comparison',
        xaxis_title='Metrics',
        yaxis_title='Values',
        height=400,
    )
    return chart

def create_heatmap(m, n, k, block_m, block_n):
    """Build a heatmap with one cell per block of the M x N output matrix.

    The grid has ceil(m / block_m) rows and ceil(n / block_n) columns. Cell
    values are random numbers drawn from [0.5, 1.0), so the colours are
    purely illustrative. ``k`` is accepted but not used.
    """
    block_rows = int(np.ceil(m / block_m))
    block_cols = int(np.ceil(n / block_n))

    # Placeholder intensities: one random value per block.
    cell_values = np.random.uniform(0.5, 1.0, (block_rows, block_cols))

    heatmap = go.Heatmap(z=cell_values, colorscale='Viridis', showscale=False)
    figure = go.Figure(data=heatmap)

    layout_options = dict(
        title='Matrix Blocking Visualization',
        xaxis_title='N dimension (columns)',
        yaxis_title='M dimension (rows)',
        height=300,
        margin=dict(l=50, r=50, t=50, b=50),
    )
    figure.update_layout(**layout_options)

    return figure

def create_performance_metrics_chart(predictions, max_tflops=40):
    """Create a gauge chart showing the predicted TFLOPS.

    Args:
        predictions: Mapping that provides the predicted ``TFlops`` value.
        max_tflops: Upper end of the gauge axis; defaults to 40, the
            theoretical peak the file assumes for the RTX 4070.

    Returns:
        A ``go.Figure`` holding a single gauge indicator whose axis is split
        into three equal red / yellow / green bands.
    """
    tflops = predictions['TFlops']
    one_third = max_tflops / 3
    two_thirds = 2 * max_tflops / 3

    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=tflops,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "TFLOPS Performance"},
        gauge={
            'axis': {'range': [None, max_tflops]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, one_third], 'color': "red"},
                {'range': [one_third, two_thirds], 'color': "yellow"},
                {'range': [two_thirds, max_tflops], 'color': "green"}
            ],
            # Marker line drawn at the predicted value itself.
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': tflops
            }
        }
    ))

    fig.update_layout(height=300)
    return fig

def create_efficiency_chart(arithmetic_intensity, mem_bandwidth_utilization, compute_utilization):
    """Create a spider (radar) chart of three efficiency metrics.

    Args:
        arithmetic_intensity: Arithmetic intensity; it is rescaled so that a
            value of 200 maps to 100 on the radial axis.
        mem_bandwidth_utilization: Value plotted as-is on the 0-100 axis.
        compute_utilization: Value plotted as-is on the 0-100 axis.

    Returns:
        A ``go.Figure`` with one filled polar trace.
    """
    categories = ['Arithmetic Intensity', 'Memory BW Utilization', 'Compute Utilization']

    # The radial axis is fixed to [0, 100]; cap the rescaled intensity so
    # large problems (intensity > 200) are not drawn outside the chart.
    intensity_score = min(100, arithmetic_intensity / 200 * 100)

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=[intensity_score, mem_bandwidth_utilization, compute_utilization],
        theta=categories,
        fill='toself',
        name='Current Configuration'
    ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]
            )),
        showlegend=False,
        height=300
    )

    return fig

def main():
    """Render the Streamlit page: collect inputs, run the predictor, show results.

    The model is loaded on every run of the script. Any exception raised
    while building the page or predicting is reported in the UI instead of
    propagating.
    """
    # Reference figures used for the "% of peak" read-outs.
    peak_tflops = 40
    peak_bandwidth = 504 * 1e9
    # Path handed to GEMMPredictor; also quoted in the error hint below so the
    # hint always names the file that was actually requested.
    model_path = 'model.joblib'

    st.set_page_config(page_title="GEMM Performance Predictor", layout="wide")
    st.markdown("""
        <style>
        .main {
            padding: 2rem 1rem;
            max-width: 100%;
        }
        .metric-card {
            background-color: #f0f2f6;
            padding: 1rem;
            border-radius: 0.5rem;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        </style>
    """, unsafe_allow_html=True)
    
    st.title("GEMM Performance Predictor for RTX 4070")
    
    try:
        predictor = GEMMPredictor(model_path)
        col1, col2, col3 = st.columns([1,1,1])
        
        with col1:
            st.subheader("Matrix Dimensions")
            with st.expander("Set Matrix Dimensions", expanded=True):
                m = st.number_input("M", min_value=1, value=512)
                n = st.number_input("N", min_value=1, value=512)
                k = st.number_input("K", min_value=1, value=1024)
        
        with col2:
            st.subheader("Block Sizes")
            with st.expander("Set Block Dimensions", expanded=True):
                blocksize1 = st.number_input("Block Size 1", min_value=1, value=512)
                blocksize2 = st.number_input("Block Size 2", min_value=1, value=128)
                blocksize3 = st.number_input("Block Size 3", min_value=1, value=512)
        
        with col3:
            st.subheader("Configuration")
            with st.expander("Additional Settings", expanded=True):
                layout = st.selectbox("Matrix Layout", ['nn', 'nt', 'tn', 'tt'])
                kernel_name = st.selectbox(
                    "CUTLASS Kernel",
                    [
                        'cutlass_simt_sgemm_128x128_8x2_nn_align1',
                        'cutlass_simt_sgemm_128x128_8x2_nt_align1',
                        'cutlass_simt_sgemm_128x128_8x2_tn_align1',
                        'cutlass_simt_sgemm_128x128_8x2_tt_align1'
                    ]
                )
                alpha = st.number_input("Alpha Scalar", value=1.00, step=0.25)
                beta = st.number_input("Beta Scalar", value=0.50, step=0.25)
        
        if st.button("Analyze Performance", use_container_width=True):
            with st.spinner("Analyzing performance..."):
                input_data = {
                    'm': m, 'n': n, 'k': k,
                    'blocksize1': blocksize1,
                    'blocksize2': blocksize2,
                    'blocksize3': blocksize3,
                    'Layout': layout,
                    'kernel_name': kernel_name,
                    'alpha': alpha,
                    'beta': beta
                }
                predictions = predictor.predict(input_data)
                characteristics = predictions['characteristics']

                # Shared by all three tabs, so compute them once up front
                # instead of relying on a value assigned inside another tab.
                efficiency = (predictions['TFlops'] / peak_tflops) * 100
                # NOTE(review): this divides a byte count by a bytes-per-second
                # figure without involving the runtime - confirm the intended
                # definition of "memory efficiency".
                memory_efficiency = min(100, characteristics['bytes_accessed'] / peak_bandwidth * 100)

                # Ceiling division: a partially filled block still needs a
                # grid cell, matching the blocking heatmap.
                grid_m = -(-m // blocksize1)
                grid_n = -(-n // blocksize2)
                
                tab1, tab2, tab3 = st.tabs(["Performance Metrics", "Detailed Analysis", "Visualizations"])
                
                with tab1:
                    st.subheader("GEMM Characteristics")
                    metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
                    
                    with metric_col1:
                        st.metric(
                            "Arithmetic Intensity",
                            f"{characteristics['arithmetic_intensity']:.2f}",
                            f"{characteristics['bound_type'].upper()} bound"
                        )
                    
                    with metric_col2:
                        st.metric(
                            "Total FLOPS",
                            f"{characteristics['total_flops']/1e9:.2f}G",
                            "Operations"
                        )
                    
                    with metric_col3:
                        st.metric(
                            "Memory Accessed",
                            f"{characteristics['bytes_accessed']/1e6:.2f}MB",
                            "Total Data Movement"
                        )
                    
                    with metric_col4:
                        st.metric(
                            "Memory Efficiency",
                            f"{memory_efficiency:.1f}%",
                            "vs Peak Bandwidth"
                        )
                    
                    st.markdown("---")
                    
                    perf_col1, perf_col2, perf_col3, perf_col4 = st.columns(4)
                    
                    # Predictions are floored before display so a negative
                    # model output never shows up as a negative metric.
                    with perf_col1:
                        st.metric(
                            "Runtime",
                            f"{max(0.01, predictions['runtime']):.2f} ms",
                            "Execution Time"
                        )
                    
                    with perf_col2:
                        st.metric(
                            "Power",
                            f"{max(1.0, predictions['power']):.2f} W",
                            "Power Consumption"
                        )
                    
                    with perf_col3:
                        st.metric(
                            "Energy",
                            f"{max(0.01, predictions['Energy']):.2f} J", 
                            "Total Energy"
                        )
                    
                    with perf_col4:
                        st.metric(
                            "TFLOPS",
                            f"{predictions['TFlops']:.2f}", 
                            f"{efficiency:.1f}% of Peak"
                        )
                
                with tab2:
                    st.subheader("Detailed Performance Analysis")
                    
                    col1, col2 = st.columns(2)
                    
                    with col1:
                        st.markdown("#### Matrix Configuration")
                        st.markdown(f"""
                        - Total Matrix Elements: {m*n:,}
                        - Memory Footprint: {characteristics['bytes_accessed']/1e6:.2f} MB
                        - Block Dimensions: {blocksize1}x{blocksize2}x{blocksize3}
                        - Grid Size: {grid_m}x{grid_n} blocks
                        """)
                    
                    with col2:
                        st.markdown("#### Performance Bottlenecks")
                        ai = characteristics['arithmetic_intensity']
                        if ai > 59:
                            st.success("✅ Compute Bound - Optimal for GPU")
                        else:
                            st.warning("⚠️ Memory Bound - Consider Optimization")
                        
                        if efficiency < 30:
                            st.error("🔴 Low Compute Efficiency - Check Configuration")
                        elif efficiency < 60:
                            st.warning("🟡 Moderate Efficiency - Room for Improvement")
                        else:
                            st.success("🟢 Good Efficiency")
                
                with tab3:
                    st.subheader("Performance Visualizations")
                    
                    viz_col1, viz_col2 = st.columns(2)
                    
                    with viz_col1:
                        st.plotly_chart(create_performance_metrics_chart(predictions), use_container_width=True)
                    
                    with viz_col2:
                        compute_util = min(100, efficiency)
                        st.plotly_chart(
                            create_efficiency_chart(
                                characteristics['arithmetic_intensity'],
                                memory_efficiency,
                                compute_util
                            ),
                            use_container_width=True
                        )
                    
                    st.plotly_chart(create_heatmap(m, n, k, blocksize1, blocksize2), use_container_width=True)
                    
                    st.markdown("### Recommendations")
                    
                    recommendations = []
                    if blocksize1 * blocksize2 > 1024:
                        recommendations.append("⚠️ Block size might be too large for optimal occupancy")
                    if characteristics['arithmetic_intensity'] < 30:
                        recommendations.append("Consider increasing arithmetic intensity through blocking")
                    if efficiency < 50:
                        recommendations.append("Performance is below 50% of peak - try different block sizes")
                    
                    if recommendations:
                        for rec in recommendations:
                            st.markdown(f"- {rec}")
                    else:
                        st.success("Current configuration appears optimal!")
                
    except Exception as e:
        # Top-level UI boundary: surface the problem in the page.
        st.error(f"An error occurred: {str(e)}")
        st.write(f"Please make sure the model file '{model_path}' is in the correct directory.")
        st.write("If the error persists, check the input parameters and model compatibility.")

# Build the page only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()