"""Streamlit dashboard that predicts GEMM runtime, power, energy and TFLOPS.

The predictions come from a serialized model loaded with joblib; the rest of
the module derives simple GEMM characteristics and renders them with plotly.
"""

import joblib
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
from sklearn.ensemble import RandomForestRegressor

# File name the predictor loads when no explicit path is given.
DEFAULT_MODEL_PATH = 'model.joblib'
# Reference peak used to express predicted TFLOPS as a percentage
# (labelled "RTX 4070 theoretical max" by the original author).
PEAK_TFLOPS = 40
# Reference value, in bytes, that the UI labels "Peak Bandwidth".
PEAK_MEM_BANDWIDTH = 504 * 1e9
# Arithmetic intensity above which a problem is reported as compute bound.
COMPUTE_BOUND_THRESHOLD = 59
# Element size in bytes (single precision).
BYTES_PER_ELEMENT = 4


def _ceil_div(numerator, denominator):
    """Return ``numerator / denominator`` rounded up, using integer arithmetic."""
    return -(-numerator // denominator)


def _memory_utilization_pct(bytes_accessed):
    """Return bytes accessed as a percentage of PEAK_MEM_BANDWIDTH, capped at 100.

    NOTE(review): this divides a byte count by a per-second bandwidth without
    involving the runtime, so it is not an achieved-bandwidth figure. The
    formula is kept as the original had it -- confirm the intended definition.
    """
    return min(100, bytes_accessed / PEAK_MEM_BANDWIDTH * 100)


class GEMMPredictor:
    """Wrap a joblib-serialized model and the feature preparation it needs."""

    def __init__(self, model_path=DEFAULT_MODEL_PATH):
        """Load the model from ``model_path`` and set up the feature lists.

        Raises whatever ``joblib.load`` raises when the file cannot be read.
        """
        self.stacked_model = joblib.load(model_path)
        self.initialize_features()

    def initialize_features(self):
        """Initialize features used by the model"""
        # Core matrix features
        self.core_features = [
            'm', 'n', 'k',
            'blocksize1', 'blocksize2', 'blocksize3',
        ]
        # Derived features
        self.derived_features = [
            'arithmetic_intensity',
            'bytes_accessed',
            'total_flops',
        ]
        # Categorical features
        self.categorical_features = ['Layout']
        # Target features, in the order the model is expected to emit them
        self.target_features = ['runtime', 'power', 'Energy', 'TFlops']
        self.numerical_features = self.core_features + self.derived_features

    def calculate_gemm_characteristics(self, m, n, k, blocksize1, blocksize2, blocksize3):
        """Calculate GEMM-specific characteristics for one problem size.

        Args:
            m, n, k: Matrix dimensions.
            blocksize1, blocksize2, blocksize3: Accepted for interface
                compatibility; they do not influence the result.

        Returns:
            dict with ``total_flops``, ``bytes_accessed``,
            ``arithmetic_intensity`` and ``bound_type`` ('compute' or 'memory').
        """
        total_flops = 2 * m * n * k  # 2 operations per FMA
        bytes_accessed = (m * k + k * n + m * n) * BYTES_PER_ELEMENT
        # A degenerate all-zero problem moves no data; report 0 intensity
        # instead of dividing by zero.
        arithmetic_intensity = total_flops / bytes_accessed if bytes_accessed else 0.0
        bound_type = 'compute' if arithmetic_intensity > COMPUTE_BOUND_THRESHOLD else 'memory'

        return {
            'total_flops': total_flops,
            'bytes_accessed': bytes_accessed,
            'arithmetic_intensity': arithmetic_intensity,
            'bound_type': bound_type,
        }

    @staticmethod
    def _add_gemm_characteristics(df):
        """Add the derived GEMM columns to ``df`` (in place) and return it.

        Vectorised counterpart of ``calculate_gemm_characteristics`` for a
        whole DataFrame with ``m``, ``n`` and ``k`` columns.
        """
        m = pd.to_numeric(df['m'], errors='coerce')
        n = pd.to_numeric(df['n'], errors='coerce')
        k = pd.to_numeric(df['k'], errors='coerce')

        df['total_flops'] = 2 * m * n * k
        df['bytes_accessed'] = (m * k + k * n + m * n) * BYTES_PER_ELEMENT
        # Zero bytes would yield inf/NaN noise; turn it into a plain NaN that
        # the later median fill can handle.
        df['arithmetic_intensity'] = df['total_flops'] / df['bytes_accessed'].replace(0, np.nan)
        return df

    def get_default_numeric_values(self):
        """Return default values for missing numeric features"""
        return {
            # Memory-related defaults
            'total_memory': 12288,  # 12GB for RTX 4070
            'free_memory': 10240,   # Assuming 80% free
            'used_memory': 2048,    # Assuming 20% used
            'mem_util': 20.0,       # 20% utilization
            'mem_util2': 20.0,      # Secondary memory utilization
            # GPU state defaults
            'temp': 65.0,           # Default temperature
            'gpu_util': 80.0,       # Default GPU utilization
            'gpu_util1': 80.0,      # Secondary GPU utilization
            'clock_sm': 2475,       # Default SM clock for RTX 4070
            'power_limit': 200.0,   # Default power limit
            # NOTE(review): key is spelled 'clocks.meme' -- confirm it matches
            # the column name the model was trained with before renaming.
            'clocks.meme': 2000,    # Memory clock speed
            'alpha': 1.0,           # Default scaling factor
            'beta': 0.0,            # Default scaling factor
            'problem_size_m': 1024,
            'problem_size_n': 1024,
            'problem_size_k': 1024,
        }

    def get_default_categorical_values(self):
        """Return default values for missing categorical features"""
        return {
            'stage': 'main',
            'kernel_name': 'cutlass_simt_sgemm_128x128_8x2_nn_align1',
            'computation_pattern': 'GEMM',
            'combination_type': 'standard',
            'state': 'active',
            'uses_shared_memory': 'true',
            'gpu_name': 'RTX4070',
        }

    def prepare_input_data(self, input_dict):
        """Build a single-row DataFrame for prediction.

        Defaults are filled in for every feature missing from ``input_dict``,
        the derived GEMM columns are added, categorical columns are cast to
        ``str`` and numerical columns are coerced to numbers.
        """
        complete_input = {
            **self.get_default_numeric_values(),
            **self.get_default_categorical_values(),
        }
        complete_input.update(input_dict)

        df = pd.DataFrame([complete_input])

        row = df.iloc[0]
        characteristics = self.calculate_gemm_characteristics(
            row['m'], row['n'], row['k'],
            row['blocksize1'], row['blocksize2'], row['blocksize3'],
        )
        for name in ('total_flops', 'bytes_accessed', 'arithmetic_intensity'):
            df[name] = characteristics[name]

        for col in self.categorical_features:
            if col in df.columns:
                df[col] = df[col].astype(str)

        for col in self.numerical_features:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        return df

    @staticmethod
    def estimate_power(df):
        """Fill missing ``power`` values with a linear estimate from ``total_flops``.

        Adds an ``estimated_power`` column, mutates ``df`` in place and
        returns it. Requires ``power`` and ``total_flops`` columns.
        """
        BASE_POWER = 30
        MAX_POWER = 200
        MAX_TFLOPS = 40

        df['estimated_power'] = BASE_POWER + (
            (MAX_POWER - BASE_POWER) * (df['total_flops'] / (MAX_TFLOPS * 1e12))
        )
        df['power'] = df['power'].fillna(df['estimated_power'])
        return df

    @staticmethod
    def filter_power_bounds(df):
        """Return the rows whose ``power`` is within bounds or still missing."""
        MIN_POWER = 25   # Minimum idle power
        MAX_POWER = 200  # Maximum TDP

        in_bounds = df['power'].between(MIN_POWER, MAX_POWER)
        return df[in_bounds | df['power'].isna()]

    @staticmethod
    def impute_power(df):
        """Impute missing ``power`` values with a random forest fit on known rows.

        Adds a ``total_elements`` column, mutates ``df`` in place and returns
        it. When there is nothing to impute, or nothing to learn from, ``df``
        is returned without fitting a model.
        """
        df['total_elements'] = df['m'] * df['n'] * df['k']
        features = ['total_elements', 'total_flops', 'arithmetic_intensity']

        has_features = df[features].notna().all(axis=1)
        missing = df['power'].isna()
        train_rows = has_features & ~missing
        target_rows = has_features & missing

        # Fitting or predicting on zero rows raises inside scikit-learn, so
        # bail out early in either case.
        if not train_rows.any() or not target_rows.any():
            return df

        model = RandomForestRegressor(n_estimators=100)
        model.fit(df.loc[train_rows, features], df.loc[train_rows, 'power'])
        df.loc[target_rows, 'power'] = model.predict(df.loc[target_rows, features])
        return df

    def preprocess_data(self, df):
        """Preprocess data focusing on GEMM characteristics with improved power handling

        Works on a copy of ``df``: normalises missing markers, adds derived
        columns, fills and bounds ``power``, then clips every numerical
        feature to its 1st-99th percentile and fills gaps with the median.
        Any exception is printed and re-raised.
        """
        print("\nPreprocessing data...")

        try:
            df_processed = df.copy()
            df_processed = df_processed.replace('[N/A]', np.nan)
            df_processed = df_processed.replace('', np.nan)

            df_processed = self._add_gemm_characteristics(df_processed)

            for col in self.categorical_features:
                if col in df_processed.columns:
                    df_processed[col] = df_processed[col].astype(str)

            # The power helpers need a numeric 'power' column to exist.
            if 'power' not in df_processed.columns:
                df_processed['power'] = np.nan
            df_processed['power'] = pd.to_numeric(df_processed['power'], errors='coerce')

            # NOTE(review): estimate_power runs first, so impute_power only
            # sees rows the estimate could not fill -- confirm intended order.
            df_processed = self.estimate_power(df_processed)
            df_processed = self.impute_power(df_processed)
            df_processed = self.filter_power_bounds(df_processed)

            present = [c for c in self.numerical_features if c in df_processed.columns]
            for col in present:
                df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
                lower = df_processed[col].quantile(0.01)
                upper = df_processed[col].quantile(0.99)
                df_processed[col] = df_processed[col].clip(lower, upper)
                df_processed[col] = df_processed[col].fillna(df_processed[col].median())

            print("Data preprocessing completed successfully")
            print("Features summary:")
            print(df_processed[present].describe())

            return df_processed

        except Exception as e:
            print(f"Error in preprocess_data: {str(e)}")
            raise

    def predict(self, input_data):
        """Make predictions using the stacked model

        Returns a dict with one entry per target feature plus a
        ``characteristics`` entry holding the derived GEMM characteristics.
        """
        df = self.prepare_input_data(input_data)
        predictions = self.stacked_model.predict(df)

        # Map predictions to target features.
        # Assumes the model returns one row with one column per target --
        # TODO confirm against the trained model.
        prediction_dict = {
            target: predictions[0][i]
            for i, target in enumerate(self.target_features)
        }

        prediction_dict['characteristics'] = self.calculate_gemm_characteristics(
            input_data['m'], input_data['n'], input_data['k'],
            input_data['blocksize1'], input_data['blocksize2'],
            input_data['blocksize3'],
        )

        return prediction_dict


def create_comparison_chart(current_metrics, optimal_metrics):
    """Create a grouped bar chart comparing two sets of predicted metrics.

    Both arguments are mappings with ``runtime``, ``power``, ``Energy`` and
    ``TFlops`` keys.
    """
    metric_keys = ['runtime', 'power', 'Energy', 'TFlops']
    metrics = ['Runtime (ms)', 'Power (W)', 'Energy (J)', 'TFLOPS']

    current_values = [current_metrics[key] for key in metric_keys]
    optimal_values = [optimal_metrics[key] for key in metric_keys]

    fig = go.Figure(data=[
        go.Bar(name='Current', x=metrics, y=current_values, marker_color='#ff7c43'),
        go.Bar(name='Optimal', x=metrics, y=optimal_values, marker_color='#00ba38'),
    ])
    fig.update_layout(
        barmode='group',
        title='Performance Comparison',
        xaxis_title='Metrics',
        yaxis_title='Values',
        height=400,
    )
    return fig


def create_heatmap(m, n, k, block_m, block_n):
    """Create a heatmap visualization of the matrix blocking

    The grid has ``ceil(m / block_m)`` rows and ``ceil(n / block_n)`` columns.
    Cell values are random placeholders in [0.5, 1.0); ``k`` is unused.
    """
    grid_m = int(np.ceil(m / block_m))
    grid_n = int(np.ceil(n / block_n))

    grid = np.random.uniform(0.5, 1.0, (grid_m, grid_n))

    fig = go.Figure(data=go.Heatmap(
        z=grid,
        colorscale='Viridis',
        showscale=False,
    ))
    fig.update_layout(
        title='Matrix Blocking Visualization',
        xaxis_title='N dimension (columns)',
        yaxis_title='M dimension (rows)',
        height=300,
        margin=dict(l=50, r=50, t=50, b=50),
    )
    return fig


def create_performance_metrics_chart(predictions):
    """Create a gauge chart showing predicted TFLOPS against PEAK_TFLOPS."""
    max_tflops = PEAK_TFLOPS
    tflops = predictions['TFlops']

    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=tflops,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "TFLOPS Performance"},
        gauge={
            'axis': {'range': [None, max_tflops]},
            'bar': {'color': "darkblue"},
            # Three equal bands across the gauge range.
            'steps': [
                {'range': [0, max_tflops / 3], 'color': "red"},
                {'range': [max_tflops / 3, 2 * max_tflops / 3], 'color': "yellow"},
                {'range': [2 * max_tflops / 3, max_tflops], 'color': "green"},
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': tflops,
            },
        },
    ))
    fig.update_layout(height=300)
    return fig


def create_efficiency_chart(arithmetic_intensity, mem_bandwidth_utilization,
                            compute_utilization):
    """Create a spider chart showing various efficiency metrics

    ``arithmetic_intensity`` is scaled by 1/200 onto the 0-100 radial axis;
    the two utilization arguments are plotted as given.
    """
    categories = ['Arithmetic Intensity', 'Memory BW Utilization',
                  'Compute Utilization']

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=[arithmetic_intensity / 200 * 100, mem_bandwidth_utilization,
           compute_utilization],
        theta=categories,
        fill='toself',
        name='Current Configuration',
    ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
        showlegend=False,
        height=300,
    )
    return fig


def _collect_inputs():
    """Render the input widgets and return the dict passed to the predictor."""
    dims_col, blocks_col, config_col = st.columns([1, 1, 1])

    with dims_col:
        st.subheader("Matrix Dimensions")
        with st.expander("Set Matrix Dimensions", expanded=True):
            m = st.number_input("M", min_value=1, value=512)
            n = st.number_input("N", min_value=1, value=512)
            k = st.number_input("K", min_value=1, value=1024)

    with blocks_col:
        st.subheader("Block Sizes")
        with st.expander("Set Block Dimensions", expanded=True):
            blocksize1 = st.number_input("Block Size 1", min_value=1, value=512)
            blocksize2 = st.number_input("Block Size 2", min_value=1, value=128)
            blocksize3 = st.number_input("Block Size 3", min_value=1, value=512)

    with config_col:
        st.subheader("Configuration")
        with st.expander("Additional Settings", expanded=True):
            layout = st.selectbox("Matrix Layout", ['nn', 'nt', 'tn', 'tt'])
            kernel_name = st.selectbox(
                "CUTLASS Kernel",
                [
                    'cutlass_simt_sgemm_128x128_8x2_nn_align1',
                    'cutlass_simt_sgemm_128x128_8x2_nt_align1',
                    'cutlass_simt_sgemm_128x128_8x2_tn_align1',
                    'cutlass_simt_sgemm_128x128_8x2_tt_align1',
                ],
            )
            alpha = st.number_input("Alpha Scalar", value=1.00, step=0.25)
            beta = st.number_input("Beta Scalar", value=0.50, step=0.25)

    return {
        'm': m,
        'n': n,
        'k': k,
        'blocksize1': blocksize1,
        'blocksize2': blocksize2,
        'blocksize3': blocksize3,
        'Layout': layout,
        'kernel_name': kernel_name,
        'alpha': alpha,
        'beta': beta,
    }


def _render_metrics_tab(predictions, efficiency):
    """Render the headline GEMM characteristics and predicted metrics."""
    characteristics = predictions['characteristics']

    st.subheader("GEMM Characteristics")
    ai_col, flops_col, mem_col, memeff_col = st.columns(4)

    with ai_col:
        st.metric(
            "Arithmetic Intensity",
            f"{characteristics['arithmetic_intensity']:.2f}",
            f"{characteristics['bound_type'].upper()} bound",
        )
    with flops_col:
        st.metric(
            "Total FLOPS",
            f"{characteristics['total_flops']/1e9:.2f}G",
            "Operations",
        )
    with mem_col:
        st.metric(
            "Memory Accessed",
            f"{characteristics['bytes_accessed']/1e6:.2f}MB",
            "Total Data Movement",
        )
    with memeff_col:
        memory_efficiency = _memory_utilization_pct(characteristics['bytes_accessed'])
        st.metric(
            "Memory Efficiency",
            f"{memory_efficiency:.1f}%",
            "vs Peak Bandwidth",
        )

    st.markdown("---")

    runtime_col, power_col, energy_col, tflops_col = st.columns(4)

    # Displayed values are floored so a tiny or negative model output never
    # shows up as zero or below.
    with runtime_col:
        st.metric(
            "Runtime",
            f"{max(0.01, predictions['runtime']):.2f} ms",
            "Execution Time",
        )
    with power_col:
        st.metric(
            "Power",
            f"{max(1.0, predictions['power']):.2f} W",
            "Power Consumption",
        )
    with energy_col:
        st.metric(
            "Energy",
            f"{max(0.01, predictions['Energy']):.2f} J",
            "Total Energy",
        )
    with tflops_col:
        st.metric(
            "TFLOPS",
            f"{predictions['TFlops']:.2f}",
            f"{efficiency:.1f}% of Peak",
        )


def _render_analysis_tab(predictions, input_data, efficiency):
    """Render the matrix configuration summary and bottleneck verdicts."""
    characteristics = predictions['characteristics']
    m, n = input_data['m'], input_data['n']
    blocksize1 = input_data['blocksize1']
    blocksize2 = input_data['blocksize2']
    blocksize3 = input_data['blocksize3']

    st.subheader("Detailed Performance Analysis")
    config_col, bottleneck_col = st.columns(2)

    with config_col:
        st.markdown("#### Matrix Configuration")
        # Round up so a partially filled block still counts, matching the
        # grid computed by create_heatmap.
        grid_m = _ceil_div(m, blocksize1)
        grid_n = _ceil_div(n, blocksize2)
        st.markdown("\n".join([
            f"- Total Matrix Elements: {m*n:,}",
            f"- Memory Footprint: {characteristics['bytes_accessed']/1e6:.2f} MB",
            f"- Block Dimensions: {blocksize1}x{blocksize2}x{blocksize3}",
            f"- Grid Size: {grid_m}x{grid_n} blocks",
        ]))

    with bottleneck_col:
        st.markdown("#### Performance Bottlenecks")
        if characteristics['arithmetic_intensity'] > COMPUTE_BOUND_THRESHOLD:
            st.success("✅ Compute Bound - Optimal for GPU")
        else:
            st.warning("⚠️ Memory Bound - Consider Optimization")

        if efficiency < 30:
            st.error("🔴 Low Compute Efficiency - Check Configuration")
        elif efficiency < 60:
            st.warning("🟡 Moderate Efficiency - Room for Improvement")
        else:
            st.success("🟢 Good Efficiency")


def _render_recommendations(predictions, input_data, efficiency):
    """Render the rule-based tuning recommendations."""
    st.markdown("### Recommendations")
    recommendations = []

    if input_data['blocksize1'] * input_data['blocksize2'] > 1024:
        recommendations.append("⚠️ Block size might be too large for optimal occupancy")
    if predictions['characteristics']['arithmetic_intensity'] < 30:
        recommendations.append("Consider increasing arithmetic intensity through blocking")
    if efficiency < 50:
        recommendations.append("Performance is below 50% of peak - try different block sizes")

    if recommendations:
        for rec in recommendations:
            st.markdown(f"- {rec}")
    else:
        st.success("Current configuration appears optimal!")


def _render_visualizations_tab(predictions, input_data, efficiency):
    """Render the gauge, spider chart, blocking heatmap and recommendations."""
    characteristics = predictions['characteristics']

    st.subheader("Performance Visualizations")
    gauge_col, spider_col = st.columns(2)

    with gauge_col:
        st.plotly_chart(create_performance_metrics_chart(predictions),
                        use_container_width=True)

    with spider_col:
        mem_bw_util = _memory_utilization_pct(characteristics['bytes_accessed'])
        compute_util = min(100, efficiency)
        st.plotly_chart(
            create_efficiency_chart(
                characteristics['arithmetic_intensity'],
                mem_bw_util,
                compute_util,
            ),
            use_container_width=True,
        )

    st.plotly_chart(
        create_heatmap(
            input_data['m'], input_data['n'], input_data['k'],
            input_data['blocksize1'], input_data['blocksize2'],
        ),
        use_container_width=True,
    )

    # NOTE(review): the original indentation was lost; the recommendations
    # are assumed to belong to this tab -- confirm the intended placement.
    _render_recommendations(predictions, input_data, efficiency)


def main():
    """Run the Streamlit page: collect inputs, predict, and render the results."""
    st.set_page_config(page_title="GEMM Performance Predictor", layout="wide")

    st.markdown(""" """, unsafe_allow_html=True)

    st.title("GEMM Performance Predictor for RTX 4070")

    # Top-level boundary: any failure is reported in the page instead of
    # crashing the app.
    try:
        predictor = GEMMPredictor()
        input_data = _collect_inputs()

        if st.button("Analyze Performance", use_container_width=True):
            with st.spinner("Analyzing performance..."):
                predictions = predictor.predict(input_data)
                efficiency = (predictions['TFlops'] / PEAK_TFLOPS) * 100

                tab1, tab2, tab3 = st.tabs(
                    ["Performance Metrics", "Detailed Analysis", "Visualizations"]
                )

                with tab1:
                    _render_metrics_tab(predictions, efficiency)

                with tab2:
                    _render_analysis_tab(predictions, input_data, efficiency)

                with tab3:
                    _render_visualizations_tab(predictions, input_data, efficiency)

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        # Name the file the predictor actually tries to load.
        st.write(
            f"Please make sure the model file '{DEFAULT_MODEL_PATH}' is in the correct directory."
        )
        st.write("If the error persists, check the input parameters and model compatibility.")


if __name__ == "__main__":
    main()