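"""Streamlit app that predicts GEMM performance (runtime, power, energy, TFLOPS)
on an RTX 4070 from matrix dimensions, block sizes, and layout, using a
pre-trained stacked model loaded from 'model.joblib'."""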
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor

class GEMMPredictor:
    def __init__(self, model_path='model.joblib'):
        self.stacked_model = joblib.load(model_path)
        self.initialize_features()

    def initialize_features(self):
        """Initialize features used by the model"""
        # Core matrix features
        self.core_features = [
            'm', 'n', 'k',
            'blocksize1', 'blocksize2', 'blocksize3'
        ]
        # Derived features
        self.derived_features = [
            'arithmetic_intensity',
            'bytes_accessed',
            'total_flops'
        ]
        # Categorical features
        self.categorical_features = ['Layout']
        # Target features
        self.target_features = [
            'runtime',
            'power',
            'Energy',
            'TFlops'
        ]
        self.numerical_features = self.core_features + self.derived_features

    def calculate_gemm_characteristics(self, m, n, k, blocksize1, blocksize2, blocksize3):
        """Calculate GEMM-specific characteristics"""
        total_flops = 2 * m * n * k  # 2 operations per FMA
        bytes_accessed = (m * k + k * n + m * n) * 4  # Single precision
        arithmetic_intensity = total_flops / bytes_accessed
        bound_type = 'compute' if arithmetic_intensity > 59 else 'memory'
        return {
            'total_flops': total_flops,
            'bytes_accessed': bytes_accessed,
            'arithmetic_intensity': arithmetic_intensity,
            'bound_type': bound_type
        }
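    # Worked example (informal), using the app's default inputs m = n = 512, k = 1024:
    #   total_flops          = 2 * 512 * 512 * 1024                = 536,870,912 FLOPs
    #   bytes_accessed       = (512*1024 + 1024*512 + 512*512) * 4 = 5,242,880 bytes
    #   arithmetic_intensity = 536870912 / 5242880                 = 102.4 FLOP/byte
    # 102.4 > 59, so the kernel is classified as compute bound. The 59 FLOP/byte
    # cutoff is presumably the assumed roofline ridge point for this GPU
    # (peak FLOP rate divided by peak memory bandwidth).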
    def get_default_numeric_values(self):
        """Return default values for missing numeric features"""
        return {
            # Memory-related defaults
            'total_memory': 12288,   # 12GB for RTX 4070
            'free_memory': 10240,    # Assuming 80% free
            'used_memory': 2048,     # Assuming 20% used
            'mem_util': 20.0,        # 20% utilization
            'mem_util2': 20.0,       # Secondary memory utilization
            # GPU state defaults
            'temp': 65.0,            # Default temperature
            'gpu_util': 80.0,        # Default GPU utilization
            'gpu_util1': 80.0,       # Secondary GPU utilization
            'clock_sm': 2475,        # Default SM clock for RTX 4070
            'power_limit': 200.0,    # Default power limit
            'clocks.meme': 2000,     # Memory clock speed
            'alpha': 1.0,            # Default scaling factor
            'beta': 0.0,             # Default scaling factor
            'problem_size_m': 1024,
            'problem_size_n': 1024,
            'problem_size_k': 1024
        }
    def get_default_categorical_values(self):
        """Return default values for missing categorical features"""
        return {
            'stage': 'main',
            'kernel_name': 'cutlass_simt_sgemm_128x128_8x2_nn_align1',
            'computation_pattern': 'GEMM',
            'combination_type': 'standard',
            'state': 'active',
            'uses_shared_memory': 'true',
            'gpu_name': 'RTX4070'
        }
    def prepare_input_data(self, input_dict):
        """Prepare input data for prediction with default values for missing features"""
        numeric_defaults = self.get_default_numeric_values()
        categorical_defaults = self.get_default_categorical_values()
        complete_input = {**numeric_defaults, **categorical_defaults}
        complete_input.update(input_dict)
        df = pd.DataFrame([complete_input])
        characteristics = self.calculate_gemm_characteristics(
            df['m'].iloc[0], df['n'].iloc[0], df['k'].iloc[0],
            df['blocksize1'].iloc[0], df['blocksize2'].iloc[0], df['blocksize3'].iloc[0]
        )
        df['total_flops'] = characteristics['total_flops']
        df['bytes_accessed'] = characteristics['bytes_accessed']
        df['arithmetic_intensity'] = characteristics['arithmetic_intensity']
        for col in self.categorical_features:
            if col in df.columns:
                df[col] = df[col].astype(str)
        for col in self.numerical_features:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        return df
    def estimate_power(self, df):
        """Back-fill missing power readings with a rough linear estimate based on FLOP count"""
        BASE_POWER = 30
        MAX_POWER = 200
        MAX_TFLOPS = 40
        df['estimated_power'] = BASE_POWER + (
            (MAX_POWER - BASE_POWER) *
            (df['total_flops'] / (MAX_TFLOPS * 1e12))
        )
        if 'power' not in df.columns:
            df['power'] = np.nan
        df['power'] = df['power'].fillna(df['estimated_power'])
        return df

    def filter_power_bounds(self, df):
        """Drop rows whose power reading is outside the plausible range for this GPU"""
        MIN_POWER = 25   # Minimum idle power
        MAX_POWER = 200  # Maximum TDP
        df = df[
            (df['power'].between(MIN_POWER, MAX_POWER)) |
            (df['power'].isna())
        ]
        return df

    def impute_power(self, df):
        """Impute remaining missing power values with a random forest fit on rows that have power"""
        df['total_elements'] = df['m'] * df['n'] * df['k']
        valid_power = df[df['power'].notna()]
        missing_power = df[df['power'].isna()]
        # Nothing to fit or nothing to impute: return unchanged
        if valid_power.empty or missing_power.empty:
            return df
        features = ['total_elements', 'total_flops', 'arithmetic_intensity']
        X = valid_power[features]
        y = valid_power['power']
        model = RandomForestRegressor(n_estimators=100)
        model.fit(X, y)
        imputed_values = model.predict(missing_power[features])
        df.loc[df['power'].isna(), 'power'] = imputed_values
        return df
    def preprocess_data(self, df):
        """Preprocess data focusing on GEMM characteristics with improved power handling"""
        print("\nPreprocessing data...")
        try:
            df_processed = df.copy()
            df_processed = df_processed.replace('[N/A]', np.nan)
            df_processed = df_processed.replace('', np.nan)
            # Ensure the core dimensions are numeric before deriving features
            for col in self.core_features:
                if col in df_processed.columns:
                    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
            # Recompute the derived GEMM features column-wise
            # (calculate_gemm_characteristics expects scalar dimensions, not a DataFrame)
            df_processed['total_flops'] = 2 * df_processed['m'] * df_processed['n'] * df_processed['k']
            df_processed['bytes_accessed'] = (
                df_processed['m'] * df_processed['k'] +
                df_processed['k'] * df_processed['n'] +
                df_processed['m'] * df_processed['n']
            ) * 4
            df_processed['arithmetic_intensity'] = df_processed['total_flops'] / df_processed['bytes_accessed']
            df_processed['Layout'] = df_processed['Layout'].astype(str)
            df_processed = self.estimate_power(df_processed)
            df_processed = self.impute_power(df_processed)
            df_processed = self.filter_power_bounds(df_processed)
            for col in self.numerical_features:
                if col in df_processed.columns:
                    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
                    # Clip to the 1st/99th percentiles to tame outliers
                    q_low = df_processed[col].quantile(0.01)
                    q_high = df_processed[col].quantile(0.99)
                    df_processed[col] = df_processed[col].clip(q_low, q_high)
                    df_processed[col] = df_processed[col].fillna(df_processed[col].median())
            print("Data preprocessing completed successfully")
            print("Features summary:")
            print(df_processed[self.numerical_features].describe())
            return df_processed
        except Exception as e:
            print(f"Error in preprocess_data: {str(e)}")
            raise
    def predict(self, input_data):
        """Make predictions using the stacked model"""
        df = self.prepare_input_data(input_data)
        predictions = self.stacked_model.predict(df)
        # Map predictions to target features
        prediction_dict = {target: predictions[0][i] for i, target in enumerate(self.target_features)}
        prediction_dict['characteristics'] = self.calculate_gemm_characteristics(
            input_data['m'], input_data['n'], input_data['k'],
            input_data['blocksize1'], input_data['blocksize2'], input_data['blocksize3']
        )
        return prediction_dict
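
# Minimal usage sketch outside Streamlit, assuming 'model.joblib' holds a trained
# multi-output model whose predict() returns [runtime, power, Energy, TFlops] per row:
#
#   predictor = GEMMPredictor('model.joblib')
#   result = predictor.predict({
#       'm': 512, 'n': 512, 'k': 1024,
#       'blocksize1': 512, 'blocksize2': 128, 'blocksize3': 512,
#       'Layout': 'nn'
#   })
#   print(result['runtime'], result['TFlops'], result['characteristics']['bound_type'])
#
# Any feature not supplied falls back to the RTX 4070 defaults defined in
# get_default_numeric_values / get_default_categorical_values.
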
def create_comparison_chart(current_metrics, optimal_metrics):
    """Create a comparison chart using plotly"""
    metrics = ['Runtime (ms)', 'Power (W)', 'Energy (J)', 'TFLOPS']
    current_values = [
        current_metrics['runtime'],
        current_metrics['power'],
        current_metrics['Energy'],
        current_metrics['TFlops']
    ]
    optimal_values = [
        optimal_metrics['runtime'],
        optimal_metrics['power'],
        optimal_metrics['Energy'],
        optimal_metrics['TFlops']
    ]
    fig = go.Figure(data=[
        go.Bar(name='Current', x=metrics, y=current_values, marker_color='#ff7c43'),
        go.Bar(name='Optimal', x=metrics, y=optimal_values, marker_color='#00ba38')
    ])
    fig.update_layout(
        barmode='group',
        title='Performance Comparison',
        xaxis_title='Metrics',
        yaxis_title='Values',
        height=400
    )
    return fig
def create_heatmap(m, n, k, block_m, block_n):
    """Create a heatmap visualization of the matrix blocking"""
    grid_m = int(np.ceil(m / block_m))
    grid_n = int(np.ceil(n / block_n))
    # Placeholder per-block intensities: random values stand in for real
    # per-block measurements, which are not available at prediction time.
    grid = np.random.uniform(0.5, 1.0, (grid_m, grid_n))
    fig = go.Figure(data=go.Heatmap(
        z=grid,
        colorscale='Viridis',
        showscale=False
    ))
    fig.update_layout(
        title='Matrix Blocking Visualization',
        xaxis_title='N dimension (columns)',
        yaxis_title='M dimension (rows)',
        height=300,
        margin=dict(l=50, r=50, t=50, b=50)
    )
    return fig
def create_performance_metrics_chart(predictions):
    """Create a gauge chart for TFLOPS and other metrics"""
    max_tflops = 40  # RTX 4070 theoretical max
    tflops_percentage = (predictions['TFlops'] / max_tflops) * 100
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=predictions['TFlops'],
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "TFLOPS Performance"},
        gauge={
            'axis': {'range': [None, max_tflops]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, max_tflops / 3], 'color': "red"},
                {'range': [max_tflops / 3, 2 * max_tflops / 3], 'color': "yellow"},
                {'range': [2 * max_tflops / 3, max_tflops], 'color': "green"}
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': predictions['TFlops']
            }
        }
    ))
    fig.update_layout(height=300)
    return fig
def create_efficiency_chart(arithmetic_intensity, mem_bandwidth_utilization, compute_utilization):
    """Create a spider chart showing various efficiency metrics"""
    fig = go.Figure()
    categories = ['Arithmetic Intensity', 'Memory BW Utilization', 'Compute Utilization']
    fig.add_trace(go.Scatterpolar(
        r=[arithmetic_intensity / 200 * 100, mem_bandwidth_utilization, compute_utilization],
        theta=categories,
        fill='toself',
        name='Current Configuration'
    ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]
            )
        ),
        showlegend=False,
        height=300
    )
    return fig
def main():
    st.set_page_config(page_title="GEMM Performance Predictor", layout="wide")
    st.markdown("""
        <style>
        .main {
            padding: 2rem 1rem;
            max-width: 100%;
        }
        .metric-card {
            background-color: #f0f2f6;
            padding: 1rem;
            border-radius: 0.5rem;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        </style>
    """, unsafe_allow_html=True)
    st.title("GEMM Performance Predictor for RTX 4070")
    try:
        predictor = GEMMPredictor()
        col1, col2, col3 = st.columns([1, 1, 1])
        with col1:
            st.subheader("Matrix Dimensions")
            with st.expander("Set Matrix Dimensions", expanded=True):
                m = st.number_input("M", min_value=1, value=512)
                n = st.number_input("N", min_value=1, value=512)
                k = st.number_input("K", min_value=1, value=1024)
        with col2:
            st.subheader("Block Sizes")
            with st.expander("Set Block Dimensions", expanded=True):
                blocksize1 = st.number_input("Block Size 1", min_value=1, value=512)
                blocksize2 = st.number_input("Block Size 2", min_value=1, value=128)
                blocksize3 = st.number_input("Block Size 3", min_value=1, value=512)
        with col3:
            st.subheader("Configuration")
            with st.expander("Additional Settings", expanded=True):
                layout = st.selectbox("Matrix Layout", ['nn', 'nt', 'tn', 'tt'])
                kernel_name = st.selectbox(
                    "CUTLASS Kernel",
                    [
                        'cutlass_simt_sgemm_128x128_8x2_nn_align1',
                        'cutlass_simt_sgemm_128x128_8x2_nt_align1',
                        'cutlass_simt_sgemm_128x128_8x2_tn_align1',
                        'cutlass_simt_sgemm_128x128_8x2_tt_align1'
                    ]
                )
                alpha = st.number_input("Alpha Scalar", value=1.00, step=0.25)
                beta = st.number_input("Beta Scalar", value=0.50, step=0.25)
        if st.button("Analyze Performance", use_container_width=True):
            with st.spinner("Analyzing performance..."):
                input_data = {
                    'm': m, 'n': n, 'k': k,
                    'blocksize1': blocksize1,
                    'blocksize2': blocksize2,
                    'blocksize3': blocksize3,
                    'Layout': layout,
                    'kernel_name': kernel_name,
                    'alpha': alpha,
                    'beta': beta
                }
                predictions = predictor.predict(input_data)
            tab1, tab2, tab3 = st.tabs(["Performance Metrics", "Detailed Analysis", "Visualizations"])
            with tab1:
                st.subheader("GEMM Characteristics")
                metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
                with metric_col1:
                    st.metric(
                        "Arithmetic Intensity",
                        f"{predictions['characteristics']['arithmetic_intensity']:.2f}",
                        f"{predictions['characteristics']['bound_type'].upper()} bound"
                    )
                with metric_col2:
                    st.metric(
                        "Total FLOPS",
                        f"{predictions['characteristics']['total_flops'] / 1e9:.2f}G",
                        "Operations"
                    )
                with metric_col3:
                    st.metric(
                        "Memory Accessed",
                        f"{predictions['characteristics']['bytes_accessed'] / 1e6:.2f}MB",
                        "Total Data Movement"
                    )
                with metric_col4:
                    memory_efficiency = min(100, predictions['characteristics']['bytes_accessed'] / (504 * 1e9) * 100)
                    st.metric(
                        "Memory Efficiency",
                        f"{memory_efficiency:.1f}%",
                        "vs Peak Bandwidth"
                    )
                st.markdown("---")
                perf_col1, perf_col2, perf_col3, perf_col4 = st.columns(4)
                with perf_col1:
                    st.metric(
                        "Runtime",
                        f"{max(0.01, predictions['runtime']):.2f} ms",
                        "Execution Time"
                    )
                with perf_col2:
                    st.metric(
                        "Power",
                        f"{max(1.0, predictions['power']):.2f} W",
                        "Power Consumption"
                    )
                with perf_col3:
                    st.metric(
                        "Energy",
                        f"{max(0.01, predictions['Energy']):.2f} J",
                        "Total Energy"
                    )
                with perf_col4:
                    efficiency = (predictions['TFlops'] / 40) * 100
                    st.metric(
                        "TFLOPS",
                        f"{predictions['TFlops']:.2f}",
                        f"{efficiency:.1f}% of Peak"
                    )
            with tab2:
                st.subheader("Detailed Performance Analysis")
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("#### Matrix Configuration")
                    st.markdown(f"""
- Output Matrix Elements: {m * n:,}
- Memory Footprint: {predictions['characteristics']['bytes_accessed'] / 1e6:.2f} MB
- Block Dimensions: {blocksize1}x{blocksize2}x{blocksize3}
- Grid Size: {int(np.ceil(m / blocksize1))}x{int(np.ceil(n / blocksize2))} blocks
""")
                with col2:
                    st.markdown("#### Performance Bottlenecks")
                    ai = predictions['characteristics']['arithmetic_intensity']
                    if ai > 59:
                        st.success("✅ Compute Bound - Optimal for GPU")
                    else:
                        st.warning("⚠️ Memory Bound - Consider Optimization")
                    efficiency = (predictions['TFlops'] / 40) * 100
                    if efficiency < 30:
                        st.error("🔴 Low Compute Efficiency - Check Configuration")
                    elif efficiency < 60:
                        st.warning("🟡 Moderate Efficiency - Room for Improvement")
                    else:
                        st.success("🟢 Good Efficiency")
            with tab3:
                st.subheader("Performance Visualizations")
                viz_col1, viz_col2 = st.columns(2)
                with viz_col1:
                    st.plotly_chart(create_performance_metrics_chart(predictions), use_container_width=True)
                with viz_col2:
                    mem_bw_util = min(100, predictions['characteristics']['bytes_accessed'] / (504 * 1e9) * 100)
                    compute_util = min(100, (predictions['TFlops'] / 40) * 100)
                    st.plotly_chart(
                        create_efficiency_chart(
                            predictions['characteristics']['arithmetic_intensity'],
                            mem_bw_util,
                            compute_util
                        ),
                        use_container_width=True
                    )
                st.plotly_chart(create_heatmap(m, n, k, blocksize1, blocksize2), use_container_width=True)
            st.markdown("### Recommendations")
            recommendations = []
            if blocksize1 * blocksize2 > 1024:
                recommendations.append("⚠️ Block size might be too large for optimal occupancy")
            if predictions['characteristics']['arithmetic_intensity'] < 30:
                recommendations.append("Consider increasing arithmetic intensity through blocking")
            if efficiency < 50:
                recommendations.append("Performance is below 50% of peak - try different block sizes")
            if recommendations:
                for rec in recommendations:
                    st.markdown(f"- {rec}")
            else:
                st.success("Current configuration appears optimal!")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.write("Please make sure the model file 'model.joblib' is in the correct directory.")
        st.write("If the error persists, check the input parameters and model compatibility.")

if __name__ == "__main__":
    main()