# DPSGDTool/app/training/mock_trainer.py
import math
import time
from typing import Dict, List, Any

import numpy as np


class MockTrainer:
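# Mock DP-SGD trainer: simulates epoch/iteration metrics, gradient-norm curves,
# a rough privacy budget, and parameter recommendations without running real training.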
def __init__(self):
# Baseline values for MNIST without privacy; with well-chosen parameters, DP-SGD typically retains 85-98% accuracy in published results
self.base_accuracy = 0.98 # Non-private MNIST accuracy
self.base_loss = 0.08 # Corresponding base loss
def train(self, params: Dict[str, Any]) -> Dict[str, Any]:
"""
Simulate DP-SGD training with given parameters using realistic privacy trade-offs.
Args:
params: Dictionary containing training parameters:
- clipping_norm: float
- noise_multiplier: float
- batch_size: int
- learning_rate: float
- epochs: int
Returns:
Dictionary containing training results and metrics
"""
# Extract parameters
clipping_norm = params['clipping_norm']
noise_multiplier = params['noise_multiplier']
batch_size = params['batch_size']
learning_rate = params['learning_rate']
epochs = params['epochs']
# Calculate realistic privacy impact on performance
privacy_factor = self._calculate_realistic_privacy_factor(clipping_norm, noise_multiplier, batch_size, epochs)
# Generate epoch-wise data
epochs_data = self._generate_epoch_data(epochs, privacy_factor)
# Generate iteration-wise data (mock version for consistency)
iterations_data = self._generate_iteration_data(epochs, privacy_factor, batch_size)
# Calculate final metrics (must be consistent with epoch data)
final_metrics = self._calculate_final_metrics(epochs_data, privacy_factor)
# Generate recommendations
recommendations = self._generate_recommendations(params, final_metrics)
# Generate gradient information
gradient_info = {
'before_clipping': self.generate_gradient_norms(clipping_norm),
'after_clipping': self.generate_clipped_gradients(clipping_norm)
}
# Calculate realistic privacy budget
privacy_budget = self._calculate_mock_privacy_budget(params)
return {
'epochs_data': epochs_data,
'iterations_data': iterations_data,
'final_metrics': final_metrics,
'recommendations': recommendations,
'gradient_info': gradient_info,
'privacy_budget': privacy_budget
}
def _calculate_mock_privacy_budget(self, params: Dict[str, Any]) -> float:
"""Calculate a realistic mock privacy budget based on DP-SGD theory."""
noise_multiplier = params['noise_multiplier']
epochs = params['epochs']
batch_size = params['batch_size']
# More realistic calculation based on DP-SGD research
q = batch_size / 60000 # Sampling rate for MNIST
steps = epochs * (60000 // batch_size)
# Simplified closed-form approximation (in the spirit of the moments-accountant bound, not a full RDP computation):
# ε ≈ q * sqrt(steps * log(1/δ)) / σ, which holds for sufficiently large σ
delta = 1e-5
epsilon = (q * math.sqrt(steps * math.log(1/delta))) / noise_multiplier
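# Illustrative sanity check with assumed values (not a tight accountant bound):
# batch_size=128, epochs=10, σ=1.0 → q ≈ 0.0021, steps = 4680,
# ε ≈ 0.0021 * sqrt(4680 * ln(1e5)) ≈ 0.5 before the random variation below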
# Add some realistic variation
epsilon *= (1 + np.random.normal(0, 0.1))
return max(0.1, min(50.0, epsilon))
def _calculate_realistic_privacy_factor(self, clipping_norm: float, noise_multiplier: float, batch_size: int, epochs: int) -> float:
"""Calculate realistic privacy impact based on DP-SGD research."""
# Research shows DP-SGD can achieve 85-98% accuracy with proper parameters
# The privacy impact should be much less severe than previously modeled
# Base degradation from noise (much less severe)
if noise_multiplier <= 0.5:
noise_degradation = 0.02 # Very little impact with low noise
elif noise_multiplier <= 1.0:
noise_degradation = 0.05 # Small impact with medium noise
elif noise_multiplier <= 1.5:
noise_degradation = 0.12 # Moderate impact
else:
noise_degradation = min(0.25, 0.1 + 0.05 * noise_multiplier) # Higher impact with very high noise
# Clipping degradation (much less severe)
if clipping_norm >= 2.0:
clipping_degradation = 0.01 # Minimal impact with good clipping
elif clipping_norm >= 1.0:
clipping_degradation = 0.03 # Small impact
else:
clipping_degradation = min(0.15, 0.2 / max(clipping_norm, 1e-6)) # More impact with very low clipping; guard against C == 0
# Batch size effect (larger batches help significantly)
if batch_size >= 256:
batch_factor = -0.02 # Bonus for large batches
elif batch_size >= 128:
batch_factor = 0.01 # Small penalty
else:
batch_factor = min(0.08, 0.001 * (128 - batch_size))
# Epochs effect (more training helps overcome noise)
if epochs >= 10:
epoch_factor = -0.03 # Bonus for sufficient training
elif epochs >= 5:
epoch_factor = 0.01 # Small penalty
else:
epoch_factor = 0.05 # Penalty for insufficient training
total_degradation = noise_degradation + clipping_degradation + batch_factor + epoch_factor
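# Illustrative example: σ=1.0, C=1.0, batch_size=256, epochs=10
# → 0.05 + 0.03 - 0.02 - 0.03 = 0.03 total degradation, i.e. privacy_factor ≈ 0.97 below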
privacy_factor = 1.0 - max(0, total_degradation) # Much less degradation overall
return max(0.7, privacy_factor) # Ensure minimum 70% of original performance (can achieve 85%+ with good params)
def _generate_iteration_data(self, epochs: int, privacy_factor: float, batch_size: int) -> List[Dict[str, float]]:
"""Generate realistic iteration-wise training metrics."""
iterations_data = []
# Simulate ~60,000 training samples, so iterations_per_epoch = 60000 / batch_size
dataset_size = 60000
iterations_per_epoch = dataset_size // batch_size
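# e.g. batch_size=128 → 60000 // 128 = 468 iterations per epoch (≈ 47 sampled points per epoch below)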
# Realistic base learning curve parameters
base_accuracy = self.base_accuracy * privacy_factor
base_loss = self.base_loss / privacy_factor
current_iteration = 0
for epoch in range(1, epochs + 1):
for iteration_in_epoch in range(0, iterations_per_epoch, 10): # Sample every 10th
current_iteration += 10
# Overall progress through all training
total_iterations = epochs * iterations_per_epoch
overall_progress = current_iteration / total_iterations
# More realistic learning curve: slower start, plateau effect
learning_progress = 1 - np.exp(-3 * overall_progress) # Exponential approach to target
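# 1 - exp(-3x) rises to ≈ 0.78 at the halfway point and ≈ 0.95 by the end of training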
# Add realistic variation (DP-SGD has more noise)
noise_std = 0.08 if privacy_factor < 0.7 else 0.04 # More noise for high privacy
noise = np.random.normal(0, noise_std)
# Calculate realistic accuracy progression
target_accuracy = base_accuracy * (0.4 + 0.6 * learning_progress)
accuracy = target_accuracy + noise
# Calculate corresponding loss
target_loss = base_loss * (1.5 - 0.5 * learning_progress)
loss = target_loss - noise * 0.3 # Loss inversely correlated with accuracy
# Add some iteration-level oscillations (typical of SGD)
oscillation = 0.015 * np.sin(current_iteration * 0.05)
accuracy += oscillation
loss -= oscillation * 0.5
iterations_data.append({
'iteration': current_iteration,
'epoch': epoch,
'accuracy': max(5, min(95, accuracy * 100)), # Realistic bounds
'loss': max(0.05, loss),
'train_accuracy': max(5, min(95, (accuracy + np.random.normal(0, 0.02)) * 100)),
'train_loss': max(0.05, loss + np.random.normal(0, 0.1))
})
return iterations_data
def _generate_epoch_data(self, epochs: int, privacy_factor: float) -> List[Dict[str, float]]:
"""Generate realistic training metrics for each epoch."""
epochs_data = []
# Realistic base learning curve parameters
base_accuracy = self.base_accuracy * privacy_factor
base_loss = self.base_loss / privacy_factor
for epoch in range(1, epochs + 1):
# Realistic learning curve: fast early improvement, then plateau
progress = epoch / epochs
learning_factor = 1 - np.exp(-2.5 * progress) # Exponential learning curve
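# With this decay rate the curve reaches ≈ 0.71 at mid-training and ≈ 0.92 at the final epoch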
# Add realistic epoch-to-epoch variation
noise_std = 0.03 if privacy_factor < 0.7 else 0.015
noise = np.random.normal(0, noise_std)
# Calculate realistic metrics
accuracy = base_accuracy * (0.4 + 0.6 * learning_factor) + noise
loss = base_loss * (1.4 - 0.4 * learning_factor) - noise * 0.3
epochs_data.append({
'epoch': epoch,
'accuracy': max(5, min(95, accuracy * 100)), # Convert to percentage with bounds
'loss': max(0.05, loss),
'train_accuracy': max(5, min(95, (accuracy + np.random.normal(0, 0.01)) * 100)),
'train_loss': max(0.05, loss + np.random.normal(0, 0.05))
})
return epochs_data
def _calculate_final_metrics(self, epochs_data: List[Dict[str, float]], privacy_factor: float) -> Dict[str, float]:
"""Calculate final training metrics that are CONSISTENT with epoch data."""
if not epochs_data:
return {'accuracy': 50.0, 'loss': 1.0, 'training_time': 1.0}
# Use the LAST epoch's results as final metrics (consistency!)
final_epoch = epochs_data[-1]
# Training time should be realistic for DP-SGD (slower than normal)
base_time = len(epochs_data) * 0.8 # Base time per epoch
privacy_slowdown = (2.0 - privacy_factor) # DP-SGD is slower
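# e.g. 10 epochs at privacy_factor 0.97 → 10 * 0.8 * 1.03 ≈ 8.2 time units before random variation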
time_variation = 1.0 + np.random.normal(0, 0.1)
return {
'accuracy': final_epoch['accuracy'], # Consistent with training progress!
'loss': final_epoch['loss'],
'training_time': base_time * privacy_slowdown * time_variation
}
def _generate_recommendations(self, params: Dict[str, Any], metrics: Dict[str, float]) -> List[Dict[str, str]]:
"""Generate realistic recommendations based on DP-SGD best practices."""
recommendations = []
# Noise multiplier recommendations (critical for DP-SGD)
if params['noise_multiplier'] < 0.5:
recommendations.append({
'icon': '🔒',
'text': 'Very low noise provides minimal privacy. Consider σ ≥ 0.8 for meaningful privacy.'
})
elif params['noise_multiplier'] > 2.0:
recommendations.append({
'icon': '⚠️',
'text': 'High noise (σ > 2.0) significantly degrades accuracy. Try reducing to 0.8-1.5.'
})
elif params['noise_multiplier'] > 1.5:
recommendations.append({
'icon': '💡',
'text': 'Consider reducing noise multiplier to 0.8-1.2 for better utility-privacy trade-off.'
})
# Clipping norm recommendations
if params['clipping_norm'] < 0.5:
recommendations.append({
'icon': '⚠️',
'text': 'Very low clipping norm can prevent learning. Try C = 1.0-2.0.'
})
elif params['clipping_norm'] > 3.0:
recommendations.append({
'icon': '🔒',
'text': 'Large clipping norm reduces privacy protection. Consider C ≤ 2.0.'
})
# Batch size recommendations (important for DP-SGD)
if params['batch_size'] < 64:
recommendations.append({
'icon': '⚡',
'text': 'Small batch sizes amplify noise effects. Try batch size ≥ 128 for better stability.'
})
elif params['batch_size'] > 512:
recommendations.append({
'icon': '💾',
'text': 'Very large batch sizes may require more memory and longer training time.'
})
# Learning rate recommendations
if params['learning_rate'] > 0.05:
recommendations.append({
'icon': '⚠️',
'text': 'High learning rate with noise can destabilize training. Try ≤ 0.02.'
})
elif params['learning_rate'] < 0.005:
recommendations.append({
'icon': '⏳',
'text': 'Very low learning rate may require more epochs for convergence.'
})
# Epochs recommendations
if params['epochs'] < 5:
recommendations.append({
'icon': '📈',
'text': 'Few epochs may not be enough to overcome noise. Try 8-15 epochs.'
})
elif params['epochs'] > 20:
recommendations.append({
'icon': '🔒',
'text': 'Many epochs increase privacy cost. Consider early stopping around 10-15 epochs.'
})
# Accuracy-based recommendations
if metrics['accuracy'] < 60:
recommendations.append({
'icon': '📉',
'text': 'Low accuracy suggests too much noise. Reduce σ or increase C for better utility.'
})
elif metrics['accuracy'] > 85:
recommendations.append({
'icon': '🎯',
'text': 'Good accuracy! This is a well-balanced privacy-utility trade-off.'
})
return recommendations
def generate_gradient_norms(self, clipping_norm: float) -> List[Dict[str, float]]:
"""Generate realistic gradient norms following a log-normal distribution."""
num_points = 100
gradients = []
# Parameters for log-normal distribution
mu = np.log(max(clipping_norm, 1e-6)) - 0.5 # Guard against C == 0
sigma = 0.8
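# With mu = ln(C) - 0.5 the median simulated norm is exp(mu) ≈ 0.61 * C,
# so roughly a quarter of the draws exceed the clipping norm and will be clipped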
for _ in range(num_points):
# Draw a log-normally distributed gradient norm
norm = np.random.lognormal(mu, sigma)
# Evaluate the analytic log-normal density at this norm (not a kernel density estimate)
density = np.exp(-((np.log(norm) - mu) ** 2) / (2 * sigma * sigma)) / (norm * sigma * np.sqrt(2 * np.pi))
# Shift and jitter the density slightly so the mock curve looks less synthetic
density = 0.2 + density + 0.1 * (np.random.random() - 0.5)
gradients.append({'x': float(norm), 'y': float(density)})
return sorted(gradients, key=lambda x: x['x'])
def generate_clipped_gradients(self, clipping_norm: float) -> List[Dict[str, float]]:
"""Generate clipped versions of the gradient norms."""
original_gradients = self.generate_gradient_norms(clipping_norm)
return [{'x': min(g['x'], clipping_norm), 'y': g['y']} for g in original_gradients]
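
# Example usage (illustrative only): the parameter values below are arbitrary
# assumptions chosen to show the expected input keys and the structure of the
# result returned by MockTrainer.train().
if __name__ == "__main__":
    trainer = MockTrainer()
    example_params = {
        'clipping_norm': 1.0,      # assumed value for demonstration
        'noise_multiplier': 1.0,   # assumed value for demonstration
        'batch_size': 256,
        'learning_rate': 0.01,
        'epochs': 10
    }
    results = trainer.train(example_params)
    print(f"Final accuracy: {results['final_metrics']['accuracy']:.1f}%")
    print(f"Final loss: {results['final_metrics']['loss']:.3f}")
    print(f"Estimated privacy budget ε: {results['privacy_budget']:.2f}")
    print(f"Number of recommendations: {len(results['recommendations'])}")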