# DPSGDTool/app/training/mock_trainer.py
import math
import time
from typing import Dict, List, Any

import numpy as np


class MockTrainer:
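# Mock DP-SGD trainer: simulates epoch/iteration metrics, gradient-norm curves,
# a rough privacy budget, and parameter recommendations without running real training.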
def __init__(self):
# Baseline values for MNIST without privacy; with well-chosen parameters, DP-SGD typically retains 85-98% accuracy in published results
self.base_accuracy = 0.98 # Non-private MNIST accuracy
self.base_loss = 0.08 # Corresponding base loss
def train(self, params: Dict[str, Any]) -> Dict[str, Any]:
"""
Simulate DP-SGD training with given parameters using realistic privacy trade-offs.
Args:
params: Dictionary containing training parameters:
- clipping_norm: float
- noise_multiplier: float
- batch_size: int
- learning_rate: float
- epochs: int
Returns:
Dictionary containing training results and metrics
"""
# Extract parameters
clipping_norm = params['clipping_norm']
noise_multiplier = params['noise_multiplier']
batch_size = params['batch_size']
learning_rate = params['learning_rate']
epochs = params['epochs']
# Calculate realistic privacy impact on performance
privacy_factor = self._calculate_realistic_privacy_factor(clipping_norm, noise_multiplier, batch_size, epochs)
# Generate epoch-wise data
epochs_data = self._generate_epoch_data(epochs, privacy_factor)
# Generate iteration-wise data (mock version for consistency)
iterations_data = self._generate_iteration_data(epochs, privacy_factor, batch_size)
# Calculate final metrics (must be consistent with epoch data)
final_metrics = self._calculate_final_metrics(epochs_data, privacy_factor)
# Generate recommendations
recommendations = self._generate_recommendations(params, final_metrics)
# Generate gradient information
gradient_info = {
'before_clipping': self.generate_gradient_norms(clipping_norm),
'after_clipping': self.generate_clipped_gradients(clipping_norm)
}
# Calculate realistic privacy budget
privacy_budget = self._calculate_mock_privacy_budget(params)
return {
'epochs_data': epochs_data,
'iterations_data': iterations_data,
'final_metrics': final_metrics,
'recommendations': recommendations,
'gradient_info': gradient_info,
'privacy_budget': privacy_budget
}
def _calculate_mock_privacy_budget(self, params: Dict[str, Any]) -> float:
"""Calculate a realistic mock privacy budget based on DP-SGD theory."""
noise_multiplier = params['noise_multiplier']
epochs = params['epochs']
batch_size = params['batch_size']
# More realistic calculation based on DP-SGD research
q = batch_size / 60000 # Sampling rate for MNIST
steps = epochs * (60000 // batch_size)
# Simplified closed-form approximation (in the spirit of the moments-accountant bound, not a full RDP computation):
# ε ≈ q * sqrt(steps * log(1/δ)) / σ, which holds for sufficiently large σ
delta = 1e-5
epsilon = (q * math.sqrt(steps * math.log(1/delta))) / noise_multiplier
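# Illustrative sanity check with assumed values (not a tight accountant bound):
# batch_size=128, epochs=10, σ=1.0 → q ≈ 0.0021, steps = 4680,
# ε ≈ 0.0021 * sqrt(4680 * ln(1e5)) ≈ 0.5 before the random variation below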
# Add some realistic variation
epsilon *= (1 + np.random.normal(0, 0.1))
return max(0.1, min(50.0, epsilon))
def _calculate_realistic_privacy_factor(self, clipping_norm: float, noise_multiplier: float, batch_size: int, epochs: int) -> float:
"""Calculate realistic privacy impact based on DP-SGD research."""
# Research shows DP-SGD can achieve 85-98% accuracy with proper parameters
# The privacy impact should be much less severe than previously modeled
# Base degradation from noise (much less severe)
if noise_multiplier <= 0.5:
noise_degradation = 0.02 # Very little impact with low noise
elif noise_multiplier <= 1.0:
noise_degradation = 0.05 # Small impact with medium noise
elif noise_multiplier <= 1.5:
noise_degradation = 0.12 # Moderate impact
else:
noise_degradation = min(0.25, 0.1 + 0.05 * noise_multiplier) # Higher impact with very high noise
# Clipping degradation (much less severe)
if clipping_norm >= 2.0:
clipping_degradation = 0.01 # Minimal impact with good clipping
elif clipping_norm >= 1.0:
clipping_degradation = 0.03 # Small impact
else:
clipping_degradation = min(0.15, 0.2 / max(clipping_norm, 1e-6)) # More impact with very low clipping; guard against C == 0
# Batch size effect (larger batches help significantly)
if batch_size >= 256:
batch_factor = -0.02 # Bonus for large batches
elif batch_size >= 128:
batch_factor = 0.01 # Small penalty
else:
batch_factor = min(0.08, 0.001 * (128 - batch_size))
# Epochs effect (more training helps overcome noise)
if epochs >= 10:
epoch_factor = -0.03 # Bonus for sufficient training
elif epochs >= 5:
epoch_factor = 0.01 # Small penalty
else:
epoch_factor = 0.05 # Penalty for insufficient training
total_degradation = noise_degradation + clipping_degradation + batch_factor + epoch_factor
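# Illustrative example: σ=1.0, C=1.0, batch_size=256, epochs=10
# → 0.05 + 0.03 - 0.02 - 0.03 = 0.03 total degradation, i.e. privacy_factor ≈ 0.97 below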
privacy_factor = 1.0 - max(0, total_degradation) # Much less degradation overall
return max(0.7, privacy_factor) # Ensure minimum 70% of original performance (can achieve 85%+ with good params)
def _generate_iteration_data(self, epochs: int, privacy_factor: float, batch_size: int) -> List[Dict[str, float]]:
"""Generate realistic iteration-wise training metrics."""
iterations_data = []
# Simulate ~60,000 training samples, so iterations_per_epoch = 60000 / batch_size
dataset_size = 60000
iterations_per_epoch = dataset_size // batch_size
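# e.g. batch_size=128 → 60000 // 128 = 468 iterations per epoch (≈ 47 sampled points per epoch below)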
# Realistic base learning curve parameters
base_accuracy = self.base_accuracy * privacy_factor
base_loss = self.base_loss / privacy_factor
current_iteration = 0
for epoch in range(1, epochs + 1):
for iteration_in_epoch in range(0, iterations_per_epoch, 10): # Sample every 10th
current_iteration += 10
# Overall progress through all training
total_iterations = epochs * iterations_per_epoch
overall_progress = current_iteration / total_iterations
# More realistic learning curve: slower start, plateau effect
learning_progress = 1 - np.exp(-3 * overall_progress) # Exponential approach to target
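# 1 - exp(-3x) rises to ≈ 0.78 at the halfway point and ≈ 0.95 by the end of training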
# Add realistic variation (DP-SGD has more noise)
noise_std = 0.08 if privacy_factor < 0.7 else 0.04 # More noise for high privacy
noise = np.random.normal(0, noise_std)
# Calculate realistic accuracy progression
target_accuracy = base_accuracy * (0.4 + 0.6 * learning_progress)
accuracy = target_accuracy + noise
# Calculate corresponding loss
target_loss = base_loss * (1.5 - 0.5 * learning_progress)
loss = target_loss - noise * 0.3 # Loss inversely correlated with accuracy
# Add some iteration-level oscillations (typical of SGD)
oscillation = 0.015 * np.sin(current_iteration * 0.05)
accuracy += oscillation
loss -= oscillation * 0.5
iterations_data.append({
'iteration': current_iteration,
'epoch': epoch,
'accuracy': max(5, min(95, accuracy * 100)), # Realistic bounds
'loss': max(0.05, loss),
'train_accuracy': max(5, min(95, (accuracy + np.random.normal(0, 0.02)) * 100)),
'train_loss': max(0.05, loss + np.random.normal(0, 0.1))
})
return iterations_data
def _generate_epoch_data(self, epochs: int, privacy_factor: float) -> List[Dict[str, float]]:
"""Generate realistic training metrics for each epoch."""
epochs_data = []
# Realistic base learning curve parameters
base_accuracy = self.base_accuracy * privacy_factor
base_loss = self.base_loss / privacy_factor
for epoch in range(1, epochs + 1):
# Realistic learning curve: fast early improvement, then plateau
progress = epoch / epochs
learning_factor = 1 - np.exp(-2.5 * progress) # Exponential learning curve
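# With this decay rate the curve reaches ≈ 0.71 at mid-training and ≈ 0.92 at the final epoch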
# Add realistic epoch-to-epoch variation
noise_std = 0.03 if privacy_factor < 0.7 else 0.015
noise = np.random.normal(0, noise_std)
# Calculate realistic metrics
accuracy = base_accuracy * (0.4 + 0.6 * learning_factor) + noise
loss = base_loss * (1.4 - 0.4 * learning_factor) - noise * 0.3
epochs_data.append({
'epoch': epoch,
'accuracy': max(5, min(95, accuracy * 100)), # Convert to percentage with bounds
'loss': max(0.05, loss),
'train_accuracy': max(5, min(95, (accuracy + np.random.normal(0, 0.01)) * 100)),
'train_loss': max(0.05, loss + np.random.normal(0, 0.05))
})
return epochs_data
def _calculate_final_metrics(self, epochs_data: List[Dict[str, float]], privacy_factor: float) -> Dict[str, float]:
"""Calculate final training metrics that are CONSISTENT with epoch data."""
if not epochs_data:
return {'accuracy': 50.0, 'loss': 1.0, 'training_time': 1.0}
# Use the LAST epoch's results as final metrics (consistency!)
final_epoch = epochs_data[-1]
# Training time should be realistic for DP-SGD (slower than normal)
base_time = len(epochs_data) * 0.8 # Base time per epoch
privacy_slowdown = (2.0 - privacy_factor) # DP-SGD is slower
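# e.g. 10 epochs at privacy_factor 0.97 → 10 * 0.8 * 1.03 ≈ 8.2 time units before random variation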
time_variation = 1.0 + np.random.normal(0, 0.1)
return {
'accuracy': final_epoch['accuracy'], # Consistent with training progress!
'loss': final_epoch['loss'],
'training_time': base_time * privacy_slowdown * time_variation
}
def _generate_recommendations(self, params: Dict[str, Any], metrics: Dict[str, float]) -> List[Dict[str, str]]:
"""Generate realistic recommendations based on DP-SGD best practices."""
recommendations = []
# Noise multiplier recommendations (critical for DP-SGD)
if params['noise_multiplier'] < 0.5:
recommendations.append({
'icon': '🔒',
'text': 'Very low noise provides minimal privacy. Consider σ ≥ 0.8 for meaningful privacy.'
})
elif params['noise_multiplier'] > 2.0:
recommendations.append({
'icon': '⚠️',
'text': 'High noise (σ > 2.0) significantly degrades accuracy. Try reducing to 0.8-1.5.'
})
elif params['noise_multiplier'] > 1.5:
recommendations.append({
'icon': '💡',
'text': 'Consider reducing noise multiplier to 0.8-1.2 for better utility-privacy trade-off.'
})
# Clipping norm recommendations
if params['clipping_norm'] < 0.5:
recommendations.append({
'icon': '⚠️',
'text': 'Very low clipping norm can prevent learning. Try C = 1.0-2.0.'
})
elif params['clipping_norm'] > 3.0:
recommendations.append({
'icon': '🔒',
'text': 'Large clipping norm reduces privacy protection. Consider C ≤ 2.0.'
})
# Batch size recommendations (important for DP-SGD)
if params['batch_size'] < 64:
recommendations.append({
'icon': '⚡',
'text': 'Small batch sizes amplify noise effects. Try batch size ≥ 128 for better stability.'
})
elif params['batch_size'] > 512:
recommendations.append({
'icon': '💾',
'text': 'Very large batch sizes may require more memory and longer training time.'
})
# Learning rate recommendations
if params['learning_rate'] > 0.05:
recommendations.append({
'icon': '⚠️',
'text': 'High learning rate with noise can destabilize training. Try ≤ 0.02.'
})
elif params['learning_rate'] < 0.005:
recommendations.append({
'icon': '⏳',
'text': 'Very low learning rate may require more epochs for convergence.'
})
# Epochs recommendations
if params['epochs'] < 5:
recommendations.append({
'icon': '📈',
'text': 'Few epochs may not be enough to overcome noise. Try 8-15 epochs.'
})
elif params['epochs'] > 20:
recommendations.append({
'icon': '🔒',
'text': 'Many epochs increase privacy cost. Consider early stopping around 10-15 epochs.'
})
# Accuracy-based recommendations
if metrics['accuracy'] < 60:
recommendations.append({
'icon': '📉',
'text': 'Low accuracy suggests too much noise. Reduce σ or increase C for better utility.'
})
elif metrics['accuracy'] > 85:
recommendations.append({
'icon': '🎯',
'text': 'Good accuracy! This is a well-balanced privacy-utility trade-off.'
})
return recommendations
def generate_gradient_norms(self, clipping_norm: float) -> List[Dict[str, float]]:
"""Generate realistic gradient norms following a log-normal distribution."""
num_points = 100
gradients = []
# Parameters for log-normal distribution
mu = np.log(max(clipping_norm, 1e-6)) - 0.5 # Guard against C == 0
sigma = 0.8
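# With mu = ln(C) - 0.5 the median simulated norm is exp(mu) ≈ 0.61 * C,
# so roughly a quarter of the draws exceed the clipping norm and will be clipped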
for _ in range(num_points):
# Draw a log-normally distributed gradient norm
norm = np.random.lognormal(mu, sigma)
# Evaluate the analytic log-normal density at this norm (not a kernel density estimate)
density = np.exp(-((np.log(norm) - mu) ** 2) / (2 * sigma * sigma)) / (norm * sigma * np.sqrt(2 * np.pi))
# Shift and jitter the density slightly so the mock curve looks less synthetic
density = 0.2 + density + 0.1 * (np.random.random() - 0.5)
gradients.append({'x': float(norm), 'y': float(density)})
return sorted(gradients, key=lambda x: x['x'])
def generate_clipped_gradients(self, clipping_norm: float) -> List[Dict[str, float]]:
"""Generate clipped versions of the gradient norms."""
original_gradients = self.generate_gradient_norms(clipping_norm)
return [{'x': min(g['x'], clipping_norm), 'y': g['y']} for g in original_gradients]
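
# Example usage (illustrative only): the parameter values below are arbitrary
# assumptions chosen to show the expected input keys and the structure of the
# result returned by MockTrainer.train().
if __name__ == "__main__":
    trainer = MockTrainer()
    example_params = {
        'clipping_norm': 1.0,      # assumed value for demonstration
        'noise_multiplier': 1.0,   # assumed value for demonstration
        'batch_size': 256,
        'learning_rate': 0.01,
        'epochs': 10
    }
    results = trainer.train(example_params)
    print(f"Final accuracy: {results['final_metrics']['accuracy']:.1f}%")
    print(f"Final loss: {results['final_metrics']['loss']:.3f}")
    print(f"Estimated privacy budget ε: {results['privacy_budget']:.2f}")
    print(f"Number of recommendations: {len(results['recommendations'])}")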