|
""" EfficientNet / MobileNetV3 Blocks and Builder |
|
|
|
Copyright 2020 Ross Wightman |
|
""" |
|
import math
import re

from copy import deepcopy
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F

from .conv2d_layers import *
from geffnet.activations import *
|
|
|
__all__ = ['get_bn_args_tf', 'resolve_bn_args', 'resolve_se_args', 'resolve_act_layer', 'make_divisible', |
|
'round_channels', 'drop_connect', 'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv', |
|
'InvertedResidual', 'CondConvResidual', 'EdgeResidual', 'EfficientNetBuilder', 'decode_arch_def', |
|
'initialize_weight_default', 'initialize_weight_goog', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT' |
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# BatchNorm defaults matching the TF reference implementations. Note that PyTorch's
# BatchNorm `momentum` corresponds to (1 - decay) in TF, hence 1 - 0.99 below.
BN_MOMENTUM_TF_DEFAULT = 1 - 0.99
|
BN_EPS_TF_DEFAULT = 1e-3 |
|
_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT) |
|
|
|
|
|
def get_bn_args_tf(): |
|
return _BN_ARGS_TF.copy() |
|
|
|
|
|
def resolve_bn_args(kwargs): |
|
bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {} |
|
bn_momentum = kwargs.pop('bn_momentum', None) |
|
if bn_momentum is not None: |
|
bn_args['momentum'] = bn_momentum |
|
bn_eps = kwargs.pop('bn_eps', None) |
|
if bn_eps is not None: |
|
bn_args['eps'] = bn_eps |
|
return bn_args |
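
# Usage sketch (illustrative only, not called in this module): resolve_bn_args pops its
# keys, so the passed kwargs dict is mutated in place.
#
#   kwargs = dict(bn_tf=True, bn_momentum=0.01)
#   bn_args = resolve_bn_args(kwargs)   # -> {'momentum': 0.01, 'eps': 1e-3}
#   # 'bn_tf' and 'bn_momentum' have been popped from kwargs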
|
|
|
|
|
_SE_ARGS_DEFAULT = dict( |
|
gate_fn=sigmoid, |
|
act_layer=None, |
|
reduce_mid=False, |
|
divisor=1) |
|
|
|
|
|
def resolve_se_args(kwargs, in_chs, act_layer=None): |
|
se_kwargs = kwargs.copy() if kwargs is not None else {} |
|
|
|
for k, v in _SE_ARGS_DEFAULT.items(): |
|
se_kwargs.setdefault(k, v) |
|
|
|
if not se_kwargs.pop('reduce_mid'): |
|
se_kwargs['reduced_base_chs'] = in_chs |
|
|
|
if se_kwargs['act_layer'] is None: |
|
assert act_layer is not None |
|
se_kwargs['act_layer'] = act_layer |
|
return se_kwargs |
|
|
|
|
|
def resolve_act_layer(kwargs, default='relu'): |
|
act_layer = kwargs.pop('act_layer', default) |
|
if isinstance(act_layer, str): |
|
act_layer = get_act_layer(act_layer) |
|
return act_layer |
|
|
|
|
|
def make_divisible(v: int, divisor: int = 8, min_value: int = None): |
|
min_value = min_value or divisor |
|
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) |
|
if new_v < 0.9 * v: |
|
new_v += divisor |
|
return new_v |
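
# Rounding sketch: make_divisible snaps a channel count to a multiple of `divisor`,
# never dropping more than ~10% below the requested value.
#
#   make_divisible(37)   # -> 40
#   make_divisible(30)   # -> 32
#   make_divisible(100)  # -> 104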
|
|
|
|
|
def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None): |
|
"""Round number of filters based on depth multiplier.""" |
|
if not multiplier: |
|
return channels |
|
channels *= multiplier |
|
return make_divisible(channels, divisor, channel_min) |
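
# Width-scaling sketch (EfficientNet-style channel multiplier):
#
#   round_channels(32, multiplier=1.2)   # 32 * 1.2 = 38.4 -> rounded up to 40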
|
|
|
|
|
def drop_connect(inputs, training: bool = False, drop_connect_rate: float = 0.): |
|
"""Apply drop connect.""" |
|
if not training: |
|
return inputs |
|
|
|
keep_prob = 1 - drop_connect_rate |
|
random_tensor = keep_prob + torch.rand( |
|
(inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device) |
|
random_tensor.floor_() |
|
output = inputs.div(keep_prob) * random_tensor |
|
return output |
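
# Behaviour sketch: during training each sample in the batch is either zeroed or kept and
# scaled by 1 / keep_prob, so the expected value of the output matches the input.
#
#   x = torch.randn(8, 32, 14, 14)
#   y = drop_connect(x, training=True, drop_connect_rate=0.2)   # ~80% of samples survive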
|
|
|
|
|
class SqueezeExcite(nn.Module): |
|
|
|
def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1): |
|
super(SqueezeExcite, self).__init__() |
|
reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor) |
|
self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) |
|
self.act1 = act_layer(inplace=True) |
|
self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) |
|
self.gate_fn = gate_fn |
|
|
|
def forward(self, x): |
|
x_se = x.mean((2, 3), keepdim=True) |
|
x_se = self.conv_reduce(x_se) |
|
x_se = self.act1(x_se) |
|
x_se = self.conv_expand(x_se) |
|
x = x * self.gate_fn(x_se) |
|
return x |
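
# Usage sketch (shapes only): squeeze spatially, then gate per-channel.
#
#   se = SqueezeExcite(in_chs=64, se_ratio=0.25)   # 16 channels in the bottleneck
#   out = se(torch.randn(2, 64, 28, 28))           # -> (2, 64, 28, 28)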
|
|
|
|
|
class ConvBnAct(nn.Module): |
|
def __init__(self, in_chs, out_chs, kernel_size, |
|
stride=1, pad_type='', act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, norm_kwargs=None): |
|
super(ConvBnAct, self).__init__() |
|
assert stride in [1, 2] |
|
norm_kwargs = norm_kwargs or {} |
|
self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, padding=pad_type) |
|
self.bn1 = norm_layer(out_chs, **norm_kwargs) |
|
self.act1 = act_layer(inplace=True) |
|
|
|
def forward(self, x): |
|
x = self.conv(x) |
|
x = self.bn1(x) |
|
x = self.act1(x) |
|
return x |
|
|
|
|
|
class DepthwiseSeparableConv(nn.Module): |
|
""" DepthwiseSeparable block |
|
Used for DS convs in MobileNet-V1 and in the place of IR blocks with an expansion |
|
factor of 1.0. This is an alternative to having a IR with optional first pw conv. |
|
""" |
|
def __init__(self, in_chs, out_chs, dw_kernel_size=3, |
|
stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, |
|
pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None, |
|
norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.): |
|
super(DepthwiseSeparableConv, self).__init__() |
|
assert stride in [1, 2] |
|
norm_kwargs = norm_kwargs or {} |
|
self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip |
|
self.drop_connect_rate = drop_connect_rate |
|
|
|
self.conv_dw = select_conv2d( |
|
in_chs, in_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True) |
|
self.bn1 = norm_layer(in_chs, **norm_kwargs) |
|
self.act1 = act_layer(inplace=True) |
|
|
|
|
|
if se_ratio is not None and se_ratio > 0.: |
|
se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) |
|
self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs) |
|
else: |
|
self.se = nn.Identity() |
|
|
|
self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type) |
|
self.bn2 = norm_layer(out_chs, **norm_kwargs) |
|
self.act2 = act_layer(inplace=True) if pw_act else nn.Identity() |
|
|
|
def forward(self, x): |
|
residual = x |
|
|
|
x = self.conv_dw(x) |
|
x = self.bn1(x) |
|
x = self.act1(x) |
|
|
|
x = self.se(x) |
|
|
|
x = self.conv_pw(x) |
|
x = self.bn2(x) |
|
x = self.act2(x) |
|
|
|
if self.has_residual: |
|
if self.drop_connect_rate > 0.: |
|
x = drop_connect(x, self.training, self.drop_connect_rate) |
|
x += residual |
|
return x |
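
# Sketch: a 'ds' block such as the one decoded from 'ds_r1_k3_s1_e1_c16_se0.25' maps to
# roughly this construction (activation and norm args are normally filled in by the builder):
#
#   block = DepthwiseSeparableConv(32, 16, dw_kernel_size=3, stride=1,
#                                  act_layer=nn.ReLU, se_ratio=0.25)
#   out = block(torch.randn(1, 32, 112, 112))   # -> (1, 16, 112, 112)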
|
|
|
|
|
class InvertedResidual(nn.Module): |
|
""" Inverted residual block w/ optional SE""" |
|
|
|
def __init__(self, in_chs, out_chs, dw_kernel_size=3, |
|
stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, |
|
exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, |
|
se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, |
|
conv_kwargs=None, drop_connect_rate=0.): |
|
super(InvertedResidual, self).__init__() |
|
norm_kwargs = norm_kwargs or {} |
|
conv_kwargs = conv_kwargs or {} |
|
mid_chs: int = make_divisible(in_chs * exp_ratio) |
|
self.has_residual = (in_chs == out_chs and stride == 1) and not noskip |
|
self.drop_connect_rate = drop_connect_rate |
|
|
|
|
|
self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs) |
|
self.bn1 = norm_layer(mid_chs, **norm_kwargs) |
|
self.act1 = act_layer(inplace=True) |
|
|
|
|
|
self.conv_dw = select_conv2d( |
|
mid_chs, mid_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True, **conv_kwargs) |
|
self.bn2 = norm_layer(mid_chs, **norm_kwargs) |
|
self.act2 = act_layer(inplace=True) |
|
|
|
|
|
if se_ratio is not None and se_ratio > 0.: |
|
se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) |
|
self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) |
|
else: |
|
self.se = nn.Identity() |
|
|
|
|
|
self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) |
|
self.bn3 = norm_layer(out_chs, **norm_kwargs) |
|
|
|
def forward(self, x): |
|
residual = x |
|
|
|
|
|
x = self.conv_pw(x) |
|
x = self.bn1(x) |
|
x = self.act1(x) |
|
|
|
|
|
x = self.conv_dw(x) |
|
x = self.bn2(x) |
|
x = self.act2(x) |
|
|
|
|
|
x = self.se(x) |
|
|
|
|
|
x = self.conv_pwl(x) |
|
x = self.bn3(x) |
|
|
|
if self.has_residual: |
|
if self.drop_connect_rate > 0.: |
|
x = drop_connect(x, self.training, self.drop_connect_rate) |
|
x += residual |
|
return x |
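
# Sketch: an MBConv-style 'ir' block, e.g. as decoded from 'ir_r2_k3_s2_e6_c24_se0.25':
#
#   block = InvertedResidual(16, 24, dw_kernel_size=3, stride=2, exp_ratio=6.0,
#                            act_layer=nn.ReLU, se_ratio=0.25)
#   out = block(torch.randn(1, 16, 112, 112))   # -> (1, 24, 56, 56), no residual (stride 2)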
|
|
|
|
|
class CondConvResidual(InvertedResidual): |
|
""" Inverted residual block w/ CondConv routing""" |
|
|
|
def __init__(self, in_chs, out_chs, dw_kernel_size=3, |
|
stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, |
|
exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, |
|
se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, |
|
num_experts=0, drop_connect_rate=0.): |
|
|
|
self.num_experts = num_experts |
|
conv_kwargs = dict(num_experts=self.num_experts) |
|
|
|
super(CondConvResidual, self).__init__( |
|
in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, pad_type=pad_type, |
|
act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size, |
|
pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs, |
|
norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs, |
|
drop_connect_rate=drop_connect_rate) |
|
|
|
self.routing_fn = nn.Linear(in_chs, self.num_experts) |
|
|
|
def forward(self, x): |
|
residual = x |
|
|
|
|
|
pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1) |
|
routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs)) |
|
|
|
|
|
x = self.conv_pw(x, routing_weights) |
|
x = self.bn1(x) |
|
x = self.act1(x) |
|
|
|
|
|
x = self.conv_dw(x, routing_weights) |
|
x = self.bn2(x) |
|
x = self.act2(x) |
|
|
|
|
|
x = self.se(x) |
|
|
|
|
|
x = self.conv_pwl(x, routing_weights) |
|
x = self.bn3(x) |
|
|
|
if self.has_residual: |
|
if self.drop_connect_rate > 0.: |
|
x = drop_connect(x, self.training, self.drop_connect_rate) |
|
x += residual |
|
return x |
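
# Sketch: identical to InvertedResidual except each conv mixes `num_experts` kernels
# per sample, weighted by a sigmoid routing over globally pooled input features.
#
#   block = CondConvResidual(16, 24, stride=1, exp_ratio=6.0, act_layer=nn.ReLU,
#                            se_ratio=0.25, num_experts=4)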
|
|
|
|
|
class EdgeResidual(nn.Module): |
|
""" EdgeTPU Residual block with expansion convolution followed by pointwise-linear w/ stride""" |
|
|
|
def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0, |
|
stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1, |
|
se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.): |
|
super(EdgeResidual, self).__init__() |
|
norm_kwargs = norm_kwargs or {} |
|
mid_chs = make_divisible(fake_in_chs * exp_ratio) if fake_in_chs > 0 else make_divisible(in_chs * exp_ratio) |
|
self.has_residual = (in_chs == out_chs and stride == 1) and not noskip |
|
self.drop_connect_rate = drop_connect_rate |
|
|
|
|
|
self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type) |
|
self.bn1 = norm_layer(mid_chs, **norm_kwargs) |
|
self.act1 = act_layer(inplace=True) |
|
|
|
|
|
if se_ratio is not None and se_ratio > 0.: |
|
se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) |
|
self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) |
|
else: |
|
self.se = nn.Identity() |
|
|
|
|
|
self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, stride=stride, padding=pad_type) |
|
        self.bn2 = norm_layer(out_chs, **norm_kwargs)
|
|
|
def forward(self, x): |
|
residual = x |
|
|
|
|
|
x = self.conv_exp(x) |
|
x = self.bn1(x) |
|
x = self.act1(x) |
|
|
|
|
|
x = self.se(x) |
|
|
|
|
|
x = self.conv_pwl(x) |
|
x = self.bn2(x) |
|
|
|
if self.has_residual: |
|
if self.drop_connect_rate > 0.: |
|
x = drop_connect(x, self.training, self.drop_connect_rate) |
|
x += residual |
|
|
|
return x |
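
# Sketch: an EdgeTPU 'er' block expands with a full (e.g. 3x3) conv instead of a 1x1,
# then projects with a strided pointwise-linear conv:
#
#   block = EdgeResidual(24, 32, exp_kernel_size=3, exp_ratio=4.0, stride=2, act_layer=nn.ReLU)
#   out = block(torch.randn(1, 24, 56, 56))   # -> (1, 32, 28, 28)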
|
|
|
|
|
class EfficientNetBuilder: |
|
""" Build Trunk Blocks for Efficient/Mobile Networks |
|
|
|
This ended up being somewhat of a cross between |
|
https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py |
|
and |
|
https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py |
|
|
|
""" |
|
|
|
def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None, |
|
pad_type='', act_layer=None, se_kwargs=None, |
|
norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.): |
|
self.channel_multiplier = channel_multiplier |
|
self.channel_divisor = channel_divisor |
|
self.channel_min = channel_min |
|
self.pad_type = pad_type |
|
self.act_layer = act_layer |
|
self.se_kwargs = se_kwargs |
|
self.norm_layer = norm_layer |
|
self.norm_kwargs = norm_kwargs |
|
self.drop_connect_rate = drop_connect_rate |
|
|
|
|
|
self.in_chs = None |
|
self.block_idx = 0 |
|
self.block_count = 0 |
|
|
|
def _round_channels(self, chs): |
|
return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min) |
|
|
|
def _make_block(self, ba): |
|
bt = ba.pop('block_type') |
|
ba['in_chs'] = self.in_chs |
|
ba['out_chs'] = self._round_channels(ba['out_chs']) |
|
if 'fake_in_chs' in ba and ba['fake_in_chs']: |
|
|
|
ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs']) |
|
ba['norm_layer'] = self.norm_layer |
|
ba['norm_kwargs'] = self.norm_kwargs |
|
ba['pad_type'] = self.pad_type |
|
|
|
ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer |
|
assert ba['act_layer'] is not None |
|
if bt == 'ir': |
|
ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count |
|
ba['se_kwargs'] = self.se_kwargs |
|
if ba.get('num_experts', 0) > 0: |
|
block = CondConvResidual(**ba) |
|
else: |
|
block = InvertedResidual(**ba) |
|
elif bt == 'ds' or bt == 'dsa': |
|
ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count |
|
ba['se_kwargs'] = self.se_kwargs |
|
block = DepthwiseSeparableConv(**ba) |
|
elif bt == 'er': |
|
ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count |
|
ba['se_kwargs'] = self.se_kwargs |
|
block = EdgeResidual(**ba) |
|
elif bt == 'cn': |
|
block = ConvBnAct(**ba) |
|
else: |
|
            assert False, 'Unknown block type (%s) while building model.' % bt
|
self.in_chs = ba['out_chs'] |
|
return block |
|
|
|
def _make_stack(self, stack_args): |
|
blocks = [] |
|
|
|
for i, ba in enumerate(stack_args): |
|
if i >= 1: |
|
|
|
ba['stride'] = 1 |
|
block = self._make_block(ba) |
|
blocks.append(block) |
|
self.block_idx += 1 |
|
return nn.Sequential(*blocks) |
|
|
|
def __call__(self, in_chs, block_args): |
|
""" Build the blocks |
|
Args: |
|
in_chs: Number of input-channels passed to first block |
|
block_args: A list of lists, outer list defines stages, inner |
|
list contains strings defining block configuration(s) |
|
Return: |
|
List of block stacks (each stack wrapped in nn.Sequential) |
|
""" |
|
self.in_chs = in_chs |
|
self.block_count = sum([len(x) for x in block_args]) |
|
self.block_idx = 0 |
|
blocks = [] |
|
|
|
for stack_idx, stack in enumerate(block_args): |
|
assert isinstance(stack, list) |
|
stack = self._make_stack(stack) |
|
blocks.append(stack) |
|
return blocks |
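
# End-to-end sketch: decode an arch definition and build the trunk stages. The two-stage
# arch_def below is illustrative only, not a real model definition.
#
#   arch_def = [
#       ['ds_r1_k3_s1_e1_c16'],
#       ['ir_r2_k3_s2_e4_c24'],
#   ]
#   builder = EfficientNetBuilder(act_layer=get_act_layer('relu'))
#   stages = builder(16, decode_arch_def(arch_def))      # list of two nn.Sequential stacks
#   out = nn.Sequential(*stages)(torch.randn(1, 16, 112, 112))   # -> (1, 24, 56, 56)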
|
|
|
|
|
def _parse_ksize(ss): |
|
if ss.isdigit(): |
|
return int(ss) |
|
else: |
|
return [int(k) for k in ss.split('.')] |
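
# '3' -> 3, while a mixed-kernel spec such as '3.5.7' -> [3, 5, 7]
# (dot-separated kernel sizes are used for MixNet-style mixed depthwise convs).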
|
|
|
|
|
def _decode_block_str(block_str): |
|
""" Decode block definition string |
|
|
|
Gets a list of block arg (dicts) through a string notation of arguments. |
|
E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip |
|
|
|
All args can exist in any order with the exception of the leading string which |
|
is assumed to indicate the block type. |
|
|
|
    leading string - block type (
      ir = InvertedResidual, ds = DepthwiseSep, dsa = DepthwiseSep with pw act,
      er = EdgeResidual, cn = ConvBnAct)
|
r - number of repeat blocks, |
|
k - kernel size, |
|
s - strides (1-9), |
|
e - expansion ratio, |
|
c - output channels, |
|
se - squeeze/excitation ratio |
|
n - activation fn ('re', 'r6', 'hs', or 'sw') |
|
Args: |
|
block_str: a string representation of block arguments. |
|
Returns: |
|
A list of block args (dicts) |
|
Raises: |
|
        ValueError: if the string def is not properly specified (TODO)
|
""" |
|
assert isinstance(block_str, str) |
|
ops = block_str.split('_') |
|
block_type = ops[0] |
|
ops = ops[1:] |
|
options = {} |
|
noskip = False |
|
for op in ops: |
|
|
|
if op == 'noskip': |
|
noskip = True |
|
elif op.startswith('n'): |
|
|
|
key = op[0] |
|
v = op[1:] |
|
if v == 're': |
|
value = get_act_layer('relu') |
|
elif v == 'r6': |
|
value = get_act_layer('relu6') |
|
elif v == 'hs': |
|
value = get_act_layer('hard_swish') |
|
elif v == 'sw': |
|
value = get_act_layer('swish') |
|
else: |
|
continue |
|
options[key] = value |
|
else: |
|
|
|
splits = re.split(r'(\d.*)', op) |
|
if len(splits) >= 2: |
|
key, value = splits[:2] |
|
options[key] = value |
|
|
|
|
|
act_layer = options['n'] if 'n' in options else None |
|
exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1 |
|
pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1 |
|
fake_in_chs = int(options['fc']) if 'fc' in options else 0 |
|
|
|
num_repeat = int(options['r']) |
|
|
|
if block_type == 'ir': |
|
block_args = dict( |
|
block_type=block_type, |
|
dw_kernel_size=_parse_ksize(options['k']), |
|
exp_kernel_size=exp_kernel_size, |
|
pw_kernel_size=pw_kernel_size, |
|
out_chs=int(options['c']), |
|
exp_ratio=float(options['e']), |
|
se_ratio=float(options['se']) if 'se' in options else None, |
|
stride=int(options['s']), |
|
act_layer=act_layer, |
|
noskip=noskip, |
|
) |
|
if 'cc' in options: |
|
block_args['num_experts'] = int(options['cc']) |
|
elif block_type == 'ds' or block_type == 'dsa': |
|
block_args = dict( |
|
block_type=block_type, |
|
dw_kernel_size=_parse_ksize(options['k']), |
|
pw_kernel_size=pw_kernel_size, |
|
out_chs=int(options['c']), |
|
se_ratio=float(options['se']) if 'se' in options else None, |
|
stride=int(options['s']), |
|
act_layer=act_layer, |
|
pw_act=block_type == 'dsa', |
|
noskip=block_type == 'dsa' or noskip, |
|
) |
|
elif block_type == 'er': |
|
block_args = dict( |
|
block_type=block_type, |
|
exp_kernel_size=_parse_ksize(options['k']), |
|
pw_kernel_size=pw_kernel_size, |
|
out_chs=int(options['c']), |
|
exp_ratio=float(options['e']), |
|
fake_in_chs=fake_in_chs, |
|
se_ratio=float(options['se']) if 'se' in options else None, |
|
stride=int(options['s']), |
|
act_layer=act_layer, |
|
noskip=noskip, |
|
) |
|
elif block_type == 'cn': |
|
block_args = dict( |
|
block_type=block_type, |
|
kernel_size=int(options['k']), |
|
out_chs=int(options['c']), |
|
stride=int(options['s']), |
|
act_layer=act_layer, |
|
) |
|
else: |
|
assert False, 'Unknown block type (%s)' % block_type |
|
|
|
return block_args, num_repeat |
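
# Decoding sketch:
#
#   ba, rep = _decode_block_str('ir_r2_k3_s2_e6_c24_se0.25')
#   # rep == 2 and ba == dict(block_type='ir', dw_kernel_size=3, exp_kernel_size=1,
#   #                         pw_kernel_size=1, out_chs=24, exp_ratio=6.0, se_ratio=0.25,
#   #                         stride=2, act_layer=None, noskip=False)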
|
|
|
|
|
def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'): |
|
""" Per-stage depth scaling |
|
Scales the block repeats in each stage. This depth scaling impl maintains |
|
compatibility with the EfficientNet scaling method, while allowing sensible |
|
scaling for other models that may have multiple block arg definitions in each stage. |
|
""" |
|
|
|
|
|
|
|
    # There may be multiple block arg definitions per stage, so sum the repeats first.
    num_repeat = sum(repeats)
|
if depth_trunc == 'round': |
|
|
|
|
|
|
|
        # Truncating via 'round' keeps stages with few repeats proportionally smaller for longer.
        num_repeat_scaled = max(1, round(num_repeat * depth_multiplier))
|
else: |
|
|
|
|
|
        # The EfficientNet default truncates via 'ceil'; any multiplier > 1.0 deepens every stage.
        num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier))
|
|
|
|
|
|
|
|
|
    # Distribute the scaled repeat count proportionally across the block definitions in the
    # stage, walking in reverse so the first block is the least likely to gain extra repeats.
    repeats_scaled = []
|
for r in repeats[::-1]: |
|
rs = max(1, round((r / num_repeat * num_repeat_scaled))) |
|
repeats_scaled.append(rs) |
|
num_repeat -= r |
|
num_repeat_scaled -= rs |
|
repeats_scaled = repeats_scaled[::-1] |
|
|
|
|
|
sa_scaled = [] |
|
for ba, rep in zip(stack_args, repeats_scaled): |
|
sa_scaled.extend([deepcopy(ba) for _ in range(rep)]) |
|
return sa_scaled |
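
# Worked example: a stage with repeats [2] and depth_multiplier=1.2 under 'ceil' scales to
# ceil(2 * 1.2) = 3 repeats, i.e. the single block arg dict is deep-copied three times.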
|
|
|
|
|
def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1, fix_first_last=False): |
|
arch_args = [] |
|
for stack_idx, block_strings in enumerate(arch_def): |
|
assert isinstance(block_strings, list) |
|
stack_args = [] |
|
repeats = [] |
|
for block_str in block_strings: |
|
assert isinstance(block_str, str) |
|
ba, rep = _decode_block_str(block_str) |
|
if ba.get('num_experts', 0) > 0 and experts_multiplier > 1: |
|
ba['num_experts'] *= experts_multiplier |
|
stack_args.append(ba) |
|
repeats.append(rep) |
|
if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1): |
|
arch_args.append(_scale_stage_depth(stack_args, repeats, 1.0, depth_trunc)) |
|
else: |
|
arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc)) |
|
return arch_args |
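
# Note: with fix_first_last=True the first and last stages keep their original depth
# (multiplier 1.0), which is useful when stem/head stages should not be deepened.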
|
|
|
|
|
def initialize_weight_goog(m, n='', fix_group_fanout=True): |
|
|
|
|
|
if isinstance(m, CondConv2d): |
|
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels |
|
if fix_group_fanout: |
|
fan_out //= m.groups |
|
init_weight_fn = get_condconv_initializer( |
|
lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) |
|
init_weight_fn(m.weight) |
|
if m.bias is not None: |
|
m.bias.data.zero_() |
|
elif isinstance(m, nn.Conv2d): |
|
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels |
|
if fix_group_fanout: |
|
fan_out //= m.groups |
|
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) |
|
if m.bias is not None: |
|
m.bias.data.zero_() |
|
elif isinstance(m, nn.BatchNorm2d): |
|
m.weight.data.fill_(1.0) |
|
m.bias.data.zero_() |
|
elif isinstance(m, nn.Linear): |
|
fan_out = m.weight.size(0) |
|
fan_in = 0 |
|
if 'routing_fn' in n: |
|
fan_in = m.weight.size(1) |
|
init_range = 1.0 / math.sqrt(fan_in + fan_out) |
|
m.weight.data.uniform_(-init_range, init_range) |
|
m.bias.data.zero_() |
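
# Typical application (sketch, assuming `model` is a constructed network):
#
#   for n, m in model.named_modules():
#       initialize_weight_goog(m, n)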
|
|
|
|
|
def initialize_weight_default(m, n=''): |
|
if isinstance(m, CondConv2d): |
|
init_fn = get_condconv_initializer(partial( |
|
nn.init.kaiming_normal_, mode='fan_out', nonlinearity='relu'), m.num_experts, m.weight_shape) |
|
init_fn(m.weight) |
|
elif isinstance(m, nn.Conv2d): |
|
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') |
|
elif isinstance(m, nn.BatchNorm2d): |
|
m.weight.data.fill_(1.0) |
|
m.bias.data.zero_() |
|
elif isinstance(m, nn.Linear): |
|
nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear') |
|
|