| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549 |
- # @package optimizer
- # Module caffe2.python.regularizer
- from caffe2.python import core, utils
- import numpy as np
class RegularizationBy(object):
    """String constants naming the stage at which a regularizer is applied.

    ``Regularizer.__call__`` validates its ``by`` argument against these
    values and appends the value to ``"_run_"`` to find the method to invoke.
    """

    # Dispatches to ``_run_after_optimizer``.
    AFTER_OPTIMIZER = "after_optimizer"
    # Dispatches to ``_run_on_loss``.
    ON_LOSS = "on_loss"
class Regularizer(object):
    """
    Adds regularization to train_net for a given parameter. The factor applied
    to the regularization term is supplied at initialization time.
    The param should be a BlobReference.
    """

    def __init__(self):
        # Small positive margin; _ensure_clipped uses it to pull an open
        # bound strictly inside the interval.
        self.kEpsilon = 1e-9

    def __call__(self, net, param_init_net, param, grad=None, by=None):
        """Run the stage-specific implementation selected by ``by``.

        Args:
            net: net the regularization operators are added to.
            param_init_net: initialization net, forwarded to the handler.
            param: the parameter blob; must be a ``core.BlobReference``.
            grad: optional gradient, forwarded to the handler.
            by: one of the ``RegularizationBy`` values.

        Returns:
            Whatever the selected ``_run_<by>`` method returns.

        Raises:
            AssertionError: if ``param`` is not a BlobReference, ``by`` is not
                a known stage, or the ``_run_<by>`` method is missing.
        """
        assert isinstance(param, core.BlobReference)
        stages = utils.EnumClassKeyVals(RegularizationBy)
        assert by in stages.values(), (
            "Regularizer of type {} is called with invalid by={}, "
            "not in {}".format(self.__class__, by, stages.values())
        )
        method_name = "_run_" + by
        assert hasattr(self, method_name), (
            "Regularizer of type {} does not implement function {}".format(
                self.__class__, method_name
            )
        )
        handler = getattr(self, method_name)
        return handler(net, param_init_net, param, grad)

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Default loss-stage hook: adds nothing and returns None."""
        return None

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        """Default post-optimizer hook: adds nothing and returns None."""
        return None

    def _feature_grouping(self, param, net):
        """Add ops computing ``pow(ReduceFrontSum(param * param), 0.5)``.

        Returns the blob produced by the final ``Pow`` op.
        """
        # Possible alternative grouping method via summing over absolute values
        # Compute l2norm over feature weights:
        # pow( sum_i { pow(theta_i, 2) } , 0.5)
        squared = net.Mul([param, param], [net.NextScopedBlob("param_mul")])
        summed = net.ReduceFrontSum(
            [squared], [net.NextScopedBlob("param_reduced")]
        )
        return net.Pow(
            [summed],
            [net.NextScopedBlob("grouped_feature_weight_vec")],
            exponent=0.5,
        )

    def _ensure_clipped(
        self,
        net,
        param,
        grad=None,
        min=None,
        max=None,
        open_range=False,
        left_open=False,
        right_open=False,
    ):
        """Add an in-place ``EnsureClipped`` op on ``param``.

        Args:
            net: net the op is added to.
            param: blob that is both input and output of the op.
            grad: when it is a ``core.GradientSlice`` its indices and values
                are passed to the op as additional inputs.
            min: lower bound, or None for no lower bound.
            max: upper bound, or None for no upper bound.
            open_range: treat both bounds as open.
            left_open: treat the lower bound as open.
            right_open: treat the upper bound as open.
        """
        lower, upper = min, max
        # An open bound is moved inwards by kEpsilon so that the closed clip
        # performed by the op excludes the bound itself.
        if lower is not None and (open_range or left_open):
            lower = lower + self.kEpsilon
        if upper is not None and (open_range or right_open):
            upper = upper - self.kEpsilon

        if isinstance(grad, core.GradientSlice):
            clip_inputs = [param, grad.indices, grad.values]
        else:
            clip_inputs = [param]
        net.EnsureClipped(clip_inputs, [param], min=lower, max=upper)
class L1Norm(Regularizer):
    """Loss-stage regularizer: ``LpNorm`` op with ``p=1`` scaled by ``reg_lambda``."""

    def __init__(self, reg_lambda):
        """
        reg_lambda: non-negative weight applied to the regularization term
        """
        super(L1Norm, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add the LpNorm(p=1) and Scale ops to ``net``; return the scaled blob."""
        penalty = net.NextScopedBlob(param + "_l1_regularization")
        net.LpNorm([param], [penalty], p=1)
        # Scaled in place, so the returned blob already carries the weight.
        net.Scale([penalty], [penalty], scale=self.reg_lambda)
        return penalty
class LpNorm(Regularizer):
    """Loss-stage Lp norm taken over the grouped feature weights of a parameter."""

    def __init__(self, reg_lambda, p_value=0.5):
        """
        reg_lambda: parameter to scale regularization by

        p_value:    determines what type of Lp norm to calculate. If p > 0,
                    we will calculate Lp norm with the formula:
                    pow( sum_i { pow(theta_i, p) } , 1/p)
        """
        super(LpNorm, self).__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        assert p_value > 0, "p_value factor should be greater than 0"
        self.p_value = p_value
        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add the Lp-norm ops to ``net`` and return the scaled output blob."""
        # TODO: the second dim (num of input nodes) of param is after feature preproc,
        # and does not correspond to the original num of dense features.
        # In the future, will want to create a util to reduce the input dim of param to
        # match the num of dense features.

        result = net.NextScopedBlob(param + "_dense_feature_regularization")
        grouped = self._feature_grouping(param, net)

        # Compute Lpnorm:
        # pow( sum_i { pow(theta_i, p) } , 1/p)
        raised = net.Pow(
            [grouped],
            [net.NextScopedBlob("lp_vec_raised")],
            exponent=self.p_value,
        )
        summed = net.ReduceFrontSum(
            [raised], [net.NextScopedBlob("lp_vec_summed")]
        )
        norm = net.Pow(
            [summed],
            [net.NextScopedBlob("lp_vec")],
            exponent=(1 / self.p_value),
        )
        net.Scale([norm], [result], scale=self.reg_lambda)
        return result
class L0ApproxNorm(Regularizer):
    """Loss-stage approximate L0 norm over grouped feature weights, with an optional budget."""

    def __init__(self, reg_lambda, alpha=0.01, budget=0):
        """
        reg_lambda: parameter to scale regularization by

        alpha:      hyper parameter to tune that is only used in the calculation
                    of approximate L0 norm

        budget:     desired number of features. If the number of features is greater
                    than the budget amount, then the least important features will
                    be penalized. If there are fewer features than the desired
                    budget, no penalization will be applied. Optional parameter, if
                    0, then no budget is used
        """
        super(L0ApproxNorm, self).__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        assert alpha > 0, "alpha factor must be a positive value greater than 0"
        assert budget >= 0, "budget factor must be greater than or equal to 0"
        self.reg_lambda = reg_lambda
        self.alpha = alpha
        self.budget = float(budget)  # budget must be float for future calculations

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add the approximate-L0 ops to ``net`` and return the scaled output blob."""
        # TODO: the second dim (num of input nodes) of param is after feature preproc,
        # and does not correspond to the original num of dense features.
        # In the future, will want to create a util to reduce the input dim of param to
        # match the num of dense features.

        result = net.NextScopedBlob(param + "_dense_feature_regularization")
        grouped = self._feature_grouping(param, net)

        # compute approximate L0 norm
        # sum_i ( min ( abs (theta_i), alpha))) / alpha
        magnitudes = net.Abs([grouped], [net.NextScopedBlob("l0_abs")])
        capped = net.Clip(
            [magnitudes], [net.NextScopedBlob("l0_min")], max=self.alpha
        )
        total = net.ReduceFrontSum([capped], [net.NextScopedBlob("l0_summed")])
        penalized = net.Scale(
            [total], [net.NextScopedBlob("l0_norm")], scale=(1 / self.alpha)
        )

        # incorporate budget factor
        # regularization = reg_lambda * max(0, l0_norm - budget)
        if self.budget:
            budget_blob = net.ConstantFill([], "budget", shape=[1], value=self.budget)
            over_budget = net.Sub(
                [penalized, budget_blob], [net.NextScopedBlob("l0_budget")]
            )
            penalized = net.Relu(
                [over_budget], [net.NextScopedBlob("relu_l0_sub_budget")]
            )

        net.Scale([penalized], [result], scale=self.reg_lambda)
        return result
class L1NormTrimmed(Regularizer):
    """
    The Trimmed Lasso: Sparsity and Robustness. https://arxiv.org/abs/1708.04527

    The penalty is
        reg_lambda * (SumElements(|param|) - SumElements(TopK(|param|, k)))
    so the entries selected by the TopK op do not contribute to it.
    """

    def __init__(self, reg_lambda, k):
        """
        reg_lambda: non-negative weight applied to the regularization term

        k:          integer >= 1 passed as ``k`` to the TopK op
        """
        super(L1NormTrimmed, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
        assert isinstance(k, int), "k should be an integer (the k passed to TopK)"
        # The check admits k == 1, so the message says "at least 1".
        assert k >= 1, "k should be at least 1"
        self.reg_lambda = reg_lambda
        self.k = k

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add the trimmed-L1 ops to ``net`` and return the scaled output blob."""
        output_blob = net.NextScopedBlob(param + "_l1_trimmed_regularization")
        # Local is deliberately not called ``abs`` so the builtin stays usable.
        abs_blob = net.Abs([param], [net.NextScopedBlob("abs")])
        sum_abs = net.SumElements(
            [abs_blob], [net.NextScopedBlob("sum_abs")], average=False
        )
        # TopK has three outputs; only the values are needed here.
        topk, _, _ = net.TopK(
            [abs_blob],
            [
                net.NextScopedBlob("topk"),
                net.NextScopedBlob("id"),
                net.NextScopedBlob("flat_id"),
            ],
            k=self.k,
        )
        topk_sum = net.SumElements(
            [topk], [net.NextScopedBlob("topk_sum")], average=False
        )
        net.Sub([sum_abs, topk_sum], [output_blob])
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob
class L2Norm(Regularizer):
    """Loss-stage regularizer: ``LpNorm`` op with ``p=2`` scaled by ``reg_lambda``."""

    def __init__(self, reg_lambda):
        """
        reg_lambda: non-negative weight applied to the regularization term
        """
        super(L2Norm, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add the LpNorm(p=2) and Scale ops to ``net``; return the scaled blob."""
        penalty = net.NextScopedBlob(param + "_l2_regularization")
        net.LpNorm([param], [penalty], p=2)
        # Scaled in place, so the returned blob already carries the weight.
        net.Scale([penalty], [penalty], scale=self.reg_lambda)
        return penalty
class ElasticNet(Regularizer):
    """Loss-stage sum of an ``l1``-weighted LpNorm(p=1) and an ``l2``-weighted LpNorm(p=2)."""

    def __init__(self, l1, l2):
        """
        l1: weight applied to the LpNorm(p=1) term
        l2: weight applied to the LpNorm(p=2) term
        """
        super(ElasticNet, self).__init__()
        self.l1 = l1
        self.l2 = l2

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add both norm terms to ``net`` and return the blob holding their sum."""
        combined = net.NextScopedBlob(param + "_elastic_net_regularization")
        l2_term = net.NextScopedBlob(param + "_l2_blob")
        l1_term = net.NextScopedBlob(param + "_l1_blob")

        net.LpNorm([param], [l2_term], p=2)
        net.LpNorm([param], [l1_term], p=1)
        # Each term is scaled in place by its own weight before being summed.
        net.Scale([l2_term], [l2_term], scale=self.l2)
        net.Scale([l1_term], [l1_term], scale=self.l1)
        net.Add([l1_term, l2_term], [combined])
        return combined
class ElasticNetL1NormTrimmed(Regularizer):
    """Loss-stage sum of an ``l2``-weighted LpNorm(p=2) and an ``l1``-weighted trimmed L1 term.

    The trimmed L1 term is SumElements(|param|) - SumElements(TopK(|param|, k)).
    """

    def __init__(self, l1, l2, k):
        """
        l1: weight applied to the trimmed L1 term
        l2: weight applied to the LpNorm(p=2) term
        k:  ``k`` passed to the TopK op
        """
        super(ElasticNetL1NormTrimmed, self).__init__()
        self.l1 = l1
        self.l2 = l2
        self.k = k

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add both terms to ``net`` and return the blob holding their sum."""
        combined = net.NextScopedBlob(param + "_elastic_net_l1_trimmed_regularization")

        # L2 part, scaled in place.
        l2_term = net.NextScopedBlob(param + "_l2_blob")
        net.LpNorm([param], [l2_term], p=2)
        net.Scale([l2_term], [l2_term], scale=self.l2)

        # Trimmed L1 part: total absolute sum minus the sum of the TopK values.
        l1_term = net.NextScopedBlob(param + "_l1_blob")
        magnitudes = net.Abs([param], [net.NextScopedBlob("abs")])
        total = net.SumElements(
            [magnitudes], [net.NextScopedBlob("sum_abs")], average=False
        )
        top_outputs = [
            net.NextScopedBlob("topk"),
            net.NextScopedBlob("id"),
            net.NextScopedBlob("flat_id"),
        ]
        top_values, _, _ = net.TopK([magnitudes], top_outputs, k=self.k)
        top_total = net.SumElements(
            [top_values], [net.NextScopedBlob("topk_sum")], average=False
        )
        net.Sub([total, top_total], [l1_term])
        net.Scale([l1_term], [l1_term], scale=self.l1)

        net.Add([l1_term, l2_term], [combined])
        return combined
class MaxNorm(Regularizer):
    """Post-optimizer sparse normalization with ``use_max_norm=True``."""

    def __init__(self, norm=1.0, dtype=None):
        """
        norm:  value passed as ``norm`` to the normalize op; must be > 0 when run
        dtype: when equal to 'fp16' the Float16SparseNormalize op is used
        """
        super(MaxNorm, self).__init__()
        self.norm = norm
        self.dtype = dtype

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        """Normalize ``param`` in place at the indices of a sparse gradient.

        Raises:
            AssertionError: if ``self.norm`` is not positive.
            NotImplementedError: if ``grad`` is not a ``core.GradientSlice``.
        """
        assert self.norm > 0, "norm should be bigger than 0."
        if not isinstance(grad, core.GradientSlice):
            raise NotImplementedError("MaxNorm is not supported for dense parameters")

        # Only the operator differs between the fp16 and the default path.
        if self.dtype and self.dtype == 'fp16':
            normalize = net.Float16SparseNormalize
        else:
            normalize = net.SparseNormalize
        normalize(
            [param, grad.indices],
            [param],
            use_max_norm=True,
            norm=self.norm,
        )
class ConstantNorm(Regularizer):
    """Post-optimizer sparse normalization with ``use_max_norm=False``."""

    def __init__(self, norm=1.0):
        """
        norm: value passed as ``norm`` to SparseNormalize; must be > 0 when run
        """
        super(ConstantNorm, self).__init__()
        self.norm = norm

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        """Normalize ``param`` in place at the indices of a sparse gradient.

        Raises:
            AssertionError: if ``self.norm`` is not positive.
            NotImplementedError: if ``grad`` is not a ``core.GradientSlice``.
        """
        assert self.norm > 0, "norm should be bigger than 0."
        if not isinstance(grad, core.GradientSlice):
            raise NotImplementedError(
                "ConstantNorm is not supported for dense parameters"
            )
        net.SparseNormalize(
            [param, grad.indices],
            [param],
            use_max_norm=False,
            norm=self.norm,
        )
class SparseLpNorm(Regularizer):
    """Post-optimizer ``SparseLpRegularizer`` op applied at sparse-gradient indices."""

    def __init__(self, p, reg_lambda):
        """
        p:          norm order; only 1.0 and 2.0 are accepted
        reg_lambda: positive weight passed to the op
        """
        super(SparseLpNorm, self).__init__()
        assert p in (1.0, 2.0), "Sparse Lp regularization only implemented for p = 1.0 and p = 2.0."
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0."
        self.p = p
        self.reg_lambda = reg_lambda

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        """Regularize ``param`` in place at the indices of a sparse gradient.

        Raises:
            NotImplementedError: if ``grad`` is not a ``core.GradientSlice``.
        """
        if not isinstance(grad, core.GradientSlice):
            raise NotImplementedError("SparseLpNorm is not supported for dense parameters")
        net.SparseLpRegularizer(
            [param, grad.indices],
            [param],
            p=self.p,
            reg_lambda=self.reg_lambda,
        )
class SparseL1Norm(SparseLpNorm):
    """``SparseLpNorm`` with ``p`` fixed to 1.0."""

    def __init__(self, reg_lambda):
        super(SparseL1Norm, self).__init__(1.0, reg_lambda)
class SparseL2Norm(SparseLpNorm):
    """``SparseLpNorm`` with ``p`` fixed to 2.0."""

    def __init__(self, reg_lambda):
        super(SparseL2Norm, self).__init__(2.0, reg_lambda)
class LogBarrier(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 19

    On the loss stage this adds ``discount * SumElements(Log(param))`` where
    ``discount`` comes from a LearningRate op with ``base_lr=-reg_lambda``.
    After the optimizer step the parameter is clipped to stay above 0.
    """

    def __init__(self, reg_lambda, discount_policy="inv", discount_options=None):
        """
        discount is a positive weight that is decreasing, and here it is implemented
        similar to the learning rate. It is specified by a learning rate policy and
        corresponding options

        reg_lambda:       strictly positive weight of the barrier term
        discount_policy:  ``policy`` passed to the LearningRate op
        discount_options: extra keyword arguments for the LearningRate op;
                          defaults to {"gamma": 1.0, "power": 1.0}
        """
        super(LogBarrier, self).__init__()
        # The check is strict, so the message must not claim 0 is accepted.
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        self.reg_lambda = reg_lambda
        self.discount_policy = discount_policy
        self.discount_options = discount_options or {"gamma": 1.0, "power": 1.0}

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """Add the log-barrier ops to ``net`` and return the output blob."""
        iteration = utils.BuildUniqueMutexIter(param_init_net, net)
        # Since we are most likely to do a minimization
        discount = net.NextScopedBlob(param + "_log_barrier_discount")
        net.LearningRate(
            [iteration],
            [discount],
            base_lr=-self.reg_lambda,
            policy=self.discount_policy,
            **self.discount_options
        )
        # TODO(xlwang): param might still be negative at the initialization time or
        # slightly negative due to the distributed training. Enforce its non-negativity
        # for now (at least above machine epsilon)
        param_non_neg = net.NextScopedBlob(param + "_non_neg")
        net.Clip([param], [param_non_neg], min=self.kEpsilon)
        param_log = net.NextScopedBlob(param + "_log")
        net.Log([param_non_neg], [param_log])
        param_log_sum = net.NextScopedBlob(param + "_log_sum")
        net.SumElements([param_log], [param_log_sum])
        output_blob = net.NextScopedBlob(param + "_log_barrier")
        net.Mul([param_log_sum, discount], [output_blob], broadcast=1)
        return output_blob

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        """Clip ``param`` to the open range above 0 (i.e. to at least kEpsilon)."""
        self._ensure_clipped(net, param, grad, min=0, open_range=True)
class BoundedGradientProjection(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 16

    After each optimizer step the parameter is clipped into the configured
    interval through ``_ensure_clipped``.
    """

    def __init__(
        self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None
    ):
        """
        lb:         lower bound, or None for no lower bound
        ub:         upper bound, or None for no upper bound
        left_open:  exclude ``lb`` itself from the interval
        right_open: exclude ``ub`` itself from the interval
        epsilon:    positive margin used for open bounds; defaults to the
                    base-class ``kEpsilon``
        """
        super(BoundedGradientProjection, self).__init__()
        lb = float(lb) if lb is not None else None
        ub = float(ub) if ub is not None else None
        epsilon = float(epsilon) if epsilon is not None else self.kEpsilon
        assert epsilon > 0, "Bounded Gradient Projection with invalid eps={eps}".format(
            eps=epsilon
        )
        # With both bounds given, the interval must be non-empty once open
        # ends have been shrunk by epsilon.
        assert (
            (lb is None)
            or (ub is None)
            or (
                lb + (epsilon if left_open else 0.)
                <= ub - (epsilon if right_open else 0.)
            )
        ), (
            # Interval notation lists the lower bound first.
            "Bounded Gradient Projection with invalid "
            "{lp}lb={lb}, ub={ub}{rp}, eps={eps}".format(
                lb=lb,
                ub=ub,
                lp="(" if left_open else "[",
                rp=")" if right_open else "]",
                eps=epsilon,
            )
        )
        self.left_open = left_open
        self.right_open = right_open
        # Overrides the base-class margin so _ensure_clipped uses ``epsilon``.
        self.kEpsilon = epsilon
        self.lb = lb
        self.ub = ub

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        """Clip ``param`` in place into the configured interval."""
        self._ensure_clipped(
            net,
            param,
            grad,
            min=self.lb,
            max=self.ub,
            left_open=self.left_open,
            right_open=self.right_open,
        )
class GroupL1Norm(Regularizer):
    """
    Scardapane, Simone, et al. "Group sparse regularization for deep neural networks."
    Neurocomputing 241 (2017): 81-89.

    This regularizer computes the l1 norm of a weight matrix based on groups.
    There are essentially three stages in the computation:
    1. Compute the l2 norm on all the members of each group
    2. Scale each l2 norm by the size of each group
    3. Compute the l1 norm of the scaled l2 norms
    """

    def __init__(self, reg_lambda, groups, stabilizing_val=0):
        """
        Args:
            reg_lambda: The weight of the regularization term.
            groups: A list of integers describing the size of each group.
                The length of the list is the number of groups.

        Optional Args:
            stabilizing_val: The computation of GroupL1Norm involves the Sqrt
                operator. When values are small, its gradient can be numerically
                unstable and cause gradient explosion. This term is added to
                stabilize the gradient calculation. The recommended value is
                1e-8, but it depends on the specific scenario. If the gradient
                operator of Sqrt already takes stability into consideration,
                this term is not necessary.
        """
        super(GroupL1Norm, self).__init__()
        assert reg_lambda >= 0, "regularization weight should be 0 or positive"
        assert isinstance(groups, list), "groups needs to be a list"

        self.reg_lambda = reg_lambda
        self.groups = groups
        self.stabilizing_val = stabilizing_val

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """
        Args:
            param: The input blob to regularize. It should be a weight matrix
                blob with shape (output_dim, input_dim). input_dim should be
                equal to the sum of self.groups.

        Returns:
            group_l1_norm: The output blob after applying regularization.

        These are the steps of computation:
            1. square all elements
            2. sum over axis 0
            3. lengthssum by group
            4. square_root all elements
            5. normalize each group based on group size
            6. compute l1 norm of each group
            7. scale the result with the regularization lambda
        """
        num_groups = len(self.groups)

        squared = net.Sqr(param)
        column_sums = net.ReduceSum(squared, axes=[0], keepdims=0)
        group_lengths = net.GivenTensorIntFill(
            [], 1, shape=[num_groups], values=self.groups
        )
        lengths_sum = net.LengthsSum([column_sums, group_lengths])

        if self.stabilizing_val:
            # Added in place before the Sqrt (see stabilizing_val in __init__).
            offset = net.ConstantFill([], 1, value=self.stabilizing_val)
            net.Add([lengths_sum, offset], [lengths_sum], broadcast=1)

        group_l2 = net.Sqrt(lengths_sum)

        # Here we combine step 5 and step 7 into one operator call to
        # improve efficiency: values = np.sqrt(self.groups) * self.reg_lambda
        group_weights = net.GivenTensorFill(
            [],
            shape=[num_groups],
            values=np.sqrt(self.groups) * self.reg_lambda
        )
        l2_scaled = net.Mul(
            [group_l2, group_weights], ['normalized_l2_norm_scaled']
        )

        # The output blob name is a runtime string and is kept exactly as is.
        return net.LpNorm(l2_scaled, ['group_l1_nrom'], p=1)
|