regularizer.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. # @package optimizer
  2. # Module caffe2.python.regularizer
  3. from caffe2.python import core, utils
  4. import numpy as np
  5. class RegularizationBy(object):
  6. AFTER_OPTIMIZER = "after_optimizer"
  7. ON_LOSS = "on_loss"
  8. class Regularizer(object):
  9. def __init__(self):
  10. self.kEpsilon = 1e-9
  11. """
  12. Adds regularization to train_net for given parameter. Its factor ahead of
  13. regularization is given when initialization.
  14. The param should be a BlobReference.
  15. """
  16. def __call__(self, net, param_init_net, param, grad=None, by=None):
  17. assert isinstance(param, core.BlobReference)
  18. by_enum = utils.EnumClassKeyVals(RegularizationBy)
  19. assert by in by_enum.values(), (
  20. "Regularizer of type {} is called with invalid by={}, "
  21. "not in {}".format(self.__class__, by, by_enum.values())
  22. )
  23. run_func = "_run_" + by
  24. assert hasattr(
  25. self, run_func
  26. ), "Regularizer of type {} does not implement function {}".format(
  27. self.__class__, run_func
  28. )
  29. return getattr(self, run_func)(net, param_init_net, param, grad)
  30. def _run_on_loss(self, net, param_init_net, param, grad=None):
  31. return None
  32. def _run_after_optimizer(self, net, param_init_net, param, grad):
  33. return None
  34. def _feature_grouping(self, param, net):
  35. # Possible alternative grouping method via summing over absolute values
  36. # Compute l2norm over feature weights
  37. # pow( sum_i { pow(theda_i, 2) } , 0.5)
  38. param_mul = net.Mul([param, param], [net.NextScopedBlob("param_mul")])
  39. param_reduced = net.ReduceFrontSum(
  40. [param_mul], [net.NextScopedBlob("param_reduced")]
  41. )
  42. grouped_feature_weight_vec = net.Pow(
  43. [param_reduced],
  44. [net.NextScopedBlob("grouped_feature_weight_vec")],
  45. exponent=0.5,
  46. )
  47. return grouped_feature_weight_vec
  48. def _ensure_clipped(
  49. self,
  50. net,
  51. param,
  52. grad=None,
  53. min=None,
  54. max=None,
  55. open_range=False,
  56. left_open=False,
  57. right_open=False,
  58. ):
  59. min = (
  60. min + self.kEpsilon
  61. if min is not None and (open_range or left_open)
  62. else min
  63. )
  64. max = (
  65. max - self.kEpsilon
  66. if max is not None and (open_range or right_open)
  67. else max
  68. )
  69. input_blobs = (
  70. [param, grad.indices, grad.values]
  71. if isinstance(grad, core.GradientSlice)
  72. else [param]
  73. )
  74. net.EnsureClipped(input_blobs, [param], min=min, max=max)
  75. class L1Norm(Regularizer):
  76. def __init__(self, reg_lambda):
  77. super(L1Norm, self).__init__()
  78. assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
  79. self.reg_lambda = reg_lambda
  80. def _run_on_loss(self, net, param_init_net, param, grad=None):
  81. output_blob = net.NextScopedBlob(param + "_l1_regularization")
  82. net.LpNorm([param], [output_blob], p=1)
  83. net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
  84. return output_blob
  85. class LpNorm(Regularizer):
  86. def __init__(self, reg_lambda, p_value=0.5):
  87. """
  88. reg_lambda: parameter to scale regularization by
  89. p_value: determines what type of Lp norm to calculate. If p > 0,
  90. we will calculate Lp norm with the formula:
  91. pow( sum_i { pow(theda_i, p) } , 1/p)
  92. """
  93. super(LpNorm, self).__init__()
  94. assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
  95. assert p_value > 0, "p_value factor should be greater than 0"
  96. self.p_value = p_value
  97. self.reg_lambda = reg_lambda
  98. def _run_on_loss(self, net, param_init_net, param, grad=None):
  99. # TODO: the second dim (num of input nodes) of param is after feature preproc,
  100. # and does not correspond to the original num of dense features.
  101. # In the future, will want to create a util to reduce the input dim of param to
  102. # match the num of dense features.
  103. output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
  104. grouped_feature_weight_vec = self._feature_grouping(param, net)
  105. # Compute Lpnorm:
  106. # pow( sum_i { pow(theda_i, p) } , 1/p)
  107. lp_vec_raised = net.Pow(
  108. [grouped_feature_weight_vec],
  109. [net.NextScopedBlob("lp_vec_raised")],
  110. exponent=self.p_value,
  111. )
  112. lp_vec_summed = net.ReduceFrontSum(
  113. [lp_vec_raised], [net.NextScopedBlob("lp_vec_summed")]
  114. )
  115. lp_norm = net.Pow(
  116. [lp_vec_summed],
  117. [net.NextScopedBlob("lp_vec")],
  118. exponent=(1 / self.p_value),
  119. )
  120. net.Scale([lp_norm], [output_blob], scale=self.reg_lambda)
  121. return output_blob
  122. class L0ApproxNorm(Regularizer):
  123. def __init__(self, reg_lambda, alpha=0.01, budget=0):
  124. """
  125. reg_lambda: parameter to scale regularization by
  126. alpha: hyper parameter to tune that is only used in the calculation
  127. of approximate L0 norm
  128. budget: desired number of features. If the number of features is greater
  129. than the budget amount, then the least important features will
  130. be penalized. If there are fewer features than the desired
  131. budget, no penalization will be applied. Optional parameter, if
  132. 0, then no budget is used
  133. """
  134. super(L0ApproxNorm, self).__init__()
  135. assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
  136. assert alpha > 0, "alpha factor must be a positive value greater than 0"
  137. assert budget >= 0, "budget factor must be greater than or equal to 0"
  138. self.reg_lambda = reg_lambda
  139. self.alpha = alpha
  140. self.budget = float(budget) # budget must be float for future calculations
  141. def _run_on_loss(self, net, param_init_net, param, grad=None):
  142. # TODO: the second dim (num of input nodes) of param is after feature preproc,
  143. # and does not correspond to the original num of dense features.
  144. # In the future, will want to create a util to reduce the input dim of param to
  145. # match the num of dense features.
  146. output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
  147. grouped_feature_weight_vec = self._feature_grouping(param, net)
  148. # compute approximate L0 norm
  149. # sum_i ( min ( abs (theta_i), alpha))) / alpha
  150. l0_abs = net.Abs([grouped_feature_weight_vec], [net.NextScopedBlob("l0_abs")])
  151. l0_min = net.Clip([l0_abs], [net.NextScopedBlob("l0_min")], max=self.alpha)
  152. l0_summed = net.ReduceFrontSum([l0_min], [net.NextScopedBlob("l0_summed")])
  153. l0_norm = net.Scale(
  154. [l0_summed], [net.NextScopedBlob("l0_norm")], scale=(1 / self.alpha)
  155. )
  156. # incorporate budget factor
  157. # regularization = reg_lambda * max(0, l0_norm - budget)
  158. if self.budget:
  159. budget_blob = net.ConstantFill([], "budget", shape=[1], value=self.budget)
  160. l0_sub_budget = net.Sub(
  161. [l0_norm, budget_blob], [net.NextScopedBlob("l0_budget")]
  162. )
  163. relu_l0_sub_budget = net.Relu(
  164. [l0_sub_budget], [net.NextScopedBlob("relu_l0_sub_budget")]
  165. )
  166. net.Scale([relu_l0_sub_budget], [output_blob], scale=self.reg_lambda)
  167. else:
  168. net.Scale([l0_norm], [output_blob], scale=self.reg_lambda)
  169. return output_blob
  170. class L1NormTrimmed(Regularizer):
  171. """
  172. The Trimmed Lasso: Sparsity and Robustness. https://arxiv.org/abs/1708.04527
  173. """
  174. def __init__(self, reg_lambda, k):
  175. super(L1NormTrimmed, self).__init__()
  176. assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
  177. assert isinstance(k, int), "k should be an interger as expected #. after selection"
  178. assert k >= 1, "k should be larger than 1"
  179. self.reg_lambda = reg_lambda
  180. self.k = k
  181. def _run_on_loss(self, net, param_init_net, param, grad=None):
  182. output_blob = net.NextScopedBlob(param + "_l1_trimmed_regularization")
  183. abs = net.Abs([param], [net.NextScopedBlob("abs")])
  184. sum_abs = net.SumElements([abs], [net.NextScopedBlob("sum_abs")], average=False)
  185. topk, _, _ = net.TopK([abs], [net.NextScopedBlob("topk"), net.NextScopedBlob("id"), net.NextScopedBlob("flat_id")], k=self.k)
  186. topk_sum = net.SumElements([topk], [net.NextScopedBlob("topk_sum")], average=False)
  187. net.Sub([sum_abs, topk_sum], [output_blob])
  188. net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
  189. return output_blob
  190. class L2Norm(Regularizer):
  191. def __init__(self, reg_lambda):
  192. super(L2Norm, self).__init__()
  193. assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
  194. self.reg_lambda = reg_lambda
  195. def _run_on_loss(self, net, param_init_net, param, grad=None):
  196. output_blob = net.NextScopedBlob(param + "_l2_regularization")
  197. net.LpNorm([param], [output_blob], p=2)
  198. net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
  199. return output_blob
  200. class ElasticNet(Regularizer):
  201. def __init__(self, l1, l2):
  202. super(ElasticNet, self).__init__()
  203. self.l1 = l1
  204. self.l2 = l2
  205. def _run_on_loss(self, net, param_init_net, param, grad=None):
  206. output_blob = net.NextScopedBlob(param + "_elastic_net_regularization")
  207. l2_blob = net.NextScopedBlob(param + "_l2_blob")
  208. l1_blob = net.NextScopedBlob(param + "_l1_blob")
  209. net.LpNorm([param], [l2_blob], p=2)
  210. net.LpNorm([param], [l1_blob], p=1)
  211. net.Scale([l2_blob], [l2_blob], scale=self.l2)
  212. net.Scale([l1_blob], [l1_blob], scale=self.l1)
  213. net.Add([l1_blob, l2_blob], [output_blob])
  214. return output_blob
  215. class ElasticNetL1NormTrimmed(Regularizer):
  216. def __init__(self, l1, l2, k):
  217. super(ElasticNetL1NormTrimmed, self).__init__()
  218. self.l1 = l1
  219. self.l2 = l2
  220. self.k = k
  221. def _run_on_loss(self, net, param_init_net, param, grad=None):
  222. output_blob = net.NextScopedBlob(param + "_elastic_net_l1_trimmed_regularization")
  223. l2_blob = net.NextScopedBlob(param + "_l2_blob")
  224. net.LpNorm([param], [l2_blob], p=2)
  225. net.Scale([l2_blob], [l2_blob], scale=self.l2)
  226. l1_blob = net.NextScopedBlob(param + "_l1_blob")
  227. abs = net.Abs([param], [net.NextScopedBlob("abs")])
  228. sum_abs = net.SumElements([abs], [net.NextScopedBlob("sum_abs")], average=False)
  229. topk, _, _ = net.TopK([abs], [net.NextScopedBlob("topk"), net.NextScopedBlob("id"), net.NextScopedBlob("flat_id")], k=self.k)
  230. topk_sum = net.SumElements([topk], [net.NextScopedBlob("topk_sum")], average=False)
  231. net.Sub([sum_abs, topk_sum], [l1_blob])
  232. net.Scale([l1_blob], [l1_blob], scale=self.l1)
  233. net.Add([l1_blob, l2_blob], [output_blob])
  234. return output_blob
  235. class MaxNorm(Regularizer):
  236. def __init__(self, norm=1.0, dtype=None):
  237. super(MaxNorm, self).__init__()
  238. self.norm = norm
  239. self.dtype = dtype
  240. def _run_after_optimizer(self, net, param_init_net, param, grad):
  241. assert self.norm > 0, "norm should be bigger than 0."
  242. if isinstance(grad, core.GradientSlice):
  243. if self.dtype and self.dtype == 'fp16':
  244. net.Float16SparseNormalize(
  245. [param, grad.indices],
  246. [param],
  247. use_max_norm=True,
  248. norm=self.norm,
  249. )
  250. else:
  251. net.SparseNormalize(
  252. [param, grad.indices],
  253. [param],
  254. use_max_norm=True,
  255. norm=self.norm,
  256. )
  257. else:
  258. raise NotImplementedError("MaxNorm is not supported for dense parameters")
  259. class ConstantNorm(Regularizer):
  260. def __init__(self, norm=1.0):
  261. super(ConstantNorm, self).__init__()
  262. self.norm = norm
  263. def _run_after_optimizer(self, net, param_init_net, param, grad):
  264. assert self.norm > 0, "norm should be bigger than 0."
  265. if isinstance(grad, core.GradientSlice):
  266. net.SparseNormalize(
  267. [param, grad.indices],
  268. [param],
  269. use_max_norm=False,
  270. norm=self.norm,
  271. )
  272. else:
  273. raise NotImplementedError(
  274. "ConstantNorm is not supported for dense parameters"
  275. )
  276. class SparseLpNorm(Regularizer):
  277. def __init__(self, p, reg_lambda):
  278. super(SparseLpNorm, self).__init__()
  279. assert p in (1.0, 2.0), "Sparse Lp regularization only implemented for p = 1.0 and p = 2.0."
  280. assert reg_lambda > 0, "factor ahead of regularization should be greater than 0."
  281. self.p = p
  282. self.reg_lambda = reg_lambda
  283. def _run_after_optimizer(self, net, param_init_net, param, grad):
  284. if isinstance(grad, core.GradientSlice):
  285. net.SparseLpRegularizer(
  286. [param, grad.indices],
  287. [param],
  288. p=self.p,
  289. reg_lambda=self.reg_lambda,
  290. )
  291. else:
  292. raise NotImplementedError("SparseLpNorm is not supported for dense parameters")
  293. class SparseL1Norm(SparseLpNorm):
  294. def __init__(self, reg_lambda):
  295. super(SparseL1Norm, self).__init__(p=1.0, reg_lambda=reg_lambda)
  296. class SparseL2Norm(SparseLpNorm):
  297. def __init__(self, reg_lambda):
  298. super(SparseL2Norm, self).__init__(p=2.0, reg_lambda=reg_lambda)
  299. class LogBarrier(Regularizer):
  300. """
  301. Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
  302. 35(67-68), 7. Chapter 19
  303. """
  304. def __init__(self, reg_lambda, discount_policy="inv", discount_options=None):
  305. """
  306. discount is a positive weight that is decreasing, and here it is implemented
  307. similar to the learning rate. It is specified by a learning rate policy and
  308. corresponding options
  309. """
  310. super(LogBarrier, self).__init__()
  311. assert reg_lambda > 0, "factor ahead of regularization should be 0 or positive"
  312. self.reg_lambda = reg_lambda
  313. self.discount_policy = discount_policy
  314. self.discount_options = discount_options or {"gamma": 1.0, "power": 1.0}
  315. def _run_on_loss(self, net, param_init_net, param, grad=None):
  316. iteration = utils.BuildUniqueMutexIter(param_init_net, net)
  317. # Since we are most likely to do a minimization
  318. discount = net.NextScopedBlob(param + "_log_barrier_discount")
  319. net.LearningRate(
  320. [iteration],
  321. [discount],
  322. base_lr=-self.reg_lambda,
  323. policy=self.discount_policy,
  324. **self.discount_options
  325. )
  326. # TODO(xlwang): param might still be negative at the initialization time or
  327. # slightly negative due to the distributed training. Enforce it's non-negativity
  328. # for now (at least above machine epsilon)
  329. param_non_neg = net.NextScopedBlob(param + "_non_neg")
  330. net.Clip([param], [param_non_neg], min=self.kEpsilon)
  331. param_log = net.NextScopedBlob(param + "_log")
  332. net.Log([param_non_neg], [param_log])
  333. param_log_sum = net.NextScopedBlob(param + "_log_sum")
  334. net.SumElements([param_log], [param_log_sum])
  335. output_blob = net.NextScopedBlob(param + "_log_barrier")
  336. net.Mul([param_log_sum, discount], [output_blob], broadcast=1)
  337. return output_blob
  338. def _run_after_optimizer(self, net, param_init_net, param, grad):
  339. self._ensure_clipped(net, param, grad, min=0, open_range=True)
  340. class BoundedGradientProjection(Regularizer):
  341. """
  342. Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
  343. 35(67-68), 7. Chapter 16
  344. """
  345. def __init__(
  346. self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None
  347. ):
  348. super(BoundedGradientProjection, self).__init__()
  349. lb = float(lb) if lb is not None else None
  350. ub = float(ub) if ub is not None else None
  351. epsilon = float(epsilon) if epsilon is not None else self.kEpsilon
  352. assert epsilon > 0, "Bounded Gradient Projection with invalid eps={eps}".format(
  353. eps=epsilon
  354. )
  355. assert (
  356. (lb is None)
  357. or (ub is None)
  358. or (
  359. lb + (epsilon if left_open else 0.)
  360. <= ub - (epsilon if right_open else 0.)
  361. )
  362. ), (
  363. "Bounded Gradient Projection with invalid "
  364. "{lp}ub={ub}, lb={lb}{rp}, eps={eps}".format(
  365. lb=lb,
  366. ub=ub,
  367. lp="(" if left_open else "[",
  368. rp=")" if right_open else "]",
  369. eps=epsilon,
  370. )
  371. )
  372. self.left_open = left_open
  373. self.right_open = right_open
  374. self.kEpsilon = epsilon
  375. self.lb = lb
  376. self.ub = ub
  377. def _run_after_optimizer(self, net, param_init_net, param, grad):
  378. self._ensure_clipped(
  379. net,
  380. param,
  381. grad,
  382. min=self.lb,
  383. max=self.ub,
  384. left_open=self.left_open,
  385. right_open=self.right_open,
  386. )
  387. class GroupL1Norm(Regularizer):
  388. """
  389. Scardapane, Simone, et al. "Group sparse regularization for deep neural networks."
  390. Neurocomputing 241 (2017): 81-89.
  391. This regularizer computes l1 norm of a weight matrix based on groups.
  392. There are essentially three stages in the computation:
  393. 1. Compute the l2 norm on all the members of each group
  394. 2. Scale each l2 norm by the size of each group
  395. 3. Compute the l1 norm of the scaled l2 norms
  396. """
  397. def __init__(self, reg_lambda, groups, stabilizing_val=0):
  398. """
  399. Args:
  400. reg_lambda: The weight of the regularization term.
  401. groups: A list of integers describing the size of each group.
  402. The length of the list is the number of groups.
  403. Optional Args:
  404. stabilizing_val: The computation of GroupL1Norm involves the Sqrt
  405. operator. When values are small, its gradient can be numerically
  406. unstable and causing gradient explosion. Adding this term to
  407. stabilize gradient calculation. Recommended value of this term is
  408. 1e-8, but it depends on the specific scenarios. If the implementation
  409. of the gradient operator of Sqrt has taken into stability into
  410. consideration, this term won't be necessary.
  411. """
  412. super(GroupL1Norm, self).__init__()
  413. assert (
  414. (reg_lambda) >= 0
  415. ), "regularization weight should be 0 or positive"
  416. assert isinstance(groups, list), "groups needs to be a list"
  417. self.reg_lambda = (reg_lambda)
  418. self.groups = groups
  419. self.stabilizing_val = stabilizing_val
  420. def _run_on_loss(self, net, param_init_net, param, grad=None):
  421. """
  422. Args:
  423. param: The input blob to regularize. It should be a weight matrix
  424. blob with shape (output_dim, input_dim). input_dim should be
  425. equal to the sum of self.groups.
  426. Returns:
  427. group_l1_norm: The output blob after applying regularization.
  428. These are the steps of computation:
  429. 1. square all elements
  430. 2. sum by row
  431. 3. lengthssum by group
  432. 4. square_root all elements
  433. 5. normalize each group based on group size
  434. 6. compute l1 norm of each group
  435. 7. scale the result with the regularization lambda
  436. """
  437. squared = net.Sqr(param)
  438. reduced_sum = net.ReduceSum(squared, axes=[0], keepdims=0)
  439. lengths_sum = net.LengthsSum(
  440. [
  441. reduced_sum,
  442. net.GivenTensorIntFill(
  443. [], 1, shape=[len(self.groups)], values=self.groups
  444. ),
  445. ]
  446. )
  447. if self.stabilizing_val:
  448. net.Add(
  449. [lengths_sum, net.ConstantFill([], 1, value=self.stabilizing_val)],
  450. [lengths_sum],
  451. broadcast=1,
  452. )
  453. sqrt = net.Sqrt(lengths_sum)
  454. # Here we combine step 5 and step 7 into one operator call to
  455. # improve efficiency: values = np.sqrt(self.groups) * self.reg_lambda
  456. l2_scaled = net.Mul(
  457. [
  458. sqrt,
  459. net.GivenTensorFill(
  460. [],
  461. shape=[len(self.groups)],
  462. values=np.sqrt(self.groups) * self.reg_lambda
  463. )
  464. ],
  465. ['normalized_l2_norm_scaled']
  466. )
  467. group_l1_norm = net.LpNorm(l2_scaled, ['group_l1_nrom'], p=1)
  468. return group_l1_norm