gradient_clipping.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. from caffe2.python import core
  2. from caffe2.proto import caffe2_pb2
  3. from caffe2.python.optimizer import get_param_device
  4. from caffe2.python.modeling.net_modifier import NetModifier
  5. import logging
  6. logger = logging.getLogger(__name__)
  7. class GradientClipping(NetModifier):
  8. L1_NORM = 'l1_norm'
  9. L2_NORM = 'l2_norm'
  10. BY_NORM = 'by_norm'
  11. BY_VALUE = 'by_value'
  12. GRAD_CLIP_METHODS = [BY_NORM, BY_VALUE]
  13. CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]
  14. def __init__(self, grad_clip_method, clip_norm_type='l2_norm',
  15. clip_threshold=0.1, use_parameter_norm=False,
  16. compute_norm_ratio=False, clip_max=1, clip_min=-1,
  17. blobs_to_include=None, blobs_to_exclude=None):
  18. """
  19. Clips gradient to avoid gradient magnitude explosion or vanishing gradient.
  20. Args:
  21. grad_clip_method: ways to clip the gradients
  22. clip_norm_type: type of norm used in the necessary computation
  23. clip_threshold: threshold used to determine whether to clip
  24. use_parameter_norm: a boolean to indicate whether to incorporate
  25. the norm of the parameter
  26. compute_norm_ratio: a boolean to compute the ratio between gradient norm
  27. and parameter norm explicitly for debugging purpose
  28. clip_max: when clipping by_value, any value that is greater than
  29. clip_max will be clipped to clip_max
  30. clip_min: when clipping by_value, any value that is smaller than
  31. clip_min will be clipped to clip_min
  32. blobs_to_include: names of blobs whose gradient is to be clipped. If it is set
  33. to none, all param 's gradient in grad_map will be clipped.
  34. blobs_to_exclude: names of blobs whose gradient is not to be clipped.
  35. """
  36. assert grad_clip_method in self.GRAD_CLIP_METHODS, (
  37. "This method of clipping, {}, has not been implemented.".format(
  38. clip_norm_type))
  39. if clip_norm_type is not None:
  40. assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
  41. "This method of clipping, {}, has not been implemented.".format(
  42. clip_norm_type))
  43. self.grad_clip_method = grad_clip_method
  44. self.clip_norm_type = clip_norm_type
  45. self.clip_threshold = float(clip_threshold)
  46. self.use_parameter_norm = use_parameter_norm
  47. self.compute_norm_ratio = compute_norm_ratio
  48. self.clip_max = float(clip_max)
  49. self.clip_min = float(clip_min)
  50. self.blobs_to_include = blobs_to_include
  51. self.blobs_to_exclude = blobs_to_exclude
  52. def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
  53. modify_output_record=False):
  54. assert grad_map is not None
  55. CPU = core.DeviceOption(caffe2_pb2.CPU)
  56. final_param_map = {}
  57. if self.blobs_to_include is None:
  58. final_param_map = grad_map
  59. else:
  60. for blob in self.blobs_to_include:
  61. param = core.BlobReference(blob)
  62. if not net.BlobIsDefined(param):
  63. raise Exception('param {0} is not defined in net {1}'.format(
  64. param, net.Name()))
  65. final_param_map[param] = grad_map[param]
  66. if self.blobs_to_exclude is not None:
  67. for blob in self.blobs_to_exclude:
  68. final_param_map.pop(blob, None)
  69. for param, grad in final_param_map.items():
  70. # currently sparse gradients won't be clipped
  71. # further implementation is needed to enable it
  72. if isinstance(grad, core.GradientSlice):
  73. continue
  74. device = get_param_device(
  75. param,
  76. grad_map[str(param)],
  77. param_to_device=blob_to_device,
  78. default_device=CPU,
  79. )
  80. with core.DeviceScope(device):
  81. if self.grad_clip_method == self.BY_NORM:
  82. if self.clip_norm_type == self.L2_NORM:
  83. p = 2
  84. elif self.clip_norm_type == self.L1_NORM:
  85. p = 1
  86. grad_norm = net.LpNorm(
  87. [grad],
  88. net.NextScopedBlob(prefix=str(grad) + '_l{}_norm'.format(p)),
  89. p=p,
  90. )
  91. if p == 2:
  92. grad_norm = net.Pow([grad_norm], exponent=0.5)
  93. op_inputs = [grad, grad_norm]
  94. if self.use_parameter_norm:
  95. param_norm = net.LpNorm(
  96. [param],
  97. net.NextScopedBlob(
  98. prefix=str(param) + '_l{}_norm'.format(p)),
  99. p=p,
  100. )
  101. if p == 2:
  102. param_norm = net.Pow([param_norm], exponent=0.5)
  103. op_inputs.append(param_norm)
  104. if self.compute_norm_ratio:
  105. net.Div(
  106. [grad_norm, param_norm],
  107. [net.NextScopedBlob(
  108. prefix=str(param) + "_norm_ratio")]
  109. )
  110. net.ClipTensorByScaling(
  111. op_inputs,
  112. [grad],
  113. threshold=self.clip_threshold,
  114. )
  115. elif self.grad_clip_method == self.BY_VALUE:
  116. net.Clip(
  117. [grad],
  118. [grad],
  119. max=self.clip_max,
  120. min=self.clip_min,
  121. )