| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- from caffe2.python import core
- from caffe2.proto import caffe2_pb2
- from caffe2.python.optimizer import get_param_device
- from caffe2.python.modeling.net_modifier import NetModifier
- import logging
- logger = logging.getLogger(__name__)
class GradientClipping(NetModifier):
    """Net modifier that appends gradient-clipping operators to a net.

    Dense gradients are clipped in place (the clipping operator writes back
    to the gradient blob), either through ``ClipTensorByScaling`` driven by
    the gradient norm (``by_norm``) or through ``Clip`` with fixed bounds
    (``by_value``). Sparse gradients (``core.GradientSlice``) are skipped.
    """

    L1_NORM = 'l1_norm'
    L2_NORM = 'l2_norm'

    BY_NORM = 'by_norm'
    BY_VALUE = 'by_value'

    GRAD_CLIP_METHODS = [BY_NORM, BY_VALUE]
    CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]

    # Maps a supported norm type to the ``p`` argument passed to LpNorm.
    _NORM_ORDERS = {L2_NORM: 2, L1_NORM: 1}

    def __init__(self, grad_clip_method, clip_norm_type='l2_norm',
                 clip_threshold=0.1, use_parameter_norm=False,
                 compute_norm_ratio=False, clip_max=1, clip_min=-1,
                 blobs_to_include=None, blobs_to_exclude=None):
        """
        Clips gradient to avoid gradient magnitude explosion or vanishing gradient.

        Args:
        grad_clip_method: ways to clip the gradients; one of
            GRAD_CLIP_METHODS ('by_norm' or 'by_value')
        clip_norm_type: type of norm used in the necessary computation; one
            of CLIP_GRADIENT_NORM_TYPES, or None (only usable with
            'by_value', since 'by_norm' needs a norm type)
        clip_threshold: threshold used to determine whether to clip
        use_parameter_norm: a boolean to indicate whether to incorporate
            the norm of the parameter
        compute_norm_ratio: a boolean to compute the ratio between gradient norm
            and parameter norm explicitly for debugging purpose; only takes
            effect when use_parameter_norm is also set
        clip_max: when clipping by_value, any value that is greater than
            clip_max will be clipped to clip_max
        clip_min: when clipping by_value, any value that is smaller than
            clip_min will be clipped to clip_min
        blobs_to_include: names of blobs whose gradient is to be clipped. If it is set
            to None, all params' gradients in grad_map will be clipped.
        blobs_to_exclude: names of blobs whose gradient is not to be clipped.

        Raises:
        AssertionError: if grad_clip_method or a non-None clip_norm_type is
            not one of the supported values.
        """
        # Report the offending *method* here (the original message
        # mistakenly printed clip_norm_type).
        assert grad_clip_method in self.GRAD_CLIP_METHODS, (
            "This method of clipping, {}, has not been implemented.".format(
                grad_clip_method))
        if clip_norm_type is not None:
            assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
                "This method of clipping, {}, has not been implemented.".format(
                    clip_norm_type))

        self.grad_clip_method = grad_clip_method
        self.clip_norm_type = clip_norm_type
        self.clip_threshold = float(clip_threshold)
        self.use_parameter_norm = use_parameter_norm
        self.compute_norm_ratio = compute_norm_ratio
        self.clip_max = float(clip_max)
        self.clip_min = float(clip_min)
        self.blobs_to_include = blobs_to_include
        self.blobs_to_exclude = blobs_to_exclude

    @staticmethod
    def _lp_norm(net, blob, p):
        """Add ops to `net` computing the l-p norm of `blob`; return the result blob."""
        norm = net.LpNorm(
            [blob],
            net.NextScopedBlob(prefix=str(blob) + '_l{}_norm'.format(p)),
            p=p,
        )
        if p == 2:
            # NOTE(review): presumably LpNorm with p=2 yields the *squared*
            # l2 norm, hence the square root -- confirm against operator docs.
            norm = net.Pow([norm], exponent=0.5)
        return norm

    def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
                   modify_output_record=False):
        """
        Appends clipping operators to `net` for the selected dense gradients.

        Args:
        net: the net to which the clipping operators are added
        init_net: not used by this modifier
        grad_map: mapping from param to its gradient; required. It is not
            modified by this method.
        blob_to_device: forwarded to get_param_device as param_to_device
        modify_output_record: not used by this modifier

        Raises:
        AssertionError: if grad_map is None.
        Exception: if a blob listed in blobs_to_include is not defined in net.
        ValueError: if clipping by_norm is requested for a dense gradient but
            clip_norm_type is not a supported norm type (e.g. None).
        """
        assert grad_map is not None

        CPU = core.DeviceOption(caffe2_pb2.CPU)

        if self.blobs_to_include is None:
            # Work on a shallow copy: the exclusions below must not remove
            # entries from the caller's grad_map.
            final_param_map = dict(grad_map)
        else:
            final_param_map = {}
            for blob in self.blobs_to_include:
                param = core.BlobReference(blob)
                if not net.BlobIsDefined(param):
                    raise Exception('param {0} is not defined in net {1}'.format(
                        param, net.Name()))
                final_param_map[param] = grad_map[param]

        if self.blobs_to_exclude is not None:
            for blob in self.blobs_to_exclude:
                final_param_map.pop(blob, None)

        # Loop-invariant; stays None when no usable norm type is configured.
        p = self._NORM_ORDERS.get(self.clip_norm_type)

        for param, grad in final_param_map.items():
            # currently sparse gradients won't be clipped
            # further implementation is needed to enable it
            if isinstance(grad, core.GradientSlice):
                continue

            device = get_param_device(
                param,
                grad_map[str(param)],
                param_to_device=blob_to_device,
                default_device=CPU,
            )

            with core.DeviceScope(device):
                if self.grad_clip_method == self.BY_NORM:
                    if p is None:
                        # Previously surfaced as an UnboundLocalError on `p`.
                        raise ValueError(
                            "Clipping by norm requires clip_norm_type to be "
                            "one of {}, got {}.".format(
                                self.CLIP_GRADIENT_NORM_TYPES,
                                self.clip_norm_type))

                    grad_norm = self._lp_norm(net, grad, p)
                    op_inputs = [grad, grad_norm]

                    if self.use_parameter_norm:
                        param_norm = self._lp_norm(net, param, p)
                        op_inputs.append(param_norm)

                        if self.compute_norm_ratio:
                            # Debug-only output: gradient norm / parameter norm.
                            net.Div(
                                [grad_norm, param_norm],
                                [net.NextScopedBlob(
                                    prefix=str(param) + "_norm_ratio")]
                            )

                    # Output blob is the gradient itself, i.e. clip in place.
                    net.ClipTensorByScaling(
                        op_inputs,
                        [grad],
                        threshold=self.clip_threshold,
                    )
                elif self.grad_clip_method == self.BY_VALUE:
                    # Output blob is the gradient itself, i.e. clip in place.
                    net.Clip(
                        [grad],
                        [grad],
                        max=self.clip_max,
                        min=self.clip_min,
                    )
|