gradient_checker.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. ## @package gradient_checker
  2. # Module caffe2.python.gradient_checker
  3. import os
  4. import numpy as np
  5. from caffe2.python import core, workspace, net_drawer
  6. from caffe2.proto import caffe2_pb2
  def getGradientForOp(op):
      """Look up the registered gradient of ``op`` in the gradient registry.

      Every output blob ``name`` of ``op`` is paired with a gradient blob named
      ``name + '_grad'``. The operator and that list of names are passed to
      ``core.GradientRegistry.GetGradientForOp`` and its result is returned
      unchanged (``CheckSimple`` unpacks it as ``grad_ops, g_input``).
      """
      output_grad_names = [blob_name + '_grad' for blob_name in op.output]
      return core.GradientRegistry.GetGradientForOp(op, output_grad_names)
  def _get_grad_blob(grad_map, input_to_check):
      """Return the workspace value of the gradient of ``input_to_check``.

      Args:
          grad_map: mapping from an input blob to its gradient, either a
              ``core.BlobReference`` or a ``core.GradientSlice``.
          input_to_check: key into ``grad_map``; also passed as the third
              input of the ``SparseToDense`` operator in the slice case.

      Returns:
          ``workspace.blobs[...]`` of the gradient blob itself, or of the
          temporary blob ``'tmp_dense_grad'`` when the gradient is a slice.
      """
      grad_ref = grad_map[input_to_check]
      if not isinstance(grad_ref, core.BlobReference):
          # Anything that is not a single blob has to be a gradient slice.
          # The estimated gradient it gets compared with is dense, so the
          # slice is converted to a dense blob first.
          assert isinstance(grad_ref, core.GradientSlice)
          dense_name = 'tmp_dense_grad'
          workspace.RunOperatorOnce(
              core.CreateOperator(
                  'SparseToDense',
                  [grad_ref.indices, grad_ref.values, input_to_check],
                  dense_name,
              )
          )
          return workspace.blobs[dense_name]
      return workspace.blobs[grad_ref]
  def _get_grad(net, outputs, outputs_with_grad, input_values, inputs_with_grads):
      """Run a gradient-augmented copy of ``net`` and collect its results.

      Args:
          net: the net to clone; the clone is named ``net.Name() + "_copy"``.
          outputs: blobs whose forward values are read back after the run.
          outputs_with_grad: passed to ``AddGradientOperators`` on the clone.
          input_values: optional mapping of blob name to value, written into
              ``workspace.blobs`` before the run; may be ``None`` or empty.
          inputs_with_grads: inputs whose gradients are collected. Each must
              be present in the gradient map and in the workspace.

      Returns:
          A tuple ``(forward_results, grads, grad_net)``: a list of
          ``(output, value)`` pairs, a dict from each checked input to its
          gradient value, and the cloned net that was run.
      """
      net_copy = net.Clone(net.Name() + "_copy")
      gradient_map = net_copy.AddGradientOperators(outputs_with_grad)

      if input_values:
          for blob_name, blob_value in input_values.items():
              workspace.blobs[blob_name] = blob_value

      # Validate every requested input before spending time on the run.
      for checked_input in inputs_with_grads:
          assert checked_input in gradient_map, (
              '{} has no gradient, cannot check net gradient.'.format(
                  checked_input))
          assert str(checked_input) in workspace.blobs

      workspace.RunNetOnce(net_copy)

      forward_results = []
      for output_blob in outputs:
          forward_results.append((output_blob, workspace.blobs[output_blob]))

      gradients = {}
      for checked_input in inputs_with_grads:
          gradients[checked_input] = _get_grad_blob(gradient_map, checked_input)

      return forward_results, gradients, net_copy
  def _assert_close(value1, value2, threshold, err_msg=''):
      """Assert two array-likes are close and report how far apart they are.

      Args:
          value1: first array-like.
          value2: second array-like, compared element-wise with ``value1``.
          threshold: used as both the absolute and the relative tolerance of
              ``np.testing.assert_allclose``.
          err_msg: extra text included in the assertion message on failure.

      Returns:
          A tuple ``(mean_abs_delta, max_abs_delta)`` over all elements. For
          empty inputs both are ``0.0``.

      Raises:
          AssertionError: if the values are not close within ``threshold``.
      """
      np.testing.assert_allclose(
          value1, value2,
          atol=threshold, rtol=threshold,
          err_msg=err_msg,
      )
      # Convert explicitly so plain lists/tuples (which assert_allclose
      # accepts) can also be subtracted.
      delta = np.abs(np.asarray(value1) - np.asarray(value2)).ravel()
      if delta.size == 0:
          # Nothing to compare: the built-in max() would raise on an empty
          # sequence and np.mean would yield NaN with a warning.
          return 0.0, 0.0
      # np.max (unlike the built-in max) is vectorized and propagates NaN
      # deterministically.
      return np.mean(delta), np.max(delta)
  class NetGradientChecker(object):
      """Gradient checks that operate on whole nets rather than single ops."""

      @staticmethod
      def CompareNets(nets, outputs, outputs_with_grad_ids,
                      inputs_with_grads, input_values=None,
                      threshold=0.0000001, print_net_images=False):
          """Assert that several nets agree on forward outputs and gradients.

          Args:
              nets: the nets to compare; each one is run through ``_get_grad``.
              outputs: one list of output blobs per net, in the same order as
                  ``nets``.
              outputs_with_grad_ids: indices into each net's output list
                  selecting the outputs gradients are propagated from.
              inputs_with_grads: inputs whose gradients are compared.
              input_values: optional mapping of blob name to value fed before
                  each net is run.
              threshold: absolute and relative tolerance of the comparison.
              print_net_images: if true, PNG renderings of the forward nets
                  and of the gradient-augmented nets are written to the
                  current directory.

          Raises:
              AssertionError: if any net disagrees with the first one.
          """
          def _write_png(file_prefix, position, drawn_net):
              # Render first, then open the file, so a failed render does not
              # leave an empty file behind.
              image_bytes = net_drawer.GetPydotGraph(drawn_net).create_png()
              file_name = file_prefix + str(position) + drawn_net.Name() + ".png"
              with open(file_name, 'wb') as image_file:
                  image_file.write(image_bytes)

          if print_net_images:
              for position, forward_net in enumerate(nets):
                  _write_png("caffe2_net_forward_", position, forward_net)

          results = []
          for net, net_outputs in zip(nets, outputs):
              grad_output_names = [
                  net_outputs[output_id] for output_id in outputs_with_grad_ids
              ]
              results.append(
                  _get_grad(net, net_outputs, grad_output_names,
                            input_values, inputs_with_grads))

          if print_net_images:
              _, _, backward_nets = zip(*results)
              for position, backward_net in enumerate(backward_nets):
                  _write_png("caffe2_net_", position, backward_net)

          # The first net is the reference every other net is compared with.
          reference_outputs, reference_grads, _ = results[0]
          for other_outputs, other_grads, _ in results[1:]:
              assert len(other_outputs) == len(reference_outputs)
              paired_outputs = zip(reference_outputs, other_outputs)
              for idx, (reference, other) in enumerate(paired_outputs):
                  blob1, blob_value1 = reference
                  blob2, blob_value2 = other
                  forward_msg = (
                      "Different forward pass results for output id {}. "
                      "Corresponding output blobs: {} and {}".format(
                          idx, blob1, blob2))
                  _assert_close(
                      blob_value1, blob_value2, threshold, err_msg=forward_msg)

              assert other_grads.keys() == reference_grads.keys()
              for blob, other_grad_value in other_grads.items():
                  grad_msg = "Different gradients for input {}".format(blob)
                  _assert_close(
                      reference_grads[blob], other_grad_value, threshold,
                      err_msg=grad_msg)

      @staticmethod
      def Check(net, outputs_with_grad, input_values,
                input_to_check, step_size=0.0001,
                threshold=0.05, print_net=True):
          """Compare a net's analytic gradient with a central-difference estimate.

          The loss is the sum of all elements of all ``outputs_with_grad``
          blobs. Each element of ``input_values[input_to_check]`` is perturbed
          by ``+step_size`` and ``-step_size`` in turn and the net is re-run
          for every perturbation.

          Args:
              net: the net to check.
              outputs_with_grad: output blobs that define the loss.
              input_values: mapping of blob name to value; must contain
                  ``input_to_check``.
              input_to_check: the input whose gradient is verified.
              step_size: perturbation used for the finite differences.
              threshold: absolute and relative tolerance of the comparison.
              print_net: if true, the net proto is appended to the failure
                  message.

          Returns:
              The ``(mean, max)`` absolute deviation returned by
              ``_assert_close``.

          Raises:
              AssertionError: if the two gradients are not close.
          """
          _, net_grads, full_net = _get_grad(
              net, [], outputs_with_grad, input_values, [input_to_check])
          analytic_grad = net_grads[input_to_check]
          base_value = input_values[input_to_check]

          def _loss_at(candidate):
              workspace.blobs[input_to_check] = candidate
              workspace.RunNetOnce(full_net)
              output_values = [
                  workspace.blobs[output] for output in outputs_with_grad
              ]
              return sum(output_values).sum()

          def _perturbed(dim, delta):
              # Work on a copy so the caller's array is never modified.
              shifted = base_value.copy()
              shifted.flat[dim] += delta
              return shifted

          grad_estimate = np.zeros_like(base_value)
          for dim in range(base_value.size):
              pos_loss = _loss_at(_perturbed(dim, step_size))
              neg_loss = _loss_at(_perturbed(dim, -step_size))
              grad_estimate.flat[dim] = (pos_loss - neg_loss) / step_size / 2

          err_msg = "Error in gradient check for net_copy {}".format(
              net.Name())
          if print_net:
              err_msg = err_msg + ": {}".format(net.Proto())

          return _assert_close(analytic_grad, grad_estimate, threshold, err_msg)
  class GradientChecker:
      """A gradient checker in Python.

      This is not the most efficient way to check gradients, as the Python
      interface will involve a lot of copies back and forth operations. Use at
      your own risk.
      """

      def __init__(
          self,
          stepsize,
          threshold,
          device_option=None,
          workspace_name="gradient_check",
          input_device_options=None,
      ):
          """Store the checker configuration.

          Args:
              stepsize: perturbation added to and subtracted from a single
                  input element when estimating the gradient.
              threshold: used as both ``atol`` and ``rtol`` when comparing the
                  analytic gradient with the estimate.
              device_option: DeviceOption copied onto the checked op and used
                  as the default when feeding blobs. A default-constructed
                  ``caffe2_pb2.DeviceOption()`` is used when falsy.
              workspace_name: name of the workspace ``CheckSimple`` runs in.
              input_device_options: optional mapping from input blob name to
                  the DeviceOption ``GetLossAndGrad`` feeds that input with.
          """
          self._stepsize = stepsize
          self._threshold = threshold
          self._device_option = device_option or caffe2_pb2.DeviceOption()
          self._workspace_name = workspace_name
          if input_device_options is None:
              self._input_device_options = {}
          else:
              self._input_device_options = input_device_options

      def GetLossAndGrad(
          self, op, grad_ops, inputs, input_names, input_to_check, grad_name,
          outputs_with_grads
      ):
          """Run ``op`` and ``grad_ops`` once and return the loss and a gradient.

          The loss is half the sum of squares of the selected outputs, so the
          gradient of the loss with respect to each selected output is the
          output itself. That value is fed as ``<output name>_grad`` before
          the gradient ops run.

          Args:
              op: the forward operator.
              grad_ops: the gradient operators to run after ``op``.
              inputs: input arrays, fed under the matching ``input_names``.
              input_names: blob names for ``inputs`` (index-aligned).
              input_to_check: index into ``inputs`` of the checked input.
              grad_name: blob name of the gradient to fetch, or a
                  ``core.GradientSlice`` for a sparse gradient.
              outputs_with_grads: indices into ``op.output`` of the outputs
                  contributing to the loss.

          Returns:
              A tuple ``(loss, grad)``.
          """
          for i, arr in enumerate(inputs):
              workspace.FeedBlob(
                  input_names[i], arr,
                  self._input_device_options.get(
                      input_names[i], self._device_option))
          x = inputs[input_to_check]
          # Run.
          workspace.RunOperatorOnce(op)
          loss = 0.
          # Get Loss and feed in the gradients, run gradient ops.
          for idx in outputs_with_grads:
              name = op.output[idx]
              arr = workspace.FetchBlob(name)
              loss += (arr**2).sum()
              workspace.FeedBlob(name + '_grad', arr, self._device_option)
          loss /= 2.
          # Run gradient ops
          workspace.RunOperatorsOnce(grad_ops)
          # Get gradients
          if not isinstance(grad_name, core.GradientSlice):
              return loss, workspace.FetchBlob(grad_name)

          # Sparse gradient: bring indices and values to the CPU, then
          # accumulate them into a zero blob shaped like the checked input so
          # the result can be compared with the dense estimate.
          workspace.FeedBlob('zeros', np.zeros_like(x, dtype=np.float32))
          workspace.FeedBlob('ones', np.ones(1, dtype=np.float32))
          gv_cpu_op = core.CreateOperator(
              'EnsureCPUOutput', grad_name.values, grad_name.values + '_cpu',
              device_option=self._device_option
          )
          gi_cpu_op = core.CreateOperator(
              'EnsureCPUOutput', grad_name.indices, grad_name.indices + '_cpu',
              device_option=self._device_option
          )
          sparse_to_dense_op = core.CreateOperator(
              'ScatterWeightedSum',
              [
                  'zeros', 'ones', grad_name.indices + '_cpu',
                  grad_name.values + '_cpu', 'ones'
              ],
              'zeros',
          )
          workspace.RunOperatorOnce(gv_cpu_op)
          workspace.RunOperatorOnce(gi_cpu_op)
          workspace.RunOperatorOnce(sparse_to_dense_op)
          return loss, workspace.FetchBlob('zeros')

      def CheckSimple(
          self,
          op,
          inputs,
          input_to_check,
          outputs_with_grads,
          grad_ops=None,
          input_device_options=None,
          ensure_outputs_are_inferred=False,
      ):
          """Checks the operator in a very simple fashion by stacking a sum of
          squares on the top.

          Inputs:
            op: the operator to be checked. Its ``device_option`` is
                overwritten with the checker's device option.
            inputs: the input data in numpy arrays. The checked input is
                perturbed in place and restored after each element.
            input_to_check: an index specifying which input blob we should
                check.
            outputs_with_grads: indices specifying which output blobs will we
                need to check gradients with. For these outputs, we will
                collect a squared sum and also feed in their gradients.
            grad_ops: the gradient operators. If not given, we will get the
                gradient operators from the gradient registry. The names of
                the input gradients are always taken from the registry.
            input_device_options: an optional mapping from input names to
                DeviceOptions (to override the default DeviceOption)
            ensure_outputs_are_inferred: if set will assert that the gradient
                output shapes matches the inferred shapes
          Outputs:
            A tuple ``(passed, grad, grad_estimate)``: ``passed`` is True if
            the check passes and False otherwise, ``grad`` is the analytic
            gradient and ``grad_estimate`` the numerical one.
          Raises:
            Exception: if the analytic gradient's shape differs from the
                checked input's shape.
          """
          # Entering the checker workspace
          old_ws_name = workspace.CurrentWorkspace()
          switched = self._workspace_name != old_ws_name
          if switched:
              workspace.SwitchWorkspace(self._workspace_name, True)
          try:
              op.device_option.CopyFrom(self._device_option)
              # TODO(jiayq): use the gradient registration instead of the old
              # hack.
              # g_input (the gradient blob name per input) is needed even
              # when the caller supplies grad_ops, so always ask the registry.
              registry_grad_ops, g_input = getGradientForOp(op)
              if grad_ops is None:
                  grad_ops = registry_grad_ops

              _input_device_options = input_device_options or \
                  core.InferOpBlobDevicesAsDict(op)[0]
              # First, feed in the input.
              # NOTE(review): GetLossAndGrad feeds the same blobs again using
              # the constructor's input_device_options, which appears to
              # override the per-call options used here - confirm intent.
              for i, arr in enumerate(inputs):
                  workspace.FeedBlob(
                      op.input[i], arr,
                      _input_device_options.get(
                          op.input[i], self._device_option))

              # Get the loss and gradient for the original.
              grad_name = g_input[input_to_check]
              _, grad = self.GetLossAndGrad(
                  op, grad_ops, inputs, op.input, input_to_check, grad_name,
                  outputs_with_grads,
              )
              grad_estimate = np.zeros_like(inputs[input_to_check])
              if grad_estimate.shape != grad.shape:
                  raise Exception(
                      "Mismatched gradient shapes: estimated ({}), grad ({})"
                      .format(grad_estimate.shape, grad.shape))

              if ensure_outputs_are_inferred:
                  self._assertInferTensorChecks(op, grad_ops)

              full_grad_check = os.getenv('CAFFE2_FULL_GRAD_CHECK') == '1'

              dims_to_check = inputs[input_to_check].size
              for current_dim in range(dims_to_check):
                  # Grad check is very expensive (as it involves running the
                  # op from scratch for each of the input tensor elements).
                  # Thus, let's run it by default only on a small subset of
                  # dimensions: the first and the last 3 elements of each
                  # tensor. Pass CAFFE2_FULL_GRAD_CHECK=1 env var to enable
                  # the full check.
                  if not full_grad_check and current_dim >= 3 and \
                          current_dim + 3 < dims_to_check:
                      # Skipped elements copy the analytic value so they
                      # trivially pass the comparison below.
                      grad_estimate.flat[current_dim] = grad.flat[current_dim]
                      continue
                  # Positive gradient
                  inputs[input_to_check].flat[current_dim] += self._stepsize
                  pos_loss, _ = self.GetLossAndGrad(
                      op, grad_ops, inputs, op.input, input_to_check,
                      grad_name, outputs_with_grads
                  )
                  # Negative gradient
                  inputs[input_to_check].flat[current_dim] -= \
                      self._stepsize * 2
                  neg_loss, _ = self.GetLossAndGrad(
                      op, grad_ops, inputs, op.input, input_to_check,
                      grad_name, outputs_with_grads
                  )
                  # Recover the value
                  inputs[input_to_check].flat[current_dim] += self._stepsize

                  grad_estimate.flat[current_dim] = (
                      pos_loss - neg_loss) / self._stepsize / 2

              # Now, check correctness
              fail_mat = ~np.isclose(
                  grad, grad_estimate,
                  atol=self._threshold, rtol=self._threshold)
              passed = not np.any(fail_mat)
              if not passed:
                  idx = np.flatnonzero(fail_mat)
                  print('Failed. [idx, grad, grad_estimate] are:')
                  print(np.vstack(
                      [idx, grad.flat[idx], grad_estimate.flat[idx]]).T)
              return passed, grad, grad_estimate
          finally:
              # After finishing (or failing), clean things up.
              if switched:
                  # We reset the workspace to make sure everything
                  # intermediate is cleaned up. Note that there is no need to
                  # delete a workspace - when empty it takes a very limited
                  # amount of memory.
                  workspace.ResetWorkspace()
                  workspace.SwitchWorkspace(old_ws_name)

      def _assertInferTensorChecks(self, op, grad_ops):
          """Assert that inferred shapes/types match the gradient ops' outputs.

          Shape and type inference is run on a temporary net made of ``op``
          followed by ``grad_ops``. Every output blob of every gradient op is
          then fetched from the current workspace and compared with the
          inference result.

          Raises:
              Exception: if an output was not inferred, or if its inferred
                  shape or type differs from the fetched blob.
          """
          tmp_net = caffe2_pb2.NetDef()
          tmp_net.op.extend([op])
          tmp_net.op.extend(grad_ops)
          inferred_shapes, inferred_types = workspace.InferShapesAndTypes(
              [tmp_net],
              nets_proto=True,
          )

          outputs = set()
          for grad_op in grad_ops:
              outputs.update(grad_op.output)

          # numpy dtypes this check knows how to map onto TensorProto types.
          known_types = {
              np.dtype('float64'): caffe2_pb2.TensorProto.DOUBLE,
              np.dtype('float32'): caffe2_pb2.TensorProto.FLOAT,
              np.dtype('int32'): caffe2_pb2.TensorProto.INT32,
              np.dtype('int64'): caffe2_pb2.TensorProto.INT64,
          }

          for output in outputs:
              if output not in inferred_shapes:
                  raise Exception(
                      "expected output {} to be inferred".format(output))
              blob = workspace.FetchBlob(output)
              correct_shape = list(blob.shape)
              inferred_shape = list(inferred_shapes[output])
              if correct_shape != inferred_shape:
                  raise Exception(
                      "Mismatched inferred shape: want({}), got({})".format(
                          correct_shape, inferred_shape))

              if isinstance(blob, np.ndarray):
                  if blob.dtype in known_types:
                      correct_type = known_types[blob.dtype]
                  else:
                      # Report the blob's actual dtype, not the np.dtype class.
                      correct_type = "unknown {}".format(blob.dtype)
              else:
                  correct_type = str(type(blob))
              inferred_type = inferred_types[output]
              if correct_type != inferred_type:
                  raise Exception(
                      "Mismatched inferred type: want({}), got({})".format(
                          correct_type, inferred_type))