# optimizer_test.py (30 KB)
import math
import unittest

import numpy as np
from numpy.testing import assert_allclose, assert_equal

from caffe2.proto import caffe2_pb2
import caffe2.python.optimizer as optimizer
from caffe2.python.optimizer import (
    build_sgd, build_multi_precision_sgd, build_ftrl, build_gftrl, build_wngrad,
    build_adagrad, build_adadelta, build_adam, build_yellowfin, build_rms_prop,
    build_storm, build_decay_adagrad, add_weight_decay, SgdOptimizer)
from caffe2.python.optimizer_context import UseOptimizer
from caffe2.python.optimizer_test_util import (
    OptimizerTestBase, LRModificationTestBase
)
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase
  17. class TestLars(OptimizerTestBase, TestCase):
  18. def testSparse(self):
  19. raise unittest.SkipTest("no sparse support")
  20. def build_optimizer(self, model, **kwargs):
  21. self._skip_gpu = False
  22. return build_sgd(model, base_learning_rate=0.1, lars=0.5, **kwargs)
  23. def check_optimizer(self, optimizer):
  24. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  25. self.assertFalse(optimizer.get_auxiliary_parameters().local)
  26. for param in optimizer.get_auxiliary_parameters().shared:
  27. tensor = workspace.FetchBlob(param)
  28. np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5)
  29. class TestMomentumSgd(OptimizerTestBase, TestCase):
  30. def build_optimizer(self, model, **kwargs):
  31. self._skip_gpu = False
  32. return build_sgd(model, base_learning_rate=0.1, momentum=0.1, **kwargs)
  33. def check_optimizer(self, optimizer):
  34. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  35. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  36. for param in optimizer.get_auxiliary_parameters().shared:
  37. tensor = workspace.FetchBlob(param)
  38. np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5)
  39. class TestSgd(OptimizerTestBase, LRModificationTestBase, TestCase):
  40. def build_optimizer(self, model, **kwargs):
  41. self._skip_gpu = False
  42. return build_sgd(model, base_learning_rate=0.1, **kwargs)
  43. def check_optimizer(self, optimizer):
  44. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  45. self.assertFalse(optimizer.get_auxiliary_parameters().local)
  46. for param in optimizer.get_auxiliary_parameters().shared:
  47. tensor = workspace.FetchBlob(param)
  48. np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5)
  49. class TestMultiPrecisionSgd(
  50. OptimizerTestBase, LRModificationTestBase, TestCase
  51. ):
  52. def build_optimizer(self, model, **kwargs):
  53. self._skip_gpu = False
  54. return build_multi_precision_sgd(
  55. model, base_learning_rate=0.1, **kwargs
  56. )
  57. def check_optimizer(self, optimizer):
  58. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  59. self.assertFalse(optimizer.get_auxiliary_parameters().local)
  60. for param in optimizer.get_auxiliary_parameters().shared:
  61. tensor = workspace.FetchBlob(param)
  62. np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5)
  63. @unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
  64. def testGPUDense(self):
  65. super(TestMultiPrecisionSgd, self).testGPUDense(core.DataType.FLOAT16)
  66. class TestFtrl(OptimizerTestBase, TestCase):
  67. def build_optimizer(self, model, **kwargs):
  68. self._skip_gpu = True
  69. return build_ftrl(
  70. model,
  71. engine=None,
  72. alpha=1.0,
  73. beta=0.1,
  74. lambda1=0.0,
  75. lambda2=0.0,
  76. **kwargs
  77. )
  78. def check_optimizer(self, optimizer):
  79. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  80. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  81. for param in optimizer.get_auxiliary_parameters().local:
  82. workspace.FetchBlob(param)
  83. class TestGFtrl(OptimizerTestBase, TestCase):
  84. def testSparse(self):
  85. raise unittest.SkipTest("no sparse support")
  86. def build_optimizer(self, model, **kwargs):
  87. self._skip_gpu = True
  88. return build_gftrl(
  89. model,
  90. engine=None,
  91. alpha=1.0,
  92. beta=0.1,
  93. lambda1=0.0,
  94. lambda2=0.0,
  95. **kwargs
  96. )
  97. def check_optimizer(self, optimizer):
  98. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  99. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  100. for param in optimizer.get_auxiliary_parameters().local:
  101. workspace.FetchBlob(param)
  102. class TestAdagrad(OptimizerTestBase, LRModificationTestBase, TestCase):
  103. def build_optimizer(self, model, **kwargs):
  104. self._skip_gpu = False
  105. return build_adagrad(model, base_learning_rate=1.0, lars=0.5, **kwargs)
  106. def check_optimizer(self, optimizer):
  107. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  108. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  109. for param in optimizer.get_auxiliary_parameters().local:
  110. workspace.FetchBlob(param)
  111. class TestRowWiseAdagrad(OptimizerTestBase, TestCase):
  112. def build_optimizer(self, model, **kwargs):
  113. self._skip_gpu = True
  114. return build_adagrad(
  115. model, base_learning_rate=1.0, lars=0.5, rowWise=True, **kwargs
  116. )
  117. def check_optimizer(self, optimizer):
  118. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  119. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  120. for param in optimizer.get_auxiliary_parameters().local:
  121. workspace.FetchBlob(param)
  122. def testDense(self):
  123. raise unittest.SkipTest("no dense support")
  124. def testGPUDense(self):
  125. raise unittest.SkipTest("no dense support")
  126. class TestRowWiseAdagradWithCounter(OptimizerTestBase, TestCase):
  127. def build_optimizer(self, model, **kwargs):
  128. self._skip_gpu = True
  129. return build_adagrad(
  130. model,
  131. base_learning_rate=1.0,
  132. lars=0.5,
  133. rowWise=True,
  134. counter_halflife=5,
  135. **kwargs
  136. )
  137. def check_optimizer(self, optimizer):
  138. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  139. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  140. self.assertTrue(workspace.HasBlob("optimizer_iteration"))
  141. iteration_tensor = workspace.FetchBlob("optimizer_iteration")
  142. np.testing.assert_allclose(np.array([2000]),
  143. iteration_tensor,
  144. atol=1e-5)
  145. for param in optimizer.get_auxiliary_parameters().shared:
  146. workspace.FetchBlob(param)
  147. for param in optimizer.get_auxiliary_parameters().local:
  148. workspace.FetchBlob(param)
  149. def testDense(self):
  150. raise unittest.SkipTest("no dense support")
  151. def testGPUDense(self):
  152. raise unittest.SkipTest("no dense support")
  153. class TestWngrad(OptimizerTestBase, LRModificationTestBase, TestCase):
  154. def build_optimizer(self, model, **kwargs):
  155. self._skip_gpu = True
  156. return build_wngrad(model, base_learning_rate=25.0, **kwargs)
  157. def check_optimizer(self, optimizer):
  158. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  159. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  160. for param in optimizer.get_auxiliary_parameters().local:
  161. workspace.FetchBlob(param)
  162. class TestStorm(OptimizerTestBase, LRModificationTestBase, TestCase):
  163. def build_optimizer(self, model, **kwargs):
  164. self._skip_gpu = True
  165. return build_storm(model, base_learning_rate=2.0, **kwargs)
  166. def check_optimizer(self, optimizer):
  167. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  168. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  169. for param in optimizer.get_auxiliary_parameters().local:
  170. workspace.FetchBlob(param)
  171. class TestAdadelta(OptimizerTestBase, LRModificationTestBase, TestCase):
  172. def build_optimizer(self, model, **kwargs):
  173. self._skip_gpu = False
  174. return build_adadelta(model, base_learning_rate=1.0, decay=0.995, **kwargs)
  175. def check_optimizer(self, optimizer):
  176. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  177. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  178. for param in optimizer.get_auxiliary_parameters().local:
  179. workspace.FetchBlob(param)
  180. class TestAdam(OptimizerTestBase, LRModificationTestBase, TestCase):
  181. def build_optimizer(self, model, **kwargs):
  182. self._skip_gpu = False
  183. return build_adam(model, base_learning_rate=0.1, **kwargs)
  184. def check_optimizer(self, optimizer):
  185. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  186. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  187. self.assertTrue(workspace.HasBlob("optimizer_iteration"))
  188. iteration_tensor = workspace.FetchBlob("optimizer_iteration")
  189. np.testing.assert_allclose(np.array([2000]),
  190. iteration_tensor,
  191. atol=1e-5)
  192. for param in optimizer.get_auxiliary_parameters().shared:
  193. workspace.FetchBlob(param)
  194. for param in optimizer.get_auxiliary_parameters().local:
  195. workspace.FetchBlob(param)
  196. class TestSmartDecayAdam(OptimizerTestBase, LRModificationTestBase, TestCase):
  197. def build_optimizer(self, model, **kwargs):
  198. self._skip_gpu = False
  199. kwargs['beta1'] = 0.0
  200. return build_adam(model, base_learning_rate=0.1, use_smart_decay=True, **kwargs)
  201. def check_optimizer(self, optimizer):
  202. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  203. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  204. self.assertTrue(workspace.HasBlob("optimizer_iteration"))
  205. blob_names = workspace.Blobs()
  206. self.assertTrue(any((bn.endswith('_last_seen') for bn in blob_names)))
  207. for param in optimizer.get_auxiliary_parameters().shared:
  208. workspace.FetchBlob(param)
  209. for param in optimizer.get_auxiliary_parameters().local:
  210. workspace.FetchBlob(param)
  211. class TestDecayAdagrad(OptimizerTestBase, LRModificationTestBase, TestCase):
  212. def build_optimizer(self, model, **kwargs):
  213. self._skip_gpu = True
  214. return build_decay_adagrad(model, base_learning_rate=1.0, **kwargs)
  215. def check_optimizer(self, optimizer):
  216. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  217. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  218. self.assertTrue(workspace.HasBlob("optimizer_iteration"))
  219. iteration_tensor = workspace.FetchBlob("optimizer_iteration")
  220. np.testing.assert_allclose(np.array([2000]),
  221. iteration_tensor,
  222. atol=1e-5)
  223. for param in optimizer.get_auxiliary_parameters().shared:
  224. workspace.FetchBlob(param)
  225. for param in optimizer.get_auxiliary_parameters().local:
  226. workspace.FetchBlob(param)
  227. def testSparse(self):
  228. raise unittest.SkipTest("no sparse support")
  229. class TestSparseRAdam(OptimizerTestBase, LRModificationTestBase, TestCase):
  230. def build_optimizer(self, model, **kwargs):
  231. self._skip_gpu = True
  232. return build_adam(model, base_learning_rate=0.1, enableRAdam=True, **kwargs)
  233. def check_optimizer(self, optimizer):
  234. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  235. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  236. self.assertTrue(workspace.HasBlob("optimizer_iteration"))
  237. iteration_tensor = workspace.FetchBlob("optimizer_iteration")
  238. np.testing.assert_allclose(np.array([2000]),
  239. iteration_tensor,
  240. atol=1e-5)
  241. for param in optimizer.get_auxiliary_parameters().shared:
  242. workspace.FetchBlob(param)
  243. for param in optimizer.get_auxiliary_parameters().local:
  244. workspace.FetchBlob(param)
  245. class TestYellowFin(OptimizerTestBase, TestCase):
  246. # YellowFin: An automatic tuner for momentum SGD
  247. # (https://arxiv.org/abs/1706.03471)
  248. def build_optimizer(self, model):
  249. self._skip_gpu = False
  250. return build_yellowfin(model, base_learning_rate=0.1)
  251. def check_optimizer(self, optimizer):
  252. self.assertTrue(optimizer.get_auxiliary_parameters().shared)
  253. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  254. self.assertTrue(workspace.HasBlob("optimizer_iteration"))
  255. iteration_tensor = workspace.FetchBlob("optimizer_iteration")
  256. np.testing.assert_allclose(np.array([2000]),
  257. iteration_tensor,
  258. atol=1e-5)
  259. for param in optimizer.get_auxiliary_parameters().shared:
  260. workspace.FetchBlob(param)
  261. for param in optimizer.get_auxiliary_parameters().local:
  262. workspace.FetchBlob(param)
  263. def testSparse(self):
  264. raise unittest.SkipTest("no sparse support")
  265. def deb(self, val, beta, i, zero_debias):
  266. if zero_debias:
  267. return val / (1.0 - beta ** i)
  268. else:
  269. return val
  270. def get_lr_mu(self, distance, grad_var, h_min, h_max):
  271. # First tune based on dynamic range
  272. if grad_var == 0:
  273. dr = h_max / h_min
  274. mu = ((np.sqrt(dr) - 1) / (np.sqrt(dr) + 1)) ** 2
  275. lr_min = (1 + np.sqrt(mu)) ** 2 / h_max
  276. return lr_min, mu
  277. p = distance ** 2 * h_min ** 2 / 2 / grad_var
  278. w3 = (-math.sqrt(p * p + 4.0 / 27.0 * p * p * p) - p) / 2.0
  279. w = (1.0 if w3 > 0.0 else -1.0) * math.pow(math.fabs(w3), 1.0 / 3.0)
  280. y = w - p / 3.0 / w
  281. root = y + 1
  282. root = min(root, 1.0 - 1e-6)
  283. dr = h_max / h_min
  284. mu = max(((np.sqrt(dr) - 1) / (np.sqrt(dr) + 1)) ** 2, root**2)
  285. lr_min = (1 - np.sqrt(mu)) ** 2 / h_min
  286. return lr_min, mu
  287. def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu):
  288. caffe2_res = {}
  289. alpha = 1.0
  290. mu = 0.0
  291. beta = 0.999
  292. curv_win_width = 20
  293. epsilon = 1e-6
  294. net = core.Net("net")
  295. param_init_net = core.Net("param_init_net")
  296. workspace.ResetWorkspace()
  297. with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
  298. iteration = param_init_net.ConstantFill(
  299. [],
  300. "iteration",
  301. shape=[1],
  302. value=0,
  303. dtype=core.DataType.INT64)
  304. iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
  305. net.AtomicIter([iter_mutex, iteration], [iteration])
  306. pre_grad = param_init_net.ConstantFill(
  307. [],
  308. "pre_grad",
  309. shape=[n_dim],
  310. value=grad_coef
  311. )
  312. if gpu:
  313. iteration = net.CopyCPUToGPU(
  314. [iteration],
  315. "iteration_cpu"
  316. )
  317. iteration_float = net.Cast([iteration], "iteration_float")
  318. grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True)
  319. w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0)
  320. # a hack to create an object with __dict__
  321. param_info = lambda: None
  322. param_info.blob = w
  323. param_info.grad = grad
  324. optimizer.YellowFinOptimizer(
  325. alpha=alpha,
  326. mu=mu,
  327. beta=beta,
  328. curv_win_width=curv_win_width,
  329. epsilon=epsilon,
  330. zero_debias=zero_debias
  331. )._run(
  332. net,
  333. param_init_net,
  334. param_info
  335. )
  336. workspace.RunNetOnce(param_init_net)
  337. workspace.CreateNet(net, overwrite=True)
  338. for i in range(n_iter):
  339. workspace.RunNet(net)
  340. scalars_memory_blob = workspace.FetchBlob("w_scalars_memory")
  341. g_norm2_avg = scalars_memory_blob[1]
  342. g_norm2_min_avg = scalars_memory_blob[2]
  343. g_norm2_max_avg = scalars_memory_blob[3]
  344. distance_avg = scalars_memory_blob[4]
  345. g_avg_blob = workspace.FetchBlob("w_g_avg")
  346. res_lr = workspace.FetchBlob("w_lr_avg")[0]
  347. res_mu = workspace.FetchBlob("w_mu_avg")[0]
  348. g_deb = self.deb(g_avg_blob, beta, i + 1, zero_debias)
  349. variance = max(
  350. self.deb(g_norm2_avg, beta, i + 1, zero_debias) -
  351. g_deb.dot(g_deb),
  352. epsilon
  353. )
  354. if i > 0:
  355. caffe2_res[i] = {
  356. 'h_max': np.exp(self.deb(g_norm2_max_avg,
  357. beta,
  358. i + 1,
  359. zero_debias)),
  360. 'h_min': np.exp(self.deb(g_norm2_min_avg,
  361. beta,
  362. i + 1,
  363. zero_debias)),
  364. 'var': variance,
  365. 'dist': self.deb(distance_avg, beta, i + 1, zero_debias),
  366. 'lr': res_lr,
  367. 'mu': res_mu
  368. }
  369. return caffe2_res
  370. def numpy_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu):
  371. numpy_res = {}
  372. target_h_max = 0.0
  373. target_h_min = 0.0
  374. target_g_norm_squared_avg = 0.0
  375. target_g_norm_avg = 0.0
  376. target_g_avg = 0.0
  377. target_dist_avg = 0.0
  378. target_lr = 1.0
  379. target_mu = 0.0
  380. for i in range(n_iter):
  381. grad_val = (i + 1) * grad_coef
  382. target_g_norm_squared_avg = 0.999 * target_g_norm_squared_avg + \
  383. 0.001 * np.sum((grad_val * np.ones([n_dim, ])) ** 2)
  384. target_g_norm_avg = 0.999 * target_g_norm_avg + \
  385. 0.001 * np.linalg.norm(grad_val * np.ones([n_dim, ]))
  386. target_g_avg = 0.999 * target_g_avg + 0.001 * grad_val
  387. target_h_max = 0.999 * target_h_max + \
  388. 0.001 * np.log(grad_val ** 2 * n_dim)
  389. target_h_min = 0.999 * target_h_min + \
  390. 0.001 * np.log((max(1, i + 2 - 20) * grad_coef) ** 2 * n_dim)
  391. if zero_debias:
  392. target_var = target_g_norm_squared_avg / \
  393. (1 - 0.999 ** (i + 1)) - \
  394. target_g_avg ** 2 * n_dim / (1 - 0.999 ** (i + 1)) ** 2
  395. else:
  396. target_var = target_g_norm_squared_avg - \
  397. target_g_avg ** 2 * n_dim
  398. target_dist_avg = 0.999 * target_dist_avg + \
  399. 0.001 * target_g_norm_avg / target_g_norm_squared_avg
  400. if i > 0:
  401. if zero_debias:
  402. lr, mu = self.get_lr_mu(
  403. target_dist_avg / (1.0 - 0.999 ** (i + 1)),
  404. target_var,
  405. np.exp(target_h_min / (1.0 - 0.999 ** (i + 1))),
  406. np.exp(target_h_max / (1.0 - 0.999 ** (i + 1))))
  407. target_lr = 0.999 * target_lr + 0.001 * lr
  408. target_mu = 0.999 * target_mu + 0.001 * mu
  409. numpy_res[i] = {
  410. 'h_max': np.exp(target_h_max / (1 - 0.999 ** (i + 1))),
  411. 'h_min': np.exp(target_h_min / (1 - 0.999 ** (i + 1))),
  412. 'var': target_var,
  413. 'dist': target_dist_avg / (1 - 0.999 ** (i + 1)),
  414. 'lr': target_lr,
  415. 'mu': target_mu
  416. }
  417. else:
  418. lr, mu = self.get_lr_mu(
  419. target_dist_avg,
  420. target_var,
  421. np.exp(target_h_min),
  422. np.exp(target_h_max))
  423. target_lr = 0.999 * target_lr + 0.001 * lr
  424. target_mu = 0.999 * target_mu + 0.001 * mu
  425. numpy_res[i] = {
  426. 'h_max': np.exp(target_h_max),
  427. 'h_min': np.exp(target_h_min),
  428. 'var': target_var,
  429. 'dist': target_dist_avg,
  430. 'lr': target_lr,
  431. 'mu': target_mu
  432. }
  433. return numpy_res
  434. def compare_yellowfin_models(self,
  435. model0,
  436. model1,
  437. zero_debias,
  438. grad_coef,
  439. n_dim,
  440. n_iter,
  441. gpu):
  442. model0_res = model0(zero_debias, grad_coef, n_dim, n_iter, gpu)
  443. model1_res = model1(zero_debias, grad_coef, n_dim, n_iter, gpu)
  444. assert_equal(len(model0_res), len(model1_res))
  445. for i in range(1, len(model0_res)):
  446. assert_equal(model0_res[i].keys(), model1_res[i].keys())
  447. for feat in model0_res[i].keys():
  448. err_msg = \
  449. 'i=' + str(i) + ',\n' + \
  450. 'feat=' + feat + ',\n' + \
  451. 'grad_coef=' + str(grad_coef) + ',\n' + \
  452. 'zero_debias=' + str(zero_debias)
  453. assert_allclose(model0_res[i][feat],
  454. model1_res[i][feat],
  455. rtol=1e-2,
  456. err_msg=err_msg)
  457. @unittest.skip("Results might vary too much. Only for individual use.")
  458. def test_caffe2_cpu_vs_numpy(self):
  459. n_dim = 1000000
  460. n_iter = 50
  461. cpu_device_opt = core.DeviceOption(caffe2_pb2.CPU)
  462. with core.DeviceScope(cpu_device_opt):
  463. for zero_debias, grad_coef in [
  464. (False, 1.0),
  465. (False, 0.1),
  466. (False, 0.01),
  467. (True, 1.0)
  468. ]:
  469. self.compare_yellowfin_models(
  470. self.caffe2_yellowfin,
  471. self.numpy_yellowfin,
  472. zero_debias,
  473. grad_coef,
  474. n_dim,
  475. n_iter,
  476. gpu=False
  477. )
  478. @unittest.skip("Results might vary too much. Only for individual use.")
  479. @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
  480. def test_caffe2_gpu_vs_numpy(self):
  481. n_dim = 1000000
  482. n_iter = 50
  483. gpu_device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
  484. with core.DeviceScope(gpu_device_opt):
  485. for zero_debias in [False, True]:
  486. for grad_coef in [1.0, 0.1, 0.01]:
  487. self.compare_yellowfin_models(
  488. self.caffe2_yellowfin,
  489. self.numpy_yellowfin,
  490. zero_debias,
  491. grad_coef,
  492. n_dim,
  493. n_iter,
  494. gpu=True
  495. )
  496. class TestRmsProp(OptimizerTestBase, LRModificationTestBase, TestCase):
  497. def build_optimizer(self, model, **kwargs):
  498. self._skip_gpu = False
  499. return build_rms_prop(
  500. model, base_learning_rate=0.1, epsilon=0.1, **kwargs
  501. )
  502. def check_optimizer(self, optimizer):
  503. self.assertFalse(optimizer.get_auxiliary_parameters().shared)
  504. self.assertTrue(optimizer.get_auxiliary_parameters().local)
  505. for param in optimizer.get_auxiliary_parameters().local:
  506. workspace.FetchBlob(param)
  507. def testSparse(self):
  508. raise unittest.SkipTest("no sparse support")
  509. class TestMultiOptimizers(TestCase):
  510. def test_multiple_optimizers(self):
  511. from caffe2.python import brew, core, optimizer
  512. from caffe2.python.model_helper import ModelHelper
  513. model = ModelHelper(name="test")
  514. fc1 = brew.fc(model, 'data', 'fc1', 100, 50)
  515. fc2 = brew.fc(model, fc1, 'fc2', 50, 25)
  516. pred = brew.fc(model, fc2, 'fc3', 25, 10)
  517. (softmax, loss) = model.SoftmaxWithLoss(
  518. [pred, 'label'],
  519. ['softmax', 'loss'],
  520. )
  521. model.AddGradientOperators([loss])
  522. param_to_device = optimizer._get_param_to_device(model)
  523. def infer_blob_device(blob_name):
  524. return optimizer.get_param_device(
  525. blob_name, "{}_grad".format(blob_name), param_to_device
  526. )
  527. sgd_1 = optimizer.SgdOptimizer(base_learning_rate=0.1)
  528. sgd_2 = optimizer.SgdOptimizer(base_learning_rate=0.2)
  529. adagrad = optimizer.AdagradOptimizer()
  530. # Check same optimizer share the same learning rate.
  531. with core.DeviceScope(infer_blob_device("fc1_w")):
  532. sgd_1(model.net, model.param_init_net, "fc1_w", "fc1_w_grad")
  533. with core.DeviceScope(infer_blob_device("fc1_b")):
  534. sgd_1(model.net, model.param_init_net, "fc1_b", "fc1_b_grad")
  535. fc1_lr_blobs = []
  536. for op in model.net.Proto().op:
  537. if op.type == 'WeightedSum' and op.input[0] == 'fc1_w' or \
  538. op.input[0] == 'fc1_b':
  539. fc1_lr_blobs.append(op.input[3])
  540. self.assertEqual(fc1_lr_blobs[0], fc1_lr_blobs[1])
  541. # Check different instance of the same optimizer has a different lr.
  542. with core.DeviceScope(infer_blob_device("fc2_w")):
  543. sgd_2(model.net, model.param_init_net, "fc2_w", "fc2_w_grad")
  544. with core.DeviceScope(infer_blob_device("fc2_b")):
  545. sgd_2(model.net, model.param_init_net, "fc2_b", "fc2_b_grad")
  546. fc2_lr_blobs = []
  547. for op in model.net.Proto().op:
  548. if op.type == 'WeightedSum' and op.input[0] == 'fc2_w' or \
  549. op.input[0] == 'fc2_b':
  550. self.assertTrue(op.input[3] not in fc1_lr_blobs)
  551. fc2_lr_blobs.append(op.input[3])
  552. self.assertEqual(fc2_lr_blobs[0], fc2_lr_blobs[1])
  553. # Check different optimizer type case
  554. with core.DeviceScope(infer_blob_device("fc3_w")):
  555. adagrad(model.net, model.param_init_net, "fc3_w", "fc3_w_grad")
  556. with core.DeviceScope(infer_blob_device("fc3_b")):
  557. adagrad(model.net, model.param_init_net, "fc3_b", "fc3_b_grad")
  558. fc3_lr_blobs = []
  559. for op in model.net.Proto().op:
  560. if op.type == 'Adagrad' and op.input[0] == 'fc3_w' or \
  561. op.input[0] == 'fc3_b':
  562. self.assertTrue(op.input[3] not in fc2_lr_blobs)
  563. self.assertTrue(op.input[3] not in fc1_lr_blobs)
  564. fc3_lr_blobs.append(op.input[3])
  565. self.assertEqual(fc3_lr_blobs[0], fc3_lr_blobs[1])
  566. class TestWeightDecay(TestCase):
  567. def test_weight_decay(self):
  568. from caffe2.python import brew
  569. from caffe2.python.model_helper import ModelHelper
  570. model = ModelHelper(name="test", arg_scope={'order': 'NCHW'})
  571. cnv = brew.conv(model, 'data', 'cnv', 32, 32, 4)
  572. a = brew.fc(model, cnv, 'a', 100, 200)
  573. pred = brew.fc(model, a, 'b', 200, 5)
  574. (softmax, loss) = model.SoftmaxWithLoss(
  575. [pred, 'label'],
  576. ['softmax', 'loss'],
  577. )
  578. model.AddGradientOperators([loss])
  579. add_weight_decay(model, weight_decay=1e-4)
  580. build_sgd(model, 0.11)
  581. expected_weight_grad = {'b_w_grad', 'a_w_grad', 'cnv_w_grad'}
  582. # Check the proto that all weights are decayed and not non-weights
  583. # are decayed.
  584. for op in model.net.Proto().op:
  585. if op.type == 'WeightedSum' and 'wd_0_0' in op.input:
  586. if op.output[0] not in expected_weight_grad:
  587. print(
  588. "Unexpected param for weight_decay: {}".
  589. format(op.output[0])
  590. )
  591. self.assertTrue(op.output[0] in expected_weight_grad)
  592. expected_weight_grad.remove(op.output[0])
  593. self.assertEqual(
  594. expected_weight_grad,
  595. set(),
  596. "Not all weights were decayed: {}".format(expected_weight_grad)
  597. )
  598. class TestOptimizerContext(TestCase):
  599. def test_optimizer_context(self):
  600. from caffe2.python import brew, optimizer
  601. from caffe2.python.model_helper import ModelHelper
  602. model = ModelHelper(name="test", arg_scope={'order': 'NCHW'})
  603. count = optimizer._optimizer_instance_count['SgdOptimizer']
  604. cnv_optim = SgdOptimizer(0.15)
  605. weight_optim = SgdOptimizer(0.2)
  606. bias_optim = SgdOptimizer(0.1)
  607. with UseOptimizer(cnv_optim):
  608. cnv = brew.conv(model, 'data', 'cnv', 32, 32, 4)
  609. with UseOptimizer({'WEIGHT': weight_optim, 'BIAS': bias_optim}):
  610. a = brew.fc(model, cnv, 'a', 100, 200)
  611. pred = brew.fc(model, a, 'b', 200, 5)
  612. (softmax, loss) = model.SoftmaxWithLoss(
  613. [pred, 'label'],
  614. ['softmax', 'loss'],
  615. )
  616. model.AddGradientOperators([loss])
  617. add_weight_decay(model, weight_decay=1e-4)
  618. # use the following optimizer if none specified in param_info
  619. build_sgd(model, 0.11)
  620. expected_weight_grad = {'b_w_grad', 'a_w_grad', 'cnv_w_grad'}
  621. expected_learning_rate = {
  622. "SgdOptimizer_{}_lr_cpu".format(count): -0.15,
  623. "SgdOptimizer_{}_lr_cpu".format(count + 1): -0.2,
  624. "SgdOptimizer_{}_lr_cpu".format(count + 2): -0.1,
  625. "SgdOptimizer_{}_lr_cpu".format(count + 3): -0.11
  626. }
  627. for op in model.net.Proto().op:
  628. # Check the proto that all weights are decayed and not non-weights
  629. # are decayed.
  630. if op.type == 'WeightedSum' and 'wd_0_0' in op.input:
  631. if op.output[0] not in expected_weight_grad:
  632. print(
  633. "Unexpected param for weight_decay: {}".
  634. format(op.output[0])
  635. )
  636. self.assertTrue(op.output[0] in expected_weight_grad)
  637. expected_weight_grad.remove(op.output[0])
  638. # Check the learning rate for each parameter
  639. if op.type == 'LearningRate':
  640. val = 0
  641. for arg in op.arg:
  642. if arg.name == 'base_lr':
  643. val = arg.f
  644. self.assertAlmostEqual(
  645. val,
  646. expected_learning_rate[op.output[0]]
  647. )
  648. self.assertEqual(
  649. expected_weight_grad,
  650. set(),
  651. "Not all weights were decayed: {}".format(expected_weight_grad)
  652. )