muji_test.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. import numpy as np
  2. import unittest
  3. from caffe2.python import core, workspace, muji, test_util
  4. @unittest.skipIf(not workspace.has_gpu_support, "no gpu")
  5. class TestMuji(test_util.TestCase):
  6. def RunningAllreduceWithGPUs(self, gpu_ids, allreduce_function):
  7. """A base function to test different scenarios."""
  8. net = core.Net("mujitest")
  9. for id in gpu_ids:
  10. net.ConstantFill(
  11. [],
  12. "testblob_gpu_" + str(id),
  13. shape=[1, 2, 3, 4],
  14. value=float(id + 1),
  15. device_option=muji.OnGPU(id)
  16. )
  17. allreduce_function(
  18. net, ["testblob_gpu_" + str(i)
  19. for i in gpu_ids], "_reduced", gpu_ids
  20. )
  21. workspace.RunNetOnce(net)
  22. target_value = sum(gpu_ids) + len(gpu_ids)
  23. all_blobs = workspace.Blobs()
  24. all_blobs.sort()
  25. for blob in all_blobs:
  26. print('{} {}'.format(blob, workspace.FetchBlob(blob)))
  27. for idx in gpu_ids:
  28. blob = workspace.FetchBlob("testblob_gpu_" + str(idx) + "_reduced")
  29. np.testing.assert_array_equal(
  30. blob,
  31. target_value,
  32. err_msg="gpu id %d of %s" % (idx, str(gpu_ids))
  33. )
  34. def testAllreduceFallback(self):
  35. self.RunningAllreduceWithGPUs(
  36. list(range(workspace.NumGpuDevices())), muji.AllreduceFallback
  37. )
  38. def testAllreduceSingleGPU(self):
  39. for i in range(workspace.NumGpuDevices()):
  40. self.RunningAllreduceWithGPUs([i], muji.Allreduce)
  41. def testAllreduceWithTwoGPUs(self):
  42. pattern = workspace.GetGpuPeerAccessPattern()
  43. if pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
  44. self.RunningAllreduceWithGPUs([0, 1], muji.Allreduce2)
  45. else:
  46. print('Skipping allreduce with 2 gpus. Not peer access ready.')
  47. def testAllreduceWithFourGPUs(self):
  48. pattern = workspace.GetGpuPeerAccessPattern()
  49. if pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
  50. self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4)
  51. else:
  52. print('Skipping allreduce with 4 gpus. Not peer access ready.')
  53. def testAllreduceWithFourGPUsAndTwoGroups(self):
  54. pattern = workspace.GetGpuPeerAccessPattern()
  55. if pattern.shape[0] >= 4 and np.all(pattern[:2, :2]) and np.all(pattern[2:4, 2:4]):
  56. self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4Group2)
  57. else:
  58. print('Skipping allreduce with 4 gpus and 2 groups. Not peer access ready.')
  59. def testAllreduceWithEightGPUs(self):
  60. pattern = workspace.GetGpuPeerAccessPattern()
  61. if (
  62. pattern.shape[0] >= 8 and np.all(pattern[:4, :4]) and
  63. np.all(pattern[4:, 4:])
  64. ):
  65. self.RunningAllreduceWithGPUs(
  66. list(range(8)), muji.Allreduce8)
  67. else:
  68. print('Skipping allreduce with 8 gpus. Not peer access ready.')
  69. if __name__ == '__main__':
  70. unittest.main()