lstm_benchmark.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. ## @package lstm_benchmark
  2. # Module caffe2.python.lstm_benchmark
  3. from caffe2.proto import caffe2_pb2
  4. from caffe2.python import workspace, core, utils, rnn_cell, model_helper
  5. from caffe2.python import recurrent
  6. import argparse
  7. import numpy as np
  8. import time
  9. import logging
  10. logging.basicConfig()
  11. log = logging.getLogger("lstm_bench")
  12. log.setLevel(logging.DEBUG)
  13. def generate_data(T, shape, num_labels, fixed_shape):
  14. '''
  15. Fill a queue with input data
  16. '''
  17. log.info("Generating T={} sequence batches".format(T))
  18. generate_input_init_net = core.Net('generate_input_init')
  19. queue = generate_input_init_net.CreateBlobsQueue(
  20. [], "inputqueue", num_blobs=1, capacity=T,
  21. )
  22. label_queue = generate_input_init_net.CreateBlobsQueue(
  23. [], "labelqueue", num_blobs=1, capacity=T,
  24. )
  25. workspace.RunNetOnce(generate_input_init_net)
  26. generate_input_net = core.Net('generate_input')
  27. generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
  28. generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
  29. np.random.seed(2603)
  30. entry_counts = []
  31. for t in range(T):
  32. if (t % (max(10, T // 10)) == 0):
  33. print("Generating data {}/{}".format(t, T))
  34. # Randomize the seqlength
  35. random_shape = (
  36. [np.random.randint(1, shape[0])] + shape[1:]
  37. if t > 0 and not fixed_shape else shape
  38. )
  39. X = np.random.rand(*random_shape).astype(np.float32)
  40. batch_size = random_shape[1]
  41. L = num_labels * batch_size
  42. labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
  43. workspace.FeedBlob("scratch", X)
  44. workspace.FeedBlob("label_scr", labels)
  45. workspace.RunNetOnce(generate_input_net.Proto())
  46. entry_counts.append(random_shape[0] * random_shape[1])
  47. log.info("Finished data generation")
  48. return queue, label_queue, entry_counts
  49. def create_model(args, queue, label_queue, input_shape):
  50. model = model_helper.ModelHelper(name="LSTM_bench")
  51. seq_lengths, target = \
  52. model.net.AddExternalInputs(
  53. 'seq_lengths',
  54. 'target',
  55. )
  56. input_blob = model.net.DequeueBlobs(queue, "input_data")
  57. labels = model.net.DequeueBlobs(label_queue, "label")
  58. init_blobs = []
  59. if args.implementation in ["own", "static", "static_dag"]:
  60. T = None
  61. if "static" in args.implementation:
  62. assert args.fixed_shape, \
  63. "Random input length is not static RNN compatible"
  64. T = args.seq_length
  65. print("Using static RNN of size {}".format(T))
  66. for i in range(args.num_layers):
  67. hidden_init, cell_init = model.net.AddExternalInputs(
  68. "hidden_init_{}".format(i),
  69. "cell_init_{}".format(i)
  70. )
  71. init_blobs.extend([hidden_init, cell_init])
  72. output, last_hidden, _, last_state = rnn_cell.LSTM(
  73. model=model,
  74. input_blob=input_blob,
  75. seq_lengths=seq_lengths,
  76. initial_states=init_blobs,
  77. dim_in=args.input_dim,
  78. dim_out=[args.hidden_dim] * args.num_layers,
  79. scope="lstm1",
  80. memory_optimization=args.memory_optimization,
  81. forward_only=args.forward_only,
  82. drop_states=True,
  83. return_last_layer_only=True,
  84. static_rnn_unroll_size=T,
  85. )
  86. if "dag" in args.implementation:
  87. print("Using DAG net type")
  88. model.net.Proto().type = 'dag'
  89. model.net.Proto().num_workers = 4
  90. elif args.implementation == "cudnn":
  91. # We need to feed a placeholder input so that RecurrentInitOp
  92. # can infer the dimensions.
  93. init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
  94. model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
  95. output, last_hidden, _ = rnn_cell.cudnn_LSTM(
  96. model=model,
  97. input_blob=input_blob,
  98. initial_states=init_blobs,
  99. dim_in=args.input_dim,
  100. dim_out=args.hidden_dim,
  101. scope="cudnnlstm",
  102. num_layers=args.num_layers,
  103. )
  104. else:
  105. assert False, "Unknown implementation"
  106. weights = model.net.UniformFill(labels, "weights")
  107. softmax, loss = model.net.SoftmaxWithLoss(
  108. [model.Flatten(output), labels, weights],
  109. ['softmax', 'loss'],
  110. )
  111. if not args.forward_only:
  112. model.AddGradientOperators([loss])
  113. # carry states over
  114. for init_blob in init_blobs:
  115. model.net.Copy(last_hidden, init_blob)
  116. sz = args.hidden_dim
  117. if args.implementation == "cudnn":
  118. sz *= args.num_layers
  119. workspace.FeedBlob(init_blob, np.zeros(
  120. [1, args.batch_size, sz], dtype=np.float32
  121. ))
  122. if args.rnn_executor:
  123. for op in model.net.Proto().op:
  124. if op.type.startswith('RecurrentNetwork'):
  125. recurrent.set_rnn_executor_config(
  126. op,
  127. num_threads=args.rnn_executor_num_threads,
  128. max_cuda_streams=args.rnn_executor_max_cuda_streams,
  129. )
  130. return model, output
  131. def Caffe2LSTM(args):
  132. T = args.data_size // args.batch_size
  133. input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
  134. queue, label_queue, entry_counts = generate_data(T // args.seq_length,
  135. input_blob_shape,
  136. args.hidden_dim,
  137. args.fixed_shape)
  138. workspace.FeedBlob(
  139. "seq_lengths",
  140. np.array([args.seq_length] * args.batch_size, dtype=np.int32)
  141. )
  142. model, output = create_model(args, queue, label_queue, input_blob_shape)
  143. workspace.RunNetOnce(model.param_init_net)
  144. workspace.CreateNet(model.net)
  145. start_time = time.time()
  146. num_iters = T // args.seq_length
  147. total_iters = 0
  148. # Run the Benchmark
  149. log.info("------ Warming up ------")
  150. workspace.RunNet(model.net.Proto().name)
  151. if (args.gpu):
  152. log.info("Memory stats:")
  153. stats = utils.GetGPUMemoryUsageStats()
  154. log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
  155. log.info("------ Starting benchmark ------")
  156. start_time = time.time()
  157. last_time = time.time()
  158. for iteration in range(1, num_iters, args.iters_to_report):
  159. iters_once = min(args.iters_to_report, num_iters - iteration)
  160. total_iters += iters_once
  161. workspace.RunNet(model.net.Proto().name, iters_once)
  162. new_time = time.time()
  163. log.info(
  164. "Iter: {} / {}. Entries Per Second: {}k.".format(
  165. iteration,
  166. num_iters,
  167. np.sum(entry_counts[iteration:iteration + iters_once]) /
  168. (new_time - last_time) // 100 / 10,
  169. )
  170. )
  171. last_time = new_time
  172. log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
  173. np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
  174. " (with RNN executor)" if args.rnn_executor else "",
  175. ))
  176. if (args.gpu):
  177. log.info("Memory stats:")
  178. stats = utils.GetGPUMemoryUsageStats()
  179. log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
  180. if (stats['max_total'] != stats['total']):
  181. log.warning(
  182. "Max usage differs from current total usage: {} > {}".
  183. format(stats['max_total'], stats['total'])
  184. )
  185. log.warning("This means that costly deallocations occurred.")
  186. return time.time() - start_time
  187. @utils.debug
  188. def Benchmark(args):
  189. return Caffe2LSTM(args)
  190. def GetArgumentParser():
  191. parser = argparse.ArgumentParser(description="LSTM benchmark.")
  192. parser.add_argument(
  193. "--hidden_dim",
  194. type=int,
  195. default=800,
  196. help="Hidden dimension",
  197. )
  198. parser.add_argument(
  199. "--input_dim",
  200. type=int,
  201. default=40,
  202. help="Input dimension",
  203. )
  204. parser.add_argument(
  205. "--batch_size",
  206. type=int,
  207. default=128,
  208. help="The batch size."
  209. )
  210. parser.add_argument(
  211. "--seq_length",
  212. type=int,
  213. default=20,
  214. help="Max sequence length"
  215. )
  216. parser.add_argument(
  217. "--data_size",
  218. type=int,
  219. default=1000000,
  220. help="Number of data points to generate"
  221. )
  222. parser.add_argument(
  223. "--iters_to_report",
  224. type=int,
  225. default=20,
  226. help="Number of iteration to report progress"
  227. )
  228. parser.add_argument(
  229. "--gpu",
  230. action="store_true",
  231. help="Run all on GPU",
  232. )
  233. parser.add_argument(
  234. "--implementation",
  235. type=str,
  236. default="own",
  237. help="'cudnn', 'own', 'static' or 'static_dag'",
  238. )
  239. parser.add_argument(
  240. "--fixed_shape",
  241. action="store_true",
  242. help=("Whether to randomize shape of input batches. "
  243. "Static RNN requires fixed shape"),
  244. )
  245. parser.add_argument(
  246. "--memory_optimization",
  247. action="store_true",
  248. help="Whether to use memory optimized LSTM or not",
  249. )
  250. parser.add_argument(
  251. "--forward_only",
  252. action="store_true",
  253. help="Whether to run only forward pass"
  254. )
  255. parser.add_argument(
  256. "--num_layers",
  257. type=int,
  258. default=1,
  259. help="Number of LSTM layers. All output dimensions are going to be"
  260. "of hidden_dim size",
  261. )
  262. parser.add_argument(
  263. "--rnn_executor",
  264. action="store_true",
  265. help="Whether to use RNN executor"
  266. )
  267. parser.add_argument(
  268. "--rnn_executor_num_threads",
  269. type=int,
  270. default=None,
  271. help="Number of threads used by CPU RNN Executor"
  272. )
  273. parser.add_argument(
  274. "--rnn_executor_max_cuda_streams",
  275. type=int,
  276. default=None,
  277. help="Maximum number of CUDA streams used by RNN executor on GPU"
  278. )
  279. return parser
  280. if __name__ == '__main__':
  281. args, extra_args = GetArgumentParser().parse_known_args()
  282. rnn_executor_opt = 1 if args.rnn_executor else 0
  283. workspace.GlobalInit([
  284. 'caffe2',
  285. '--caffe2_log_level=0',
  286. '--caffe2_print_blob_sizes_at_exit=0',
  287. '--caffe2_rnn_executor={}'.format(rnn_executor_opt),
  288. '--caffe2_gpu_memory_tracking=1'] + extra_args)
  289. device = core.DeviceOption(
  290. workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 4)
  291. with core.DeviceScope(device):
  292. Benchmark(args)