| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347 |
- ## @package lstm_benchmark
- # Module caffe2.python.lstm_benchmark
- from caffe2.proto import caffe2_pb2
- from caffe2.python import workspace, core, utils, rnn_cell, model_helper
- from caffe2.python import recurrent
- import argparse
- import numpy as np
- import time
- import logging
# Install the default root handler so benchmark messages are actually emitted.
logging.basicConfig()

# Module-level logger shared by every function in this benchmark; DEBUG level
# lets the info/warning progress messages below through.
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)
def generate_data(T, shape, num_labels, fixed_shape):
    """Fill two Caffe2 blob queues with random input and label batches.

    Args:
        T: Number of batches to enqueue; also used as the capacity of both
            queues.
        shape: Sequence whose first entry is the maximum sequence length and
            whose second entry is the batch size (the caller in this file
            passes ``[seq_length, batch_size, input_dim]``).
        num_labels: Labels are drawn uniformly from
            ``[0, num_labels * batch_size)``.
        fixed_shape: When true every batch uses ``shape`` unchanged. When
            false, every batch after the first gets a random sequence length
            drawn from ``[1, shape[0])``.

    Returns:
        Tuple ``(queue, label_queue, entry_counts)`` where ``entry_counts[t]``
        is ``seq_length * batch_size`` of the t-th generated batch.
    """
    log.info("Generating T={} sequence batches".format(T))

    # One-off net that creates the two queues.
    init_net = core.Net('generate_input_init')
    queue = init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(init_net)

    # Net that pushes the current contents of the scratch blobs into the
    # queues; it is re-run once per generated batch.
    enqueue_net = core.Net('generate_input')
    enqueue_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    enqueue_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])

    # Fixed seed so that repeated runs benchmark identical data.
    np.random.seed(2603)

    progress_step = max(10, T // 10)
    max_seq_length = shape[0]
    entry_counts = []
    for t in range(T):
        if t % progress_step == 0:
            print("Generating data {}/{}".format(t, T))

        # Randomize the sequence length, except for the first batch which
        # always uses the full shape. randint's upper bound is exclusive, so
        # a maximum length of 1 leaves no valid random length (randint(1, 1)
        # raises ValueError); fall back to the full shape in that case.
        if t > 0 and not fixed_shape and max_seq_length > 1:
            random_shape = (
                [np.random.randint(1, max_seq_length)] + list(shape[1:])
            )
        else:
            random_shape = shape

        X = np.random.rand(*random_shape).astype(np.float32)
        seq_length, batch_size = random_shape[0], random_shape[1]
        label_range = num_labels * batch_size
        # One label per time step.
        labels = (np.random.rand(seq_length) * label_range).astype(np.int32)

        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(enqueue_net.Proto())
        entry_counts.append(seq_length * batch_size)

    log.info("Finished data generation")

    return queue, label_queue, entry_counts
def create_model(args, queue, label_queue, input_shape):
    """Build the benchmark LSTM model on top of the given data queues.

    Args:
        args: Parsed command-line namespace. Reads ``implementation``,
            ``fixed_shape``, ``seq_length``, ``num_layers``, ``input_dim``,
            ``hidden_dim``, ``batch_size``, ``memory_optimization``,
            ``forward_only``, ``rnn_executor``, ``rnn_executor_num_threads``
            and ``rnn_executor_max_cuda_streams``.
        queue: Blob queue the input batches are dequeued from.
        label_queue: Blob queue the label batches are dequeued from.
        input_shape: Shape of the placeholder input that is filled in the
            param init net for the cuDNN implementation.

    Returns:
        Tuple ``(model, output)``: the ``ModelHelper`` and the LSTM output
        blob.

    Raises:
        AssertionError: If ``args.implementation`` is not one of ``own``,
            ``static``, ``static_dag`` or ``cudnn``, or if a static
            implementation is requested without ``args.fixed_shape``.
    """
    model = model_helper.ModelHelper(name="LSTM_bench")
    seq_lengths, _target = model.net.AddExternalInputs(
        'seq_lengths',
        'target',
    )

    input_blob = model.net.DequeueBlobs(queue, "input_data")
    labels = model.net.DequeueBlobs(label_queue, "label")

    implementation = args.implementation
    init_blobs = []
    if implementation in ("own", "static", "static_dag"):
        unroll_size = None
        if "static" in implementation:
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped when Python runs with -O.
            if not args.fixed_shape:
                raise AssertionError(
                    "Random input length is not static RNN compatible"
                )
            unroll_size = args.seq_length
            print("Using static RNN of size {}".format(unroll_size))

        # One (hidden, cell) pair of initial-state inputs per layer.
        for layer in range(args.num_layers):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(layer),
                "cell_init_{}".format(layer),
            )
            init_blobs.extend([hidden_init, cell_init])

        output, last_hidden, _, _last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=[args.hidden_dim] * args.num_layers,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
            drop_states=True,
            return_last_layer_only=True,
            static_rnn_unroll_size=unroll_size,
        )

        if "dag" in implementation:
            print("Using DAG net type")
            model.net.Proto().type = 'dag'
            model.net.Proto().num_workers = 4

    elif implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=args.num_layers,
        )

    else:
        # Explicit raise keeps the original exception type while surviving
        # `python -O`, where a bare `assert False` would be removed and the
        # code below would fail on the undefined `output` instead.
        raise AssertionError("Unknown implementation")

    weights = model.net.UniformFill(labels, "weights")
    _softmax, loss = model.net.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # Carry states over to the next iteration and seed them with zeros.
    # NOTE(review): every init blob, including the cell-state ones, receives
    # a copy of `last_hidden`; presumably acceptable for a throughput
    # benchmark -- confirm this is intended.
    for init_blob in init_blobs:
        model.net.Copy(last_hidden, init_blob)

        state_size = args.hidden_dim
        if implementation == "cudnn":
            state_size *= args.num_layers
        workspace.FeedBlob(init_blob, np.zeros(
            [1, args.batch_size, state_size], dtype=np.float32
        ))

    if args.rnn_executor:
        # Apply the executor settings to every recurrent op in the net.
        for op in model.net.Proto().op:
            if op.type.startswith('RecurrentNetwork'):
                recurrent.set_rnn_executor_config(
                    op,
                    num_threads=args.rnn_executor_num_threads,
                    max_cuda_streams=args.rnn_executor_max_cuda_streams,
                )
    return model, output
def Caffe2LSTM(args):
    """Generate data, build the model and run the timed benchmark loop.

    Args:
        args: Parsed command-line namespace. Reads ``data_size``,
            ``batch_size``, ``seq_length``, ``input_dim``, ``hidden_dim``,
            ``fixed_shape``, ``gpu``, ``iters_to_report`` and
            ``rnn_executor`` here; the full namespace is forwarded to
            ``create_model``.

    Returns:
        Wall-clock seconds elapsed between the start of the benchmark loop
        (after the warm-up run) and the end of this function, which includes
        the final GPU memory-stat logging when ``args.gpu`` is set.
    """
    batches_total = args.data_size // args.batch_size
    num_iters = batches_total // args.seq_length
    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]

    queue, label_queue, entry_counts = generate_data(
        num_iters,
        input_blob_shape,
        args.hidden_dim,
        args.fixed_shape,
    )

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    # Run the Benchmark
    log.info("------ Warming up ------")
    # The warm-up run consumes the first queued batch, which is why the
    # measurements below start at index 1 of entry_counts.
    workspace.RunNet(model.net.Proto().name)

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))

    log.info("------ Starting benchmark ------")
    start_time = time.time()
    last_time = time.time()
    for iteration in range(1, num_iters, args.iters_to_report):
        # The last chunk may be shorter than iters_to_report.
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info(
            "Iter: {} / {}. Entries Per Second: {}k.".format(
                iteration,
                num_iters,
                np.sum(entry_counts[iteration:iteration + iters_once]) /
                (new_time - last_time) // 100 / 10,
            )
        )
        last_time = new_time

    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        " (with RNN executor)" if args.rnn_executor else "",
    ))

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if stats['max_total'] != stats['total']:
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occurred.")

    return time.time() - start_time
@utils.debug
def Benchmark(args):
    """Run the Caffe2 LSTM benchmark and return the value of ``Caffe2LSTM``.

    Wrapped in ``utils.debug`` (defined outside this file; presumably a
    debugging aid around failures -- confirm against caffe2.python.utils).
    """
    elapsed_seconds = Caffe2LSTM(args)
    return elapsed_seconds
def GetArgumentParser():
    """Build the command-line parser for the LSTM benchmark.

    Returns:
        An ``argparse.ArgumentParser`` exposing the model dimensions, data
        size, reporting interval, implementation choice and RNN executor
        options used by this script.
    """
    parser = argparse.ArgumentParser(description="LSTM benchmark.")
    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=800,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="The batch size."
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length"
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=1000000,
        help="Number of data points to generate"
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=20,
        help="Number of iteration to report progress"
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn', 'own', 'static' or 'static_dag'",
    )
    parser.add_argument(
        "--fixed_shape",
        action="store_true",
        # The flag disables randomization; the previous wording described
        # the opposite of what passing it does.
        help=("Use a fixed shape for all input batches instead of "
              "randomizing the sequence length. "
              "Static RNN requires fixed shape"),
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only forward pass"
    )
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        # Trailing space added: the two adjacent literals used to join as
        # "...going to beof hidden_dim size".
        help="Number of LSTM layers. All output dimensions are going to be "
             "of hidden_dim size",
    )
    parser.add_argument(
        "--rnn_executor",
        action="store_true",
        help="Whether to use RNN executor"
    )
    parser.add_argument(
        "--rnn_executor_num_threads",
        type=int,
        default=None,
        help="Number of threads used by CPU RNN Executor"
    )
    parser.add_argument(
        "--rnn_executor_max_cuda_streams",
        type=int,
        default=None,
        help="Maximum number of CUDA streams used by RNN executor on GPU"
    )

    return parser
if __name__ == '__main__':
    # Arguments this parser does not recognize are forwarded verbatim to
    # Caffe2's global initialization below.
    args, extra_args = GetArgumentParser().parse_known_args()

    caffe2_flags = [
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_rnn_executor={}'.format(1 if args.rnn_executor else 0),
        '--caffe2_gpu_memory_tracking=1',
    ]
    workspace.GlobalInit(caffe2_flags + extra_args)

    if args.gpu:
        device_type = workspace.GpuDeviceType
    else:
        device_type = caffe2_pb2.CPU

    # NOTE(review): the device id is hard-coded to 4 -- confirm this is
    # intended for machines with fewer devices.
    device = core.DeviceOption(device_type, 4)

    with core.DeviceScope(device):
        Benchmark(args)
|