| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- import copy
- from caffe2.proto import caffe2_pb2
- from caffe2.python import core
def rewrite_init_net_simple(net):
    """Assign every operator of an init net to the IDEEP device.

    Args:
        net: NetDef-like object exposing ``op``; each op's
            ``device_option.device_type`` is overwritten in place.
    """
    for init_op in net.op:
        init_op.device_option.device_type = caffe2_pb2.IDEEP
def last_producer(ops, blob):
    """Return the index of the last op in ``ops`` that outputs ``blob``.

    Args:
        ops: Iterable of operators; each must expose an ``output``
            collection of blob names.
        blob: Name of the blob to look for.

    Returns:
        The position in ``ops`` of the last operator whose ``output``
        contains ``blob``.

    Raises:
        ValueError: If no operator in ``ops`` produces ``blob``.
    """
    # Walk backwards so the first hit is the last producer.
    for (i, op) in reversed(list(enumerate(ops))):
        if blob in op.output:
            return i
    # Interpolate the blob name into the message; passing it as a second
    # exception argument would leave a literal "%s" in the error text.
    raise ValueError(
        "Failed to find last producer of blob, {}".format(blob))
def fix_BoxWithNMSLimit(net):
    """Turn IDEEP-to-CPU copies of BoxWithNMSLimit outputs into plain copies.

    The first three outputs of every ``BoxWithNMSLimit`` op in ``net`` are
    collected. Each ``CopyIDEEPToCPU`` op whose first input is one of those
    blobs is rewritten in place into a ``Copy`` op with a CPU device option.

    Args:
        net: NetDef-like object exposing ``op``; modified in place.
    """
    # NOTE(review): presumably BoxWithNMSLimit already produces CPU blobs,
    # which is why an IDEEP->CPU copy of them is replaced - confirm.
    nms_outputs = set()
    for op in net.op:
        if op.type == 'BoxWithNMSLimit':
            nms_outputs.update((op.output[0], op.output[1], op.output[2]))

    for op in net.op:
        if op.type != 'CopyIDEEPToCPU' or op.input[0] not in nms_outputs:
            continue
        print("Changing CopyIDEEPToCPU to Copy for {}".format(op.input[0]))
        op.type = 'Copy'
        op.device_option.device_type = caffe2_pb2.CPU
def rewrite_run_net_simple(net):
    """Rewrite ``net`` in place so that the whole graph runs on IDEEP.

    This is a simple rewrite that assumes every op can execute with MKL.
    It prepends a ``CopyCPUToIDEEP`` op for ``external_input[0]`` and
    appends one ``CopyIDEEPToCPU`` op per external output. The affected
    blobs are renamed to ``<name>__MKL__`` inside the graph so the original
    names refer to the CPU-side copies.

    Args:
        net: NetDef proto to modify in place.

    Raises:
        Exception: If ``external_input[0]`` is not the first input of the
            first op.
        ValueError: Propagated from ``last_producer`` when an external
            output is not produced by any op.
    """
    def to_mkl_name(blob_name):
        return "{}__MKL__".format(blob_name)

    def swap_blob(blobs, target):
        # Copy of ``blobs`` with each occurrence of ``target`` replaced by
        # its MKL alias.
        return [to_mkl_name(b) if b == target else b for b in blobs]

    first_input = net.external_input[0]
    if first_input != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                first_input, net.op[0]))

    # Feed the first op from the IDEEP copy of the input blob.
    mkl_input = to_mkl_name(first_input)
    head_copy = core.CreateOperator("CopyCPUToIDEEP", first_input, mkl_input)
    net.op[0].input[0] = mkl_input

    # One copy-back op per external output: <name>__MKL__ -> <name>.
    tail_copies = []
    for out_name in net.external_output:
        tail_copies.append(
            core.CreateOperator(
                "CopyIDEEPToCPU", to_mkl_name(out_name), out_name))

    for out_name in net.external_output:
        producer_idx = last_producer(net.op, out_name)
        producer = net.op[producer_idx]
        producer.output[:] = swap_blob(producer.output, out_name)
        # Ops after the producer that read the output must follow the rename.
        for consumer in net.op[producer_idx + 1:]:
            consumer.input[:] = swap_blob(consumer.input, out_name)

    # Rebuild the op list as: input copy, original ops, output copies.
    all_ops = [head_copy] + list(net.op) + tail_copies
    del net.op[:]
    net.op.extend(all_ops)

    ideep_device = caffe2_pb2.IDEEP
    for placed_op in net.op:
        placed_op.device_option.MergeFrom(
            core.DeviceOption(device_type=ideep_device))
        placed_op.engine = ""

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
def rewrite_run_net_simple_xrayocr_lstm(net):
    """Rewrite ``net`` in place: pre-LSTM ops on IDEEP, the rest on CPU.

    For the xrayocr model with LSTM, only the non-LSTM part of the net is
    rewritten to enable MKL. The temporary output blob at the break point
    and all external inputs used by the LSTM part are copied to CPU, and
    the rest of the net (two LSTMs) executes on CPU.

    This only works for the xrayocr LSTM model, which uses the first
    ``Shape`` op to decide the break point. After the two LSTMs comes the
    external output directly, so nothing is copied back to IDEEP/MKL.

    Args:
        net: NetDef proto to modify in place.

    Raises:
        Exception: If ``external_input[0]`` is not the first input of the
            first op.
        AssertionError: If an external input is also produced by an op of
            the net, or if the net contains no ``Shape`` op.
        ValueError: Propagated from ``last_producer`` when a break-point
            blob is not produced by any op.
    """
    def mkl_tmp(name):
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # Built once so the membership tests below are O(1) instead of a linear
    # scan of the repeated field.
    external_inputs = set(net.external_input)

    # The net may contain some external_inputs falsely added during
    # ONNX->Caffe2. This should be taken care of in early steps during
    # pytorch_to_caffe2, but if not it can cause issues in follow-up steps,
    # so check here to confirm. Raised explicitly (not via ``assert``) so the
    # check survives ``python -O``.
    for op in net.op:
        for out_blob in op.output:
            if out_blob in external_inputs:
                raise AssertionError(
                    "External input blob {} is also produced by an op "
                    "in the net".format(out_blob))

    # The first Shape op marks the starting point of the LSTM chunk.
    external_output = None
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        if op.type == 'Shape':
            # Live reference to the Shape op's inputs: later in-place
            # renames of that op are visible through it.
            external_output = op.input
            cpu_op_start_idx = op_idx
            break

    # Make sure we found the expected break point of the net.
    if external_output is None:
        raise AssertionError(
            "No 'Shape' op found to mark the start of the LSTM part "
            "of the net")

    # Any external input consumed after the Shape op needs a CPU copy.
    external_inputs_to_cpu = set()
    for op in net.op[cpu_op_start_idx + 1:]:
        for in_blob in op.input:
            if in_blob in external_inputs:
                external_inputs_to_cpu.add(in_blob)

    # Create ops to copy external input blobs used in the LSTM part from
    # IDEEP to CPU. Sorted so the emitted op order is deterministic.
    copy_extra_input_ops = []
    for in_blob in sorted(external_inputs_to_cpu):
        copy_extra_input_ops.append(
            core.CreateOperator(to_cpu, in_blob, cpu_tmp(in_blob)))
        # Rename input blobs in the LSTM part to use the CPU copy.
        for op in net.op[cpu_op_start_idx:]:
            renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    # Copy the break-point blobs back to CPU under their original names.
    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output]

    for output_blob in external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # Rearrange all ops in the correct order.
    ops = ([copy_input_op] + net.op[:cpu_op_start_idx]
           + copy_output_ops + copy_extra_input_ops
           + net.op[cpu_op_start_idx:])
    del net.op[:]
    net.op.extend(ops)

    device = caffe2_pb2.IDEEP
    for op in net.op:
        # The first Shape op marks the starting point of the LSTM chunk;
        # from there on every op runs on CPU.
        if op.type == 'Shape':
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment.
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # Nested ops get the same device as the enclosing op.
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # Rename inputs of the nested op to the CPU copies.
                        nested_op.input[:] = [
                            blob if blob not in external_inputs_to_cpu
                            else cpu_tmp(blob)
                            for blob in nested_op.input]

                    # Rename external inputs of the nested net.
                    arg.n.external_input[:] = [
                        blob if blob not in external_inputs_to_cpu
                        else cpu_tmp(blob)
                        for blob in arg.n.external_input]

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
def rewrite_model_helper_simple(model):
    """Return a deep copy of ``model`` with both of its nets rewritten.

    The copy's ``param_init_net`` proto is passed to
    ``rewrite_init_net_simple`` (all parameter initialization should run on
    MKL) and its ``net`` proto to ``rewrite_run_net_simple``. The ``model``
    argument itself is not rewritten.

    Args:
        model: Model helper exposing ``param_init_net`` and ``net``.

    Returns:
        The rewritten deep copy.
    """
    rewritten = copy.deepcopy(model)
    init_proto = rewritten.param_init_net.Proto()
    rewrite_init_net_simple(init_proto)
    run_proto = rewritten.net.Proto()
    rewrite_run_net_simple(run_proto)
    return rewritten
|