本文整理匯總了Python中tensorrt.volume方法的典型用法代碼示例。如果您正苦於以下問題:Python tensorrt.volume方法的具體用法?Python tensorrt.volume怎麽用?Python tensorrt.volume使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類tensorrt
的用法示例。
在下文中一共展示了tensorrt.volume方法的14個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: __allocate_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def __allocate_buffers(self, engine):
inputs = []
outputs = []
bindings = []
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings
示例2: allocate_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def allocate_buffers(engine):
"""Allocates all host/device in/out buffers required for an engine."""
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * \
engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
示例3: allocate_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
示例4: allocate_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
index = 0
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
shape = [engine.max_batch_size] + list(engine.get_binding_shape(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype).reshape(shape)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
else:
outputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
index += 1
return inputs, outputs, bindings
示例5: allocate_buffers_torch
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def allocate_buffers_torch(engine: trt.ICudaEngine, device):
import torch
inputs = []
outputs = []
bindings = []
index = 0
dtype_map = np_to_torch_dtype_map()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
shape = [engine.max_batch_size] + list(engine.get_binding_shape(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype).reshape(shape)
device_mem = torch.empty(*host_mem.shape, device=device, dtype=dtype_map[host_mem.dtype])
# Append the device buffer to device bindings.
bindings.append(device_mem.data_ptr())
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
else:
outputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
index += 1
return inputs, outputs, bindings
示例6: allocate_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def allocate_buffers(engine):
# Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(ModelData.DTYPE))
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(ModelData.DTYPE))
# Allocate device memory for inputs and outputs.
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()
return h_input, d_input, h_output, d_output, stream
示例7: _create_context
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def _create_context(self):
for binding in self.engine:
size = trt.volume(self.engine.get_binding_shape(binding)) * \
self.engine.max_batch_size
host_mem = cuda.pagelocked_empty(size, np.float32)
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
self.bindings.append(int(cuda_mem))
if self.engine.binding_is_input(binding):
self.host_inputs.append(host_mem)
self.cuda_inputs.append(cuda_mem)
else:
self.host_outputs.append(host_mem)
self.cuda_outputs.append(cuda_mem)
return self.engine.create_execution_context()
示例8: allocate_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def allocate_buffers(engine):
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(DTYPE))
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(DTYPE))
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
return h_input, d_input, h_output, d_output
示例9: alloc_buf
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def alloc_buf(engine):
# host cpu mem
h_in_size = trt.volume(engine.get_binding_shape(0))
h_out_size = trt.volume(engine.get_binding_shape(1))
h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
# allocate gpu mem
in_gpu = cuda.mem_alloc(in_cpu.nbytes)
out_gpu = cuda.mem_alloc(out_cpu.nbytes)
stream = cuda.Stream()
return in_cpu, out_cpu, in_gpu, out_gpu, stream
示例10: __init__
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def __init__(self, model_path, calibration_files, batch_size, h, w, means, stds):
trt.IInt8EntropyCalibrator2.__init__(self)
self.batch_size = batch_size
self.h = h
self.w = w
self.input_size = [h, w]
self.means = np.array(stds, dtype=np.float32)
self.stds = np.array(stds, dtype=np.float32)
assert(isinstance(calibration_files, list))
self.calib_image_paths = calibration_files
assert(os.path.exists(model_path))
self.cache_file = os.path.join(model_path, "model.trt.int8calib")
self.shape = [self.batch_size, 3] + self.input_size
self.device_input = cuda.mem_alloc(
trt.volume(self.shape) * trt.float32.itemsize)
self.indices = np.arange(len(self.calib_image_paths))
np.random.shuffle(self.indices)
def load_batches():
for i in range(0, len(self.calib_image_paths) - self.batch_size+1, self.batch_size):
indexs = self.indices[i:i+self.batch_size]
paths = [self.calib_image_paths[i] for i in indexs]
files = self.read_batch_file(paths)
yield files
self.batches = load_batches()
示例11: init_trt_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def init_trt_buffers(cuda, trt, engine):
"""Initialize host buffers and cuda buffers for the engine."""
assert engine[0] == 'input_1:0'
assert engine.get_binding_shape(0)[1:] == (224, 224, 3)
size = trt.volume((1, 224, 224, 3)) * engine.max_batch_size
host_input = cuda.pagelocked_empty(size, np.float32)
cuda_input = cuda.mem_alloc(host_input.nbytes)
assert engine[1] == 'Logits/Softmax:0'
assert engine.get_binding_shape(1)[1:] == (1000,)
size = trt.volume((1, 1000)) * engine.max_batch_size
host_output = cuda.pagelocked_empty(size, np.float32)
cuda_output = cuda.mem_alloc(host_output.nbytes)
return host_input, cuda_input, host_output, cuda_output
示例12: allocate_buffers
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def allocate_buffers(engine):
"""Allocates host and device buffer for TRT engine inference.
This function is similair to the one in ../../common.py, but
converts network outputs (which are np.float32) appropriately
before writing them to Python buffer. This is needed, since
TensorRT plugins doesn't support output type description, and
in our particular case, we use NMS plugin as network output.
Args:
engine (trt.ICudaEngine): TensorRT engine
Returns:
inputs [HostDeviceMem]: engine input memory
outputs [HostDeviceMem]: engine output memory
bindings [int]: buffer to device bindings
stream (cuda.Stream): cuda stream for engine inference synchronization
"""
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
# Current NMS implementation in TRT only supports DataType.FLOAT but
# it may change in the future, which could brake this sample here
# when using lower precision [e.g. NMS output would not be np.float32
# anymore, even though this is assumed in binding_to_type]
binding_to_type = {"Input": np.float32, "NMS": np.float32, "NMS_1": np.int32}
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = binding_to_type[str(binding)]
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
示例13: __init__
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def __init__(self, trt_engine_path, uff_model_path, trt_engine_datatype=trt.DataType.FLOAT, batch_size=1):
"""Initializes TensorRT objects needed for model inference.
Args:
trt_engine_path (str): path where TensorRT engine should be stored
uff_model_path (str): path of .uff model
trt_engine_datatype (trt.DataType):
requested precision of TensorRT engine used for inference
batch_size (int): batch size for which engine
should be optimized for
"""
# We first load all custom plugins shipped with TensorRT,
# some of them will be needed during inference
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
# Initialize runtime needed for loading TensorRT engine from file
self.trt_runtime = trt.Runtime(TRT_LOGGER)
# TRT engine placeholder
self.trt_engine = None
# Display requested engine settings to stdout
print("TensorRT inference engine settings:")
print(" * Inference precision - {}".format(trt_engine_datatype))
print(" * Max batch size - {}\n".format(batch_size))
# If engine is not cached, we need to build it
if not os.path.exists(trt_engine_path):
# This function uses supplied .uff file
# alongside with UffParser to build TensorRT
# engine. For more details, check implmentation
self.trt_engine = engine_utils.build_engine(
uff_model_path, TRT_LOGGER,
trt_engine_datatype=trt_engine_datatype,
batch_size=batch_size)
# Save the engine to file
engine_utils.save_engine(self.trt_engine, trt_engine_path)
# If we get here, the file with engine exists, so we can load it
if not self.trt_engine:
print("Loading cached TensorRT engine from {}".format(
trt_engine_path))
self.trt_engine = engine_utils.load_engine(
self.trt_runtime, trt_engine_path)
# This allocates memory for network inputs/outputs on both CPU and GPU
self.inputs, self.outputs, self.bindings, self.stream = \
engine_utils.allocate_buffers(self.trt_engine)
# Execution context is needed for inference
self.context = self.trt_engine.create_execution_context()
# Allocate memory for multiple usage [e.g. multiple batch inference]
input_volume = trt.volume(model_utils.ModelData.INPUT_SHAPE)
self.numpy_array = np.zeros((self.trt_engine.max_batch_size, input_volume))
示例14: run_speed_eval
# 需要導入模塊: import tensorrt [as 別名]
# 或者: from tensorrt import volume [as 別名]
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings
inputs, outputs, bindings = allocate_buffers(self.engine)
# warm run
for i in range(warm_run_loops):
[cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
# real run
logging.info('Start real run loop.')
sum_time_data_copy = 0.
sum_time_inference_only = 0.
for i in range(real_run_loops):
time_start = time.time()
[cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
sum_time_data_copy += time.time() - time_start
time_start = time.time()
self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
sum_time_inference_only += time.time() - time_start
time_start = time.time()
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
sum_time_data_copy += time.time() - time_start
logging.info('Total time (data transfer & inference) elapsed: %.02f ms. [%.02f ms] for each image (%.02f PFS)'
% ((sum_time_data_copy + sum_time_inference_only) * 1000,
(sum_time_data_copy + sum_time_inference_only) * 1000 / real_run_loops / self.max_batch_size,
real_run_loops * self.max_batch_size / (sum_time_data_copy + sum_time_inference_only)))