This article collects typical usage examples of the Python method tensorrt.volume. If you are unsure what tensorrt.volume does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples from the tensorrt module itself.
The following presents 14 code examples of the tensorrt.volume method, ordered by popularity by default.
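Before the examples, a minimal sketch of what trt.volume itself does: it returns the number of elements implied by a shape (the product of its dimensions), which is why the examples below multiply it by max_batch_size to size their host and device buffers. The shape used here is an illustrative assumption, not taken from any example:

import tensorrt as trt

# Product of all dimensions in the shape: 1 * 3 * 224 * 224 = 150528 elements.
n_elements = trt.volume((1, 3, 224, 224))
# A buffer size in bytes is then n_elements * itemsize of the binding dtype,
# e.g. for an FP32 binding:
n_bytes = n_elements * trt.float32.itemsize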
Example 1: __allocate_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def __allocate_buffers(self, engine):
    inputs = []
    outputs = []
    bindings = []
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
Example 2: allocate_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def allocate_buffers(engine):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
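The buffers returned by this pattern are typically consumed by a companion inference helper. A minimal sketch, assuming the classic pre-TensorRT-8 batch execution API; the helper name do_inference and the context argument (an execution context created from the same engine) are assumptions, not part of the example above:

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU asynchronously on the stream.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference; bindings are the device pointers collected above.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back to the page-locked host buffers.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream before reading the results.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]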
Example 3: allocate_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
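HostDeviceMem is used throughout these examples but never defined in the snippets; it originates in the TensorRT samples' common.py. A minimal sketch of such a helper, assuming the two-argument form (the extra binding-name and index fields seen in Examples 4 and 5 would be optional additions):

class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its matching device allocation."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __repr__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)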
Example 4: allocate_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    index = 0
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        shape = [engine.max_batch_size] + list(engine.get_binding_shape(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype).reshape(shape)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
        index += 1
    return inputs, outputs, bindings
Example 5: allocate_buffers_torch
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def allocate_buffers_torch(engine: trt.ICudaEngine, device):
    import torch
    inputs = []
    outputs = []
    bindings = []
    index = 0
    dtype_map = np_to_torch_dtype_map()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        shape = [engine.max_batch_size] + list(engine.get_binding_shape(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype).reshape(shape)
        device_mem = torch.empty(*host_mem.shape, device=device, dtype=dtype_map[host_mem.dtype])
        # Append the device buffer to device bindings.
        bindings.append(device_mem.data_ptr())
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
        index += 1
    return inputs, outputs, bindings
Example 6: allocate_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def allocate_buffers(engine):
    # Determine dimensions and create page-locked memory buffers
    # (i.e. won't be swapped to disk) to hold host inputs/outputs.
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(ModelData.DTYPE))
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(ModelData.DTYPE))
    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream
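A hedged usage sketch for the single-input/single-output buffers returned above, to contrast with the list-based helper after Example 2; the context argument (an execution context from the same engine) and the name do_inference_single are assumptions, not part of the example:

def do_inference_single(context, h_input, d_input, h_output, d_output, stream):
    # Copy the preprocessed input from the page-locked host buffer to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Execute the engine; bindings are device pointers in binding-index order.
    context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    # Copy the result back and wait for the stream to finish.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()
    return h_output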
Example 7: _create_context
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def _create_context(self):
    for binding in self.engine:
        size = trt.volume(self.engine.get_binding_shape(binding)) * self.engine.max_batch_size
        host_mem = cuda.pagelocked_empty(size, np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        self.bindings.append(int(cuda_mem))
        if self.engine.binding_is_input(binding):
            self.host_inputs.append(host_mem)
            self.cuda_inputs.append(cuda_mem)
        else:
            self.host_outputs.append(host_mem)
            self.cuda_outputs.append(cuda_mem)
    return self.engine.create_execution_context()
Example 8: allocate_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def allocate_buffers(engine):
    # Page-locked host buffers sized from the input/output binding shapes.
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(DTYPE))
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(DTYPE))
    # Matching device allocations for the input and output.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    return h_input, d_input, h_output, d_output
Example 9: alloc_buf
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def alloc_buf(engine):
    # host cpu mem
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
    # allocate gpu mem
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream
Example 10: __init__
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def __init__(self, model_path, calibration_files, batch_size, h, w, means, stds):
    trt.IInt8EntropyCalibrator2.__init__(self)
    self.batch_size = batch_size
    self.h = h
    self.w = w
    self.input_size = [h, w]
    self.means = np.array(means, dtype=np.float32)
    self.stds = np.array(stds, dtype=np.float32)
    assert isinstance(calibration_files, list)
    self.calib_image_paths = calibration_files
    assert os.path.exists(model_path)
    self.cache_file = os.path.join(model_path, "model.trt.int8calib")
    self.shape = [self.batch_size, 3] + self.input_size
    # Device buffer for one calibration batch: element count * FP32 item size.
    self.device_input = cuda.mem_alloc(
        trt.volume(self.shape) * trt.float32.itemsize)
    self.indices = np.arange(len(self.calib_image_paths))
    np.random.shuffle(self.indices)

    def load_batches():
        for i in range(0, len(self.calib_image_paths) - self.batch_size + 1, self.batch_size):
            indexs = self.indices[i:i + self.batch_size]
            paths = [self.calib_image_paths[idx] for idx in indexs]
            files = self.read_batch_file(paths)
            yield files

    self.batches = load_batches()
Example 11: init_trt_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def init_trt_buffers(cuda, trt, engine):
    """Initialize host buffers and cuda buffers for the engine."""
    assert engine[0] == 'input_1:0'
    assert engine.get_binding_shape(0)[1:] == (224, 224, 3)
    size = trt.volume((1, 224, 224, 3)) * engine.max_batch_size
    host_input = cuda.pagelocked_empty(size, np.float32)
    cuda_input = cuda.mem_alloc(host_input.nbytes)
    assert engine[1] == 'Logits/Softmax:0'
    assert engine.get_binding_shape(1)[1:] == (1000,)
    size = trt.volume((1, 1000)) * engine.max_batch_size
    host_output = cuda.pagelocked_empty(size, np.float32)
    cuda_output = cuda.mem_alloc(host_output.nbytes)
    return host_input, cuda_input, host_output, cuda_output
Example 12: allocate_buffers
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def allocate_buffers(engine):
    """Allocates host and device buffers for TRT engine inference.

    This function is similar to the one in ../../common.py, but
    converts network outputs (which are np.float32) appropriately
    before writing them to the Python buffer. This is needed, since
    TensorRT plugins don't support output type description, and
    in our particular case we use the NMS plugin as the network output.

    Args:
        engine (trt.ICudaEngine): TensorRT engine

    Returns:
        inputs [HostDeviceMem]: engine input memory
        outputs [HostDeviceMem]: engine output memory
        bindings [int]: buffer to device bindings
        stream (cuda.Stream): cuda stream for engine inference synchronization
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    # The current NMS implementation in TRT only supports DataType.FLOAT, but
    # it may change in the future, which could break this sample here
    # when using lower precision [e.g. the NMS output would not be np.float32
    # anymore, even though this is assumed in binding_to_type].
    binding_to_type = {"Input": np.float32, "NMS": np.float32, "NMS_1": np.int32}
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = binding_to_type[str(binding)]
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
Example 13: __init__
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def __init__(self, trt_engine_path, uff_model_path, trt_engine_datatype=trt.DataType.FLOAT, batch_size=1):
    """Initializes TensorRT objects needed for model inference.

    Args:
        trt_engine_path (str): path where the TensorRT engine should be stored
        uff_model_path (str): path of the .uff model
        trt_engine_datatype (trt.DataType):
            requested precision of the TensorRT engine used for inference
        batch_size (int): batch size for which the engine should be optimized
    """
    # We first load all custom plugins shipped with TensorRT;
    # some of them will be needed during inference.
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    # Initialize the runtime needed for loading a TensorRT engine from file.
    self.trt_runtime = trt.Runtime(TRT_LOGGER)
    # TRT engine placeholder
    self.trt_engine = None
    # Display requested engine settings on stdout.
    print("TensorRT inference engine settings:")
    print(" * Inference precision - {}".format(trt_engine_datatype))
    print(" * Max batch size - {}\n".format(batch_size))
    # If the engine is not cached, we need to build it.
    if not os.path.exists(trt_engine_path):
        # This function uses the supplied .uff file along with the UffParser
        # to build the TensorRT engine. For more details, check the implementation.
        self.trt_engine = engine_utils.build_engine(
            uff_model_path, TRT_LOGGER,
            trt_engine_datatype=trt_engine_datatype,
            batch_size=batch_size)
        # Save the engine to file.
        engine_utils.save_engine(self.trt_engine, trt_engine_path)
    # If the engine was not built above, the cached file exists, so load it.
    if not self.trt_engine:
        print("Loading cached TensorRT engine from {}".format(trt_engine_path))
        self.trt_engine = engine_utils.load_engine(
            self.trt_runtime, trt_engine_path)
    # This allocates memory for network inputs/outputs on both CPU and GPU.
    self.inputs, self.outputs, self.bindings, self.stream = \
        engine_utils.allocate_buffers(self.trt_engine)
    # An execution context is needed for inference.
    self.context = self.trt_engine.create_execution_context()
    # Allocate memory for multiple usage [e.g. multiple batch inference].
    input_volume = trt.volume(model_utils.ModelData.INPUT_SHAPE)
    self.numpy_array = np.zeros((self.trt_engine.max_batch_size, input_volume))
Example 14: run_speed_eval
# Required import: import tensorrt as trt
# Alternatively: from tensorrt import volume
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):

    def allocate_buffers(engine):
        inputs = []
        outputs = []
        bindings = []
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings

    inputs, outputs, bindings = allocate_buffers(self.engine)
    # warm-up run
    for i in range(warm_run_loops):
        [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
        self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
        [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
    # real run
    logging.info('Start real run loop.')
    sum_time_data_copy = 0.
    sum_time_inference_only = 0.
    for i in range(real_run_loops):
        time_start = time.time()
        [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
        sum_time_data_copy += time.time() - time_start
        time_start = time.time()
        self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
        sum_time_inference_only += time.time() - time_start
        time_start = time.time()
        [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
        sum_time_data_copy += time.time() - time_start
    logging.info('Total time (data transfer & inference) elapsed: %.02f ms. [%.02f ms] for each image (%.02f FPS)'
                 % ((sum_time_data_copy + sum_time_inference_only) * 1000,
                    (sum_time_data_copy + sum_time_inference_only) * 1000 / real_run_loops / self.max_batch_size,
                    real_run_loops * self.max_batch_size / (sum_time_data_copy + sum_time_inference_only)))