本文整理匯總了Python中pycuda.driver.mem_alloc方法的典型用法代碼示例。如果您正苦於以下問題：Python driver.mem_alloc方法的具體用法？Python driver.mem_alloc怎麼用？Python driver.mem_alloc使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊pycuda.driver的用法示例。
在下文中一共展示了driver.mem_alloc方法的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: infer
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass on a two-binding engine and return the result.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: NumPy array with the input data; a float32 copy of it is
            what gets uploaded.
        output_size: Shape passed to ``np.empty`` for the host result array.
        batch_size: Batch size given to ``enqueue``; also scales both
            device allocations.

    Returns:
        A float32 NumPy array created with ``output_size`` and filled from
        the device output buffer.
    """
    # Only engines with exactly two bindings are supported; they are used
    # as [input, output] in that order below.
    trt_engine = context.get_engine()
    assert trt_engine.get_nb_bindings() == 2

    host_in = input_img.astype(np.float32)
    host_out = np.empty(output_size, dtype=np.float32)

    # Device buffers are sized for a whole batch of each host array.
    in_bytes = batch_size * host_in.size * host_in.dtype.itemsize
    out_bytes = batch_size * host_out.size * host_out.dtype.itemsize
    dev_in = cuda.mem_alloc(in_bytes)
    dev_out = cuda.mem_alloc(out_bytes)
    bindings = [int(dev_in), int(dev_out)]

    # Upload, execute and download are all queued on the same stream.
    stream = cuda.Stream()
    cuda.memcpy_htod_async(dev_in, host_in, stream)
    context.enqueue(batch_size, bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(host_out, dev_out, stream)

    # Wait for the queued work so host_out is complete before returning.
    stream.synchronize()
    return host_out
示例2: allocate_buffers
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def allocate_buffers(engine):
    """Create a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: Iterable of bindings that also provides
            ``get_binding_shape``, ``get_binding_dtype``,
            ``binding_is_input`` and ``max_batch_size``.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)``: two lists of
        ``HostDeviceMem`` pairs split by binding direction, the list of
        device addresses (as ints) in iteration order, and a new CUDA stream.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        # Element count covers the largest batch the engine accepts.
        n_elems = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        host_buf = cuda.pagelocked_empty(n_elems, np_dtype)
        dev_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(dev_buf))

        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_buf, dev_buf))

    return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
示例3: setup_binding_shapes
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def setup_binding_shapes(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    host_inputs: List[np.ndarray],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
):
    """Set the input binding shapes and allocate matching output buffers.

    Args:
        engine: Not used by this function.
        context: Execution context whose binding shapes are set and queried.
        host_inputs: Host input arrays; only their ``shape`` is read.
        input_binding_idxs: Binding index for each entry of ``host_inputs``.
        output_binding_idxs: Binding indices of the outputs.

    Returns:
        ``(host_outputs, device_outputs)``: float32 host arrays shaped like
        each output binding, and device allocations of the same byte size.
    """
    # Input shapes must be set first so the context can report the
    # resulting output shapes.
    for host_arr, in_idx in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(in_idx, host_arr.shape)
    assert context.all_binding_shapes_specified

    host_outputs, device_outputs = [], []
    for out_idx in output_binding_idxs:
        # Host array that receives the result once it is copied back.
        host_buf = np.empty(context.get_binding_shape(out_idx), dtype=np.float32)
        host_outputs.append(host_buf)
        # Device-side counterpart of the same size.
        device_outputs.append(cuda.mem_alloc(host_buf.nbytes))

    return host_outputs, device_outputs
示例4: __init__
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def __init__(self, calibration_files=None, batch_size=32, input_shape=(3, 224, 224),
             cache_file="calibration.cache", preprocess_func=None):
    """Initialise calibration state: host batch, device buffer and file list.

    Args:
        calibration_files: Sequence of calibration file entries. It is
            copied, so the caller's sequence is never modified. ``None``
            (the default) means no files.
        batch_size: Number of samples per batch.
        input_shape: Shape of one sample, without the batch dimension.
        cache_file: Stored on the instance as ``self.cache_file``.
        preprocess_func: Stored on the instance as ``self.preprocess_func``;
            must not be ``None``.

    Raises:
        SystemExit: With status 1 when ``preprocess_func`` is ``None``.
    """
    super().__init__()
    self.input_shape = input_shape
    self.cache_file = cache_file
    self.batch_size = batch_size
    self.batch = np.zeros((self.batch_size, *self.input_shape), dtype=np.float32)
    self.device_input = cuda.mem_alloc(self.batch.nbytes)

    # Keep a private copy: padding below extends the list in place, which
    # previously mutated the caller's list (or a shared default argument).
    self.files = list(calibration_files) if calibration_files is not None else []

    # Pad the list so it is a multiple of batch_size
    remainder = len(self.files) % self.batch_size
    if remainder != 0:
        logger.info("Padding # calibration files to be a multiple of batch_size {:}".format(self.batch_size))
        n_files = len(self.files)
        pad_count = self.batch_size - remainder
        # With at least batch_size files this selects files[remainder:batch_size],
        # as before; the modulo makes it wrap around so that a list shorter
        # than batch_size is padded as well instead of being left unpadded.
        padding = [self.files[(remainder + i) % n_files] for i in range(pad_count)]
        self.files += padding

    self.batches = self.load_batches()

    if preprocess_func is None:
        logger.error("No preprocess_func defined! Please provide one to the constructor.")
        sys.exit(1)
    self.preprocess_func = preprocess_func
示例5: ready_argument_list
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def ready_argument_list(self, arguments):
    """Prepare kernel arguments, copying every numpy array to the GPU.

    Each ``numpy.ndarray`` is replaced by a freshly allocated device buffer
    holding a copy of its data; the buffer is also recorded in
    ``self.allocations``. Any other argument is passed through unchanged.

    :param arguments: Arguments for the kernel, in the order of the CUDA
        kernel's argument list. Arrays are ``numpy.ndarray``; other values
        are expected to be numpy scalars such as ``numpy.int32``.
    :type arguments: list(numpy objects)
    :returns: Arguments that can be passed to a CUDA kernel.
    :rtype: list( pycuda.driver.DeviceAllocation, numpy.int32, ... )
    """
    prepared = []
    for value in arguments:
        if not isinstance(value, numpy.ndarray):
            # Non-array values need no device memory.
            prepared.append(value)
            continue
        device_buf = drv.mem_alloc(value.nbytes)
        self.allocations.append(device_buf)
        prepared.append(device_buf)
        drv.memcpy_htod(device_buf, value)
    return prepared
示例6: create_memory
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def create_memory(engine, name, buf, mem, batchsize, inp, inp_idx):
    """Allocate and fill the device buffer for the binding called ``name``.

    Args:
        engine: Engine queried for the binding index, dimensions and direction.
        name: Binding name to look up.
        buf: List of device addresses; the new address is inserted at the
            binding index.
        mem: List to which the device allocation object is appended.
        batchsize: Multiplier applied to the C*H*W element count.
        inp: Sequence of host input arrays.
        inp_idx: Index into ``inp`` of the next unused input.

    Returns:
        ``inp_idx + 1`` if the binding is an input, otherwise ``inp_idx``.

    Raises:
        AttributeError: If the engine reports index -1 for ``name``.
    """
    binding_idx = engine.get_binding_index(name)
    if binding_idx == -1:
        raise AttributeError("Not a valid binding")
    print("Binding: name={}, bindingIndex={}".format(name, str(binding_idx)))

    chw = engine.get_binding_dimensions(binding_idx).to_DimsCHW()
    elt_count = chw.C() * chw.H() * chw.W() * batchsize

    if engine.binding_is_input(binding_idx):
        # Inputs take the next host array supplied by the caller.
        host_data = inp[inp_idx]
        next_inp_idx = inp_idx + 1
    else:
        # Other bindings are seeded with random float32 values.
        host_data = np.random.uniform(0.0, 255.0, elt_count).astype(np.dtype('f4'))
        next_inp_idx = inp_idx

    # 4 bytes per element, matching the 'f4' dtype used above.
    device_data = cuda.mem_alloc(elt_count * 4)
    cuda.memcpy_htod(device_data, host_data)
    buf.insert(binding_idx, int(device_data))
    mem.append(device_data)
    return next_inp_idx
#Run inference on device
示例7: __allocate_buffers
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def __allocate_buffers(self, engine):
    """Create a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: Iterable of bindings that also provides
            ``get_binding_shape``, ``get_binding_dtype``,
            ``binding_is_input`` and ``max_batch_size``.

    Returns:
        ``(inputs, outputs, bindings)``: two lists of ``HostDeviceMem``
        pairs split by binding direction, and the device addresses (as
        ints) in iteration order.
    """
    inputs, outputs, bindings = [], [], []
    # Selects the destination list from the binding's direction.
    by_direction = {True: inputs, False: outputs}

    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        element_count = trt.volume(binding_shape) * engine.max_batch_size
        element_type = trt.nptype(engine.get_binding_dtype(binding))

        host_side = cuda.pagelocked_empty(element_count, element_type)
        device_side = cuda.mem_alloc(host_side.nbytes)
        bindings.append(int(device_side))

        is_input = bool(engine.binding_is_input(binding))
        by_direction[is_input].append(HostDeviceMem(host_side, device_side))

    return inputs, outputs, bindings
示例8: allocate_buffers
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def allocate_buffers(engine):
    """Allocate the host and device buffers for each binding of an engine.

    Args:
        engine: Iterable of bindings that also provides
            ``get_binding_shape``, ``get_binding_dtype``,
            ``binding_is_input`` and ``max_batch_size``.

    Returns:
        ``(inputs, outputs, bindings, stream)``: ``HostDeviceMem`` pairs for
        input and output bindings, the device addresses as ints, and a new
        CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    for binding in engine:
        per_sample = trt.volume(engine.get_binding_shape(binding))
        total_elems = per_sample * engine.max_batch_size
        elem_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Host buffer first, then a device buffer of the same byte size.
        h_buf = cuda.pagelocked_empty(total_elems, elem_dtype)
        d_buf = cuda.mem_alloc(h_buf.nbytes)
        bindings.append(int(d_buf))

        if not engine.binding_is_input(binding):
            outputs.append(HostDeviceMem(h_buf, d_buf))
            continue
        inputs.append(HostDeviceMem(h_buf, d_buf))

    return inputs, outputs, bindings, stream
示例9: allocate_buffers
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def allocate_buffers(engine):
    """Allocate shaped host buffers and device buffers for every binding.

    Each host buffer is reshaped to ``[max_batch_size] + binding shape``.
    Every ``HostDeviceMem`` also receives the binding and its position in
    the engine's iteration order.

    Args:
        engine: Iterable of bindings that also provides
            ``get_binding_shape``, ``get_binding_dtype``,
            ``binding_is_input`` and ``max_batch_size``.

    Returns:
        ``(inputs, outputs, bindings)``: ``HostDeviceMem`` entries split by
        binding direction, and the device addresses as ints.
    """
    inputs, outputs, bindings = [], [], []

    for position, binding in enumerate(engine):
        flat_len = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_type = trt.nptype(engine.get_binding_dtype(binding))
        batched_shape = [engine.max_batch_size] + list(engine.get_binding_shape(binding))

        host_arr = cuda.pagelocked_empty(flat_len, np_type).reshape(batched_shape)
        dev_alloc = cuda.mem_alloc(host_arr.nbytes)
        bindings.append(int(dev_alloc))

        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_arr, dev_alloc, binding, position))

    return inputs, outputs, bindings
示例10: _set_rand_state_dev
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def _set_rand_state_dev(self, state=None):
    """
    Copy RNG state to the device buffer belonging to the current context.

    A device buffer is allocated the first time a context is seen and is
    reused for that context afterwards.

    Arguments:
        state (np.array or None): an array of uint32 values used to set the
                                  state of the on device LFSRs. If None,
                                  a state is generated with
                                  ``self._gen_dev_randstate()``.
    """
    ctx = drv.Context.get_current()
    if state is None:
        state = self._gen_dev_randstate()

    state_map = self.context_rand_state_map
    if ctx not in state_map:
        # First use of this context: allocate and remember its buffer.
        dev_state = drv.mem_alloc(state.nbytes)
        state_map[ctx] = dev_state
    else:
        dev_state = state_map[ctx]

    drv.memcpy_htod(dev_state, state)
    self.context_rand_state_alive[ctx] = True
示例11: empty_like
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def empty_like(self, other_ary, name=None):
    """
    Instantiate a new GPUTensor with the shape and dtype of other_ary.

    Arguments:
        other_ary (tensor object or numpy array): Object whose ``shape`` and
                                                  ``dtype`` are reused.
        name (str, optional): Passed through to the new tensor as its name.

    Returns:
        GPUTensor: the newly constructed tensor
    """
    shape = other_ary.shape
    # A numpy array has neither persist_values nor allocator, so fall back
    # to the defaults (True and drv.mem_alloc) in that case.
    tensor_kwargs = dict(
        dtype=other_ary.dtype,
        name=name,
        persist_values=getattr(other_ary, 'persist_values', True),
        allocator=getattr(other_ary, 'allocator', drv.mem_alloc),
        rounding=self.round_mode,
    )
    return GPUTensor(self, shape, **tensor_kwargs)
示例12: time_inference
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def time_inference(engine, batch_size):
    """Execute the engine repeatedly on a zeroed input with a profiler attached.

    Device buffers are sized as ``batch_size * C * H * W * 4`` bytes for the
    first entries of ``INPUT_LAYERS`` and ``OUTPUT_LAYERS``. The engine is
    executed ``TIMING_INTERATIONS`` times with ``G_PROFILER`` set on the
    context. Nothing is copied back to the host.

    Args:
        engine: Engine with exactly two bindings.
        batch_size: Batch size passed to ``execute`` and used for sizing.

    Returns:
        None.
    """
    assert engine.get_nb_bindings() == 2

    input_index = engine.get_binding_index(INPUT_LAYERS[0])
    output_index = engine.get_binding_index(OUTPUT_LAYERS[0])
    input_dim = engine.get_binding_dimensions(input_index).to_DimsCHW()
    output_dim = engine.get_binding_dimensions(output_index).to_DimsCHW()

    # 4 bytes per element (32-bit values).
    insize = batch_size * input_dim.C() * input_dim.H() * input_dim.W() * 4
    outsize = batch_size * output_dim.C() * output_dim.H() * output_dim.W() * 4
    d_input = cuda.mem_alloc(insize)
    d_output = cuda.mem_alloc(outsize)

    # Place each buffer at its own binding index instead of assuming the
    # input is binding 0 and the output is binding 1.
    bindings = [None, None]
    bindings[input_index] = int(d_input)
    bindings[output_index] = int(d_output)

    context = engine.create_execution_context()
    try:
        context.set_profiler(G_PROFILER)
        # memset_d32 counts 32-bit words, hence the division by 4.
        cuda.memset_d32(d_input, 0, insize // 4)
        for _ in range(TIMING_INTERATIONS):
            context.execute(batch_size, bindings)
    finally:
        # Release the context even if profiling or execution fails.
        context.destroy()
示例13: infer
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def infer(context, input_img, output_size, batch_size):
    """Execute the context's engine on ``input_img`` and return the output.

    Args:
        context: Execution context providing ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: NumPy array with the input; converted to float32 before
            upload.
        output_size: Shape passed to ``np.empty`` for the result array.
        batch_size: Batch size for ``enqueue``; also multiplies the size of
            both device buffers.

    Returns:
        Float32 NumPy array holding the data copied back from the device.
    """
    engine = context.get_engine()
    # The bindings list below has room for exactly two entries.
    assert engine.get_nb_bindings() == 2

    img_f32 = input_img.astype(np.float32)
    result = np.empty(output_size, dtype=np.float32)

    # nbytes equals size * itemsize, so this is one batch worth of bytes.
    gpu_in = cuda.mem_alloc(batch_size * img_f32.nbytes)
    gpu_out = cuda.mem_alloc(batch_size * result.nbytes)
    stream = cuda.Stream()

    cuda.memcpy_htod_async(gpu_in, img_f32, stream)
    context.enqueue(batch_size, [int(gpu_in), int(gpu_out)], stream.handle, None)
    cuda.memcpy_dtoh_async(result, gpu_out, stream)

    # Block until the stream has drained so result is fully written.
    stream.synchronize()
    return result
示例14: infer
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def infer(engine, input_img, batch_size):
    """Run one inference pass on a two-binding engine and return the output.

    Args:
        engine: Engine with exactly two bindings; binding 1 is used as the
            output when sizing the result buffer.
        input_img: NumPy array uploaded as-is (no dtype conversion is done).
        batch_size: Batch size passed to ``enqueue`` and used for sizing.

    Returns:
        A flat float32 array with ``C * H * W * batch_size`` elements,
        allocated with ``cuda.pagelocked_empty`` and filled from the device
        output buffer.
    """
    context = engine.create_execution_context()
    assert engine.get_nb_bindings() == 2

    # Size the host result buffer from the dimensions of binding 1.
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    # Device buffers are sized for a whole batch of each host array.
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()
    cuda.memcpy_htod_async(d_input, input_img, stream)
    context.enqueue(batch_size, bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The copy back is asynchronous: wait for the stream to finish,
    # otherwise the caller could read output before it has been written.
    stream.synchronize()
    return output
示例15: infer
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import mem_alloc [as 別名]
def infer(context, input_img, batch_size):
    """Run one inference pass through ``context`` and return the output.

    Args:
        context: Execution context providing ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``. Its
            engine must have exactly two bindings; binding 1 is used as the
            output when sizing the result buffer.
        input_img: NumPy array with the input; converted to float32 before
            upload.
        batch_size: Batch size passed to ``enqueue`` and used for sizing.

    Returns:
        A flat float32 array with ``C * H * W * batch_size`` elements,
        allocated with ``cuda.pagelocked_empty`` and filled from the device
        output buffer.
    """
    engine = context.get_engine()
    assert engine.get_nb_bindings() == 2

    # Size the host result buffer from the dimensions of binding 1.
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size

    input_img = input_img.astype(np.float32)
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    # Device buffers are sized for a whole batch of each host array.
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()
    cuda.memcpy_htod_async(d_input, input_img, stream)
    context.enqueue(batch_size, bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The copy back is asynchronous: wait for the stream to finish,
    # otherwise the caller could read output before it has been written.
    stream.synchronize()
    return output