本文整理匯總了Python中pycuda.driver.memcpy_htod方法的典型用法代碼示例。如果您正苦於以下問題:Python driver.memcpy_htod方法的具體用法?Python driver.memcpy_htod怎麼用?Python driver.memcpy_htod使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊pycuda.driver的用法示例。
在下文中一共展示了driver.memcpy_htod方法的14個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: ready_argument_list
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def ready_argument_list(self, arguments):
    """Build the argument list for a kernel launch, uploading arrays to the GPU.

    Each ``numpy.ndarray`` in ``arguments`` is given a new device allocation
    of ``nbytes`` bytes, the allocation is recorded in ``self.allocations``,
    and the array contents are copied into it. Every other value is forwarded
    unchanged.

    :param arguments: List of arguments to be passed to the kernel.
        The order should match the argument list on the CUDA kernel.
        Allowed values are numpy.ndarray, and/or numpy.int32,
        numpy.float32, and so on.
    :type arguments: list(numpy objects)

    :returns: A new list, in the same order, that can be passed to a CUDA
        kernel.
    :rtype: list( pycuda.driver.DeviceAllocation, numpy.int32, ... )
    """
    prepared = []
    for host_value in arguments:
        if not isinstance(host_value, numpy.ndarray):
            # Non-array values need no device memory; pass them through.
            prepared.append(host_value)
            continue
        device_buffer = drv.mem_alloc(host_value.nbytes)
        self.allocations.append(device_buffer)
        prepared.append(device_buffer)
        drv.memcpy_htod(device_buffer, host_value)
    return prepared
示例2: copy_constant_memory_args
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def copy_constant_memory_args(self, cmem_args):
    """Copy host values into named global symbols of ``self.current_module``.

    For every ``name -> value`` pair, the symbol is looked up with
    ``self.current_module.get_global(name)`` and the value is copied to the
    first element of the lookup result. Details of each value are written to
    the debug log before it is copied.

    :param cmem_args: A dictionary containing the data to be passed to the
        device constant memory. Each string key names the constant memory
        symbol to which the value needs to be copied. Similar to regular
        arguments, the values need to be numpy objects, such as
        numpy.ndarray or numpy.int32, and so on.
    :type cmem_args: dict( string: numpy.ndarray, ... )
    """
    logging.debug('copy_constant_memory_args called')
    module = self.current_module
    logging.debug('current module: ' + str(module))
    for symbol_name, host_data in cmem_args.items():
        # Only the first element of get_global()'s result is the copy target.
        symbol = module.get_global(symbol_name)[0]
        logging.debug('copying to symbol: ' + str(symbol))
        logging.debug('array to be copied: ')
        for attribute in ('nbytes', 'dtype', 'flags'):
            logging.debug(getattr(host_data, attribute))
        drv.memcpy_htod(symbol, host_data)
示例3: do_inference
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def do_inference(context, h_input, d_input, h_output, d_output, iterations=None):
    """Time repeated synchronous inference and return the mean latency in ms.

    The input is copied to the device once, ten untimed warm-up executions
    are run, and then ``iterations`` executions are timed. When
    ``iterations`` is ``None`` the count is calibrated first: the loop size is
    doubled (starting at 100) until one round takes at least one second, and
    the measured throughput is used to pick a count that should take roughly
    three seconds.

    Args:
        context: Execution context; ``context.execute(batch_size=1,
            bindings=[...])`` is called for every inference.
        h_input: Host buffer copied to ``d_input`` before any execution.
        d_input: Device input allocation (converted with ``int()`` for the
            bindings list).
        h_output: Unused by this function; kept for interface compatibility.
        d_output: Device output allocation (converted with ``int()``).
        iterations: Number of timed executions, or ``None`` to calibrate.

    Returns:
        Mean wall-clock time per execution, in milliseconds.

    Raises:
        ZeroDivisionError: If the caller passes ``iterations=0``.
    """
    # Transfer input data to the GPU.
    cuda.memcpy_htod(d_input, h_input)

    bindings = [int(d_input), int(d_output)]

    # Warm-up.
    for _ in range(10):
        context.execute(batch_size=1, bindings=bindings)

    # Calibrate the iteration count when the caller did not fix one.
    if iterations is None:
        iterations = 100
        while True:
            t_start = time.time()
            for _ in range(iterations):
                context.execute(batch_size=1, bindings=bindings)
            elapsed_time = time.time() - t_start
            if elapsed_time >= 1:
                break
            # Double only when another round follows, so that `iterations`
            # always equals the count `elapsed_time` was measured over.
            iterations *= 2
        fps = iterations / elapsed_time
        # Never fall to zero iterations, which would divide by zero below.
        iterations = max(1, int(fps * 3))

    # Run the timed inference loop.
    t_start = time.time()
    for _ in tqdm(range(iterations)):
        context.execute(batch_size=1, bindings=bindings)
    elapsed_time = time.time() - t_start
    latency = elapsed_time / iterations * 1000
    return latency
示例4: create_memory
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def create_memory(engine, name, buf, mem, batchsize, inp, inp_idx):
    """Allocate and fill the device buffer for one engine binding.

    Input bindings are filled from ``inp[inp_idx]``; all other bindings are
    filled with random float32 values in ``[0, 255)``. The device pointer is
    inserted into ``buf`` at the binding's index and the allocation object is
    appended to ``mem``.

    NOTE(review): the device buffer is always ``element count * 4`` bytes;
    the size of ``inp[inp_idx]`` is not checked against it - confirm callers
    pass matching 4-byte-element data.

    Args:
        engine: Engine queried for the binding index, dimensions and
            direction.
        name: Binding name to look up.
        buf: List receiving ``int(device allocation)`` at the binding index.
        mem: List receiving the device allocation object.
        batchsize: Multiplier applied to the C*H*W element count.
        inp: Sequence of host input arrays.
        inp_idx: Index into ``inp`` of the next unused input.

    Returns:
        ``inp_idx + 1`` if the binding is an input, otherwise ``inp_idx``.

    Raises:
        AttributeError: If ``name`` is not a binding of ``engine``.
    """
    binding_idx = engine.get_binding_index(name)
    if binding_idx == -1:
        raise AttributeError("Not a valid binding")
    print("Binding: name={}, bindingIndex={}".format(name, str(binding_idx)))

    chw = engine.get_binding_dimensions(binding_idx).to_DimsCHW()
    element_count = chw.C() * chw.H() * chw.W() * batchsize

    next_idx = inp_idx
    if engine.binding_is_input(binding_idx):
        host_data = inp[inp_idx]
        next_idx = inp_idx + 1
    else:
        random_values = np.random.uniform(0.0, 255.0, element_count)
        host_data = random_values.astype(np.dtype('f4'))

    device_buffer = cuda.mem_alloc(element_count * 4)
    cuda.memcpy_htod(device_buffer, host_data)
    buf.insert(binding_idx, int(device_buffer))
    mem.append(device_buffer)
    return next_idx
#Run inference on device
示例5: _set_rand_state_dev
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def _set_rand_state_dev(self, state=None):
    """
    Copy RNG state values to the device buffer of the current context.

    A device buffer is allocated the first time a context is seen and is
    stored in ``self.context_rand_state_map``; later calls for the same
    context reuse it. After the copy the context is flagged in
    ``self.context_rand_state_alive``.

    Arguments:
        state (np.array or None): an array of uint32 values used to
                                  set the state of the on device LFSRs.
                                  If None, the state is obtained from
                                  ``self._gen_dev_randstate()``.
    """
    ctx = drv.Context.get_current()
    if state is None:
        state = self._gen_dev_randstate()

    state_map = self.context_rand_state_map
    if ctx not in state_map:
        # First use in this context: allocate a buffer sized for `state`.
        state_map[ctx] = drv.mem_alloc(state.nbytes)
    rand_state = state_map[ctx]

    drv.memcpy_htod(rand_state, state)
    self.context_rand_state_alive[ctx] = True
示例6: get_batch
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, names):
    """Upload the next slice of ``self.data`` and return the device buffer.

    Args:
        names: Unused by this implementation.

    Returns:
        ``[self.device_input]`` after copying the flattened batch to it, or
        ``None`` when fewer than ``self.batch_size`` rows remain.
    """
    start = self.current_index
    stop = start + self.batch_size
    if stop > self.data.shape[0]:
        return None

    current_batch = int(start / self.batch_size)
    # Report progress on every tenth batch.
    if current_batch % 10 == 0:
        print("Calibrating batch {:}, containing {:} images".format(current_batch, self.batch_size))

    batch = self.data[start:stop].ravel()
    cuda.memcpy_htod(self.device_input, batch)
    self.current_index = stop
    return [self.device_input]
示例7: get_batch
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, names):
    """Copy the next batch from ``self.batches`` to ``self.device_input``.

    ``self.batches`` is assumed to be an iterator that provides batch data
    and ``self.device_input`` a device buffer allocated by the constructor.

    Args:
        names: Unused by this implementation.

    Returns:
        A one-element list holding ``int(self.device_input)``, or ``None``
        once ``self.batches`` is exhausted. Returning ``None`` (or ``[]``)
        signals to TensorRT that no calibration data remains.
    """
    try:
        batch = next(self.batches)
    except StopIteration:
        return None
    cuda.memcpy_htod(self.device_input, batch)
    return [int(self.device_input)]
示例8: memcpy_htod
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def memcpy_htod(self, dest, src):
    """Perform a host to device memory copy.

    The copy is only performed when ``dest`` is a
    ``pycuda.driver.DeviceAllocation``. For any other ``dest`` the call is a
    no-op: nothing is copied and ``dest`` is left untouched.

    :param dest: A GPU memory allocation unit
    :type dest: pycuda.driver.DeviceAllocation

    :param src: A numpy array in host memory holding the data to copy
    :type src: numpy.ndarray
    """
    if isinstance(dest, drv.DeviceAllocation):
        drv.memcpy_htod(dest, src)
    # No else branch: rebinding the local name ``dest`` to ``src`` would be
    # invisible to the caller, so non-device destinations are simply skipped.
示例9: get_batch
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, bindings, names):
    """Upload the next batch from ``self.stream`` and update ``bindings``.

    Args:
        bindings: List whose first element is overwritten with
            ``int(self.d_input)``.
        names: Sequence of names; ``names[0]`` is checked against
            ``self.input_layers[0]``.

    Returns:
        The updated ``bindings`` list, or ``None`` when the stream yields an
        empty batch.
    """
    batch = self.stream.next_batch()
    if not batch.size:
        return None

    cuda.memcpy_htod(self.d_input, batch)

    requested_name = names[0]
    for entry in self.input_layers[0]:
        assert requested_name != entry

    bindings[0] = int(self.d_input)
    return bindings
示例10: inference
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    """Run one blocking inference pass and return the host output buffer.

    Copies ``inputs`` to ``in_gpu``, calls ``context.execute(1, [...])`` with
    the two device pointers, then copies ``out_gpu`` back into ``out_cpu``.

    ``engine`` and ``stream`` are unused on this synchronous path. An
    asynchronous variant would use ``cuda.memcpy_htod_async``,
    ``context.execute_async`` and ``cuda.memcpy_dtoh_async`` on ``stream``,
    followed by ``stream.synchronize()``. Creating a new execution context
    per call is avoided because initialising it costs time.

    Returns:
        ``out_cpu``, filled with the data copied from ``out_gpu``.
    """
    cuda.memcpy_htod(in_gpu, inputs)
    device_bindings = [int(in_gpu), int(out_gpu)]
    context.execute(1, device_bindings)
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
示例11: get_batch
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, bindings, names):
    """Copy the next item of ``self.batches`` to ``self.device_input``.

    Args:
        bindings: Unused by this implementation.
        names: Unused by this implementation.

    Returns:
        A one-element list holding ``int(self.device_input)``, or ``None``
        once ``self.batches`` is exhausted.
    """
    exhausted = object()
    data = next(self.batches, exhausted)
    if data is exhausted:
        return None
    cuda.memcpy_htod(self.device_input, data)
    return [int(self.device_input)]
示例12: copy_htod
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def copy_htod(self, np_buffer, stream=None):
    """Copy a NumPy buffer from the host to ``self.ptr`` on the device.

    The buffer is flattened and made contiguous first.

    Args:
        np_buffer: Host array to copy.
        stream: If truthy, the copy is issued with ``memcpy_htod_async`` on
            this stream; otherwise a blocking ``memcpy_htod`` is used.
    """
    host_array = np.ascontiguousarray(np_buffer.ravel())
    if not stream:
        cuda.memcpy_htod(self.ptr, host_array)
        return
    # PyCUDA requires the host buffer to be pagelocked for asynchronous memcpys.
    pagelocked = cuda.register_host_memory(host_array)
    cuda.memcpy_htod_async(self.ptr, pagelocked, stream)
示例13: execute
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def execute(self, batch_size):
    """Run one synchronous inference and return the host outputs by name.

    Args:
        batch_size: Number of leading entries of each host buffer to copy
            and return; also passed to ``self.context.execute``.

    Returns:
        Dict mapping each key of ``self.output_dict`` to the first
        ``batch_size`` entries of that output's host buffer.
    """
    # Upload every input whose `device_input` flag is exactly False.
    for inp in self.inputs:
        if inp.device_input is False:
            cuda.memcpy_htod(inp.device, inp.host[:batch_size])

    self.context.execute(batch_size=batch_size, bindings=self.bindings)

    # Download all outputs into their host buffers.
    for out in self.outputs:
        cuda.memcpy_dtoh(out.host[:batch_size], out.device)

    results = {}
    for name, buffers in self.output_dict.items():
        results[name] = buffers.host[:batch_size]
    return results
示例14: run_speed_eval
# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):
    """Benchmark the engine and log total time, per-image time and FPS.

    Host and device buffers are allocated for every binding of
    ``self.engine``. After ``warm_run_loops`` untimed rounds,
    ``real_run_loops`` rounds are timed, accumulating host<->device copy time
    and ``self.executor.execute`` time separately. The combined figures are
    written with ``logging.info``; if nothing measurable was run, a warning
    is logged instead.

    Args:
        warm_run_loops: Number of untimed warm-up rounds.
        real_run_loops: Number of timed rounds.
    """
    def allocate_buffers(engine):
        """Return (inputs, outputs, bindings) with one buffer pair per binding."""
        inputs = []
        outputs = []
        bindings = []
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            pair = HostDeviceMem(host_mem, device_mem)
            if engine.binding_is_input(binding):
                inputs.append(pair)
            else:
                outputs.append(pair)
        return inputs, outputs, bindings

    inputs, outputs, bindings = allocate_buffers(self.engine)

    # Warm run (untimed).
    for _ in range(warm_run_loops):
        for inp in inputs:
            cuda.memcpy_htod(inp.device, inp.host)
        self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
        for out in outputs:
            cuda.memcpy_dtoh(out.host, out.device)

    # Real run (timed).
    logging.info('Start real run loop.')
    sum_time_data_copy = 0.
    sum_time_inference_only = 0.
    for _ in range(real_run_loops):
        time_start = time.time()
        for inp in inputs:
            cuda.memcpy_htod(inp.device, inp.host)
        sum_time_data_copy += time.time() - time_start

        time_start = time.time()
        self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
        sum_time_inference_only += time.time() - time_start

        time_start = time.time()
        for out in outputs:
            cuda.memcpy_dtoh(out.host, out.device)
        sum_time_data_copy += time.time() - time_start

    total_time = sum_time_data_copy + sum_time_inference_only
    total_images = real_run_loops * self.max_batch_size
    if total_images <= 0 or total_time <= 0:
        # Per-image time and FPS would divide by zero; report and stop.
        logging.warning('No measurable inference was run (real_run_loops=%s, max_batch_size=%s); '
                        'skipping speed report.', real_run_loops, self.max_batch_size)
        return

    logging.info('Total time (data transfer & inference) elapsed: %.02f ms. '
                 '[%.02f ms] for each image (%.02f FPS)',
                 total_time * 1000,
                 total_time * 1000 / real_run_loops / self.max_batch_size,
                 total_images / total_time)