This article collects typical usage examples of the Python method pycuda.driver.memcpy_dtoh. If you have been wondering what driver.memcpy_dtoh does, how to call it, or where to find working examples of it, the curated code samples below may help. You can also explore other usages of the module it belongs to, pycuda.driver.
Seven code examples of driver.memcpy_dtoh are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
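Before the examples, here is a minimal, self-contained sketch of the round trip that memcpy_dtoh completes (the array size is arbitrary; this illustration is not taken from any of the examples below):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on the default device
import pycuda.driver as drv

a = np.random.randn(16).astype(np.float32)
a_gpu = drv.mem_alloc(a.nbytes)   # raw device allocation
drv.memcpy_htod(a_gpu, a)         # host -> device
b = np.empty_like(a)
drv.memcpy_dtoh(b, a_gpu)         # device -> host: fills b from a_gpu
assert np.array_equal(a, b)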
Example 1: inference
# Required module: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # async version
    # with engine.create_execution_context() as context:  # costs time to initialize
    #     cuda.memcpy_htod_async(in_gpu, inputs, stream)
    #     context.execute_async(1, [int(in_gpu), int(out_gpu)], stream.handle, None)
    #     cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
    #     stream.synchronize()
    # sync version
    cuda.memcpy_htod(in_gpu, inputs)
    context.execute(1, [int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
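The example above assumes the caller has already allocated all buffers. A hypothetical setup for a TensorRT engine might look like the following; the shapes and sizes are placeholders, and engine and context are assumed to come from a deserialized engine, none of which appears in the original snippet:

import numpy as np
import pycuda.driver as cuda

inputs = np.zeros((1, 3, 224, 224), dtype=np.float32)    # assumed input shape
out_cpu = cuda.pagelocked_empty(1000, np.float32)        # assumed output size, page-locked
in_gpu = cuda.mem_alloc(inputs.nbytes)
out_gpu = cuda.mem_alloc(out_cpu.nbytes)
stream = cuda.Stream()
result = inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream)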
Example 2: memcpy_dtoh
# Required module: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def memcpy_dtoh(self, dest, src):
    """Perform a device-to-host memory copy.

    :param dest: A numpy array in host memory to store the data
    :type dest: numpy.ndarray

    :param src: A GPU memory allocation unit
    :type src: pycuda.driver.DeviceAllocation
    """
    if isinstance(src, drv.DeviceAllocation):
        drv.memcpy_dtoh(dest, src)
    else:
        dest = src  # note: this only rebinds the local name; see the variant below
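Note that the else branch above only rebinds the local name dest, so nothing is copied when src is already a host array. A variant that performs an actual in-place copy (a sketch, not the library's own code) could use numpy.copyto:

import numpy as np
import pycuda.driver as drv

def memcpy_dtoh(dest, src):
    """Copy src into the preallocated numpy array dest."""
    if isinstance(src, drv.DeviceAllocation):
        drv.memcpy_dtoh(dest, src)   # device -> host copy
    else:
        np.copyto(dest, src)         # host -> host copy, visible to the caller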
Example 3: get_volume
# Required module: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def get_volume(self):
    if self.gpu_mode:
        cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
        cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
    return self._tsdf_vol_cpu, self._color_vol_cpu
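Usage is a single call once frames have been integrated into the volume; vol here stands for an instance of the fusion class, a name assumed for this sketch:

tsdf_vol_cpu, color_vol_cpu = vol.get_volume()  # device-to-host copies happen only in GPU mode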
Example 4: copy_dtoh
# Required module: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def copy_dtoh(self, device_buffer, stream=None):
    if stream:
        cuda.memcpy_dtoh_async(self.ptr, device_buffer.ptr, stream)
    else:
        cuda.memcpy_dtoh(self.ptr, device_buffer.ptr)
# Return a view of the buffer which has the correct shape
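One caveat with the stream path: memcpy_dtoh_async only enqueues the copy, so the host buffer must not be read until the stream has been synchronized. A usage sketch, with buffer names assumed for illustration:

host_buf.copy_dtoh(device_buf, stream=stream)  # enqueues the async copy
stream.synchronize()                           # wait for the copy to finish
data = host_buf.ptr                            # now safe to read on the host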
Example 5: execute
# Required module: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def execute(self, batch_size):
    [
        cuda.memcpy_htod(inp.device, inp.host[:batch_size])
        for inp in self.inputs if inp.device_input is False
    ]
    self.context.execute(batch_size=batch_size, bindings=self.bindings)
    [
        cuda.memcpy_dtoh(out.host[:batch_size], out.device)
        for out in self.outputs
    ]
    return {n: v.host[:batch_size] for n, v in self.output_dict.items()}
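This method relies on a small host/device buffer pair that the snippet does not show. A minimal container matching the attributes it accesses (a reconstruction from the attribute names, not the original class) might be:

class HostDeviceMem:
    """Pairs a page-locked host array with its device allocation."""
    def __init__(self, host, device, device_input=False):
        self.host = host                  # numpy array in page-locked memory
        self.device = device              # pycuda.driver.DeviceAllocation
        self.device_input = device_input  # True if the input already lives on the GPU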
Example 6: rng_get_state
# Required module: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def rng_get_state(self):
    """
    Return the current state of the on-host and on-device RNGs.

    Returns:
        (np.array, np.array): the on-host and on-device RNG state vectors,
            respectively
    """
    dev_state = self._get_rand_state_dev()
    dev_state_local = np.zeros(NervanaGPU._RNG_POOL_SIZE).astype(np.uint32)
    drv.memcpy_dtoh(dev_state_local, dev_state)
    return (self.rng.get_state(), dev_state_local)
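The mirror-image operation, restoring a previously saved state, would swap the copy direction. A sketch assuming the same buffer layout (not taken from the original class):

def rng_set_state(self, host_state, dev_state_local):
    self.rng.set_state(host_state)               # restore the on-host RNG
    dev_state = self._get_rand_state_dev()
    drv.memcpy_htod(dev_state, dev_state_local)  # restore the on-device RNG pool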
Example 7: run_speed_eval
# Required module: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):

    def allocate_buffers(engine):
        inputs = []
        outputs = []
        bindings = []
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings

    inputs, outputs, bindings = allocate_buffers(self.engine)

    # warm-up run
    for i in range(warm_run_loops):
        [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
        self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
        [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]

    # real run
    logging.info('Start real run loop.')
    sum_time_data_copy = 0.
    sum_time_inference_only = 0.
    for i in range(real_run_loops):
        time_start = time.time()
        [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
        sum_time_data_copy += time.time() - time_start
        time_start = time.time()
        self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
        sum_time_inference_only += time.time() - time_start
        time_start = time.time()
        [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
        sum_time_data_copy += time.time() - time_start

    logging.info('Total time (data transfer & inference) elapsed: %.02f ms. [%.02f ms] for each image (%.02f FPS)'
                 % ((sum_time_data_copy + sum_time_inference_only) * 1000,
                    (sum_time_data_copy + sum_time_inference_only) * 1000 / real_run_loops / self.max_batch_size,
                    real_run_loops * self.max_batch_size / (sum_time_data_copy + sum_time_inference_only)))
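Since time.time() measures host wall-clock time, the inference figure above also includes Python overhead. For finer GPU-side timing, CUDA events are an alternative; this sketch reuses the method's own names (self.executor, bindings) inside the same loop and is not part of the original code:

start, end = cuda.Event(), cuda.Event()
start.record()
self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
end.record()
end.synchronize()                  # block until both events have completed
elapsed_ms = start.time_till(end)  # GPU time between the events, in milliseconds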