

Python driver.memcpy_dtoh Method Code Examples

This article collects typical usage examples of the pycuda.driver.memcpy_dtoh method in Python. If you are wondering what driver.memcpy_dtoh is for, how to call it, or where to find working examples, the curated code below should help. You can also explore further usage examples of the containing module, pycuda.driver.


The sections below present 7 code examples of the driver.memcpy_dtoh method, sorted by popularity by default.
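
Before diving into the examples, here is a minimal, self-contained sketch of what memcpy_dtoh does: it copies the contents of a device allocation back into a pre-allocated host-side NumPy array (the array size and values below are purely illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates and activates a CUDA context
import pycuda.driver as cuda

a = np.arange(16, dtype=np.float32)
a_gpu = cuda.mem_alloc(a.nbytes)   # allocate raw device memory
cuda.memcpy_htod(a_gpu, a)         # host -> device copy
result = np.empty_like(a)
cuda.memcpy_dtoh(result, a_gpu)    # device -> host copy
assert np.array_equal(a, result)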

Example 1: inference

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # async version
    # with engine.create_execution_context() as context:  # expensive to initialize on every call
    # cuda.memcpy_htod_async(in_gpu, inputs, stream)
    # context.execute_async(1, [int(in_gpu), int(out_gpu)], stream.handle, None)
    # cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
    # stream.synchronize()

    # sync version
    cuda.memcpy_htod(in_gpu, inputs)
    context.execute(1, [int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu 
Author: ahmetgunduz | Project: Real-time-GesRec | Lines: 15 | Source: speed_gpu.py
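
The inference helper above assumes the caller has already allocated every buffer it receives. A minimal preparation sketch follows; the input and output shapes are assumptions for illustration, not taken from the original project:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as cuda

inputs = np.random.randn(1, 3, 112, 112).astype(np.float32)  # assumed input shape
out_cpu = np.empty((1, 1000), dtype=np.float32)              # assumed output shape
in_gpu = cuda.mem_alloc(inputs.nbytes)
out_gpu = cuda.mem_alloc(out_cpu.nbytes)
stream = cuda.Stream()  # only used by the commented-out async variant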

Example 2: memcpy_dtoh

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def memcpy_dtoh(self, dest, src):
        """perform a device to host memory copy

        :param dest: A numpy array in host memory to store the data
        :type dest: numpy.ndarray

        :param src: A GPU memory allocation unit
        :type src: pycuda.driver.DeviceAllocation
        """
        if isinstance(src, drv.DeviceAllocation):
            drv.memcpy_dtoh(dest, src)
        else:
            # `src` already lives in host memory; copy its contents in place
            # (a bare `dest = src` would only rebind the local name and
            # leave the caller's array untouched)
            dest[:] = src
Author: benvanwerkhoven | Project: kernel_tuner | Lines: 15 | Source: cuda.py

Example 3: get_volume

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def get_volume(self):
    if self.gpu_mode:
      cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
      cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
    return self._tsdf_vol_cpu, self._color_vol_cpu 
Author: andyzeng | Project: tsdf-fusion-python | Lines: 7 | Source: fusion.py

Example 4: copy_dtoh

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def copy_dtoh(self, device_buffer, stream=None):
        if stream:
            cuda.memcpy_dtoh_async(self.ptr, device_buffer.ptr, stream)
        else:
            cuda.memcpy_dtoh(self.ptr, device_buffer.ptr)

Author: NVIDIA | Project: NeMo | Lines: 9 | Source: tensorrt_runner.py
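
The wrapper above picks between the synchronous and the stream-ordered copy. One caveat worth a sketch: memcpy_dtoh_async is only truly asynchronous when the host buffer is page-locked; with a regular NumPy array the driver falls back to a blocking copy. A minimal sketch, not from the original project:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

host = cuda.pagelocked_empty(1024, dtype=np.float32)  # pinned host memory
dev = cuda.mem_alloc(host.nbytes)
stream = cuda.Stream()

cuda.memcpy_dtoh_async(host, dev, stream)  # enqueued on the stream, returns immediately
stream.synchronize()  # wait for the copy to finish before reading `host`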

Example 5: execute

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def execute(self, batch_size):
        [
            cuda.memcpy_htod(inp.device, inp.host[:batch_size])
            for inp in self.inputs if inp.device_input is False
        ]
        self.context.execute(batch_size=batch_size, bindings=self.bindings)
        [
            cuda.memcpy_dtoh(out.host[:batch_size], out.device)
            for out in self.outputs
        ]
        return {n: v.host[:batch_size] for n, v in self.output_dict.items()} 
Author: traveller59 | Project: torch2trt | Lines: 13 | Source: inference.py

Example 6: rng_get_state

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def rng_get_state(self):
        """
        Return the current state of the on-host and on-device RNGs.

        Returns:
            (np.array, np.array): the on-host and on-device RNG state vectors,
                                  respectively
        """
        dev_state = self._get_rand_state_dev()
        dev_state_local = np.zeros(NervanaGPU._RNG_POOL_SIZE).astype(np.uint32)
        drv.memcpy_dtoh(dev_state_local, dev_state)
        return (self.rng.get_state(), dev_state_local) 
Author: NervanaSystems | Project: neon | Lines: 14 | Source: nervanagpu.py

Example 7: run_speed_eval

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):

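        # NOTE: HostDeviceMem, used below, is a simple pairing of a pinned host
        # buffer with its device buffer; it is defined elsewhere in the source file.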
        def allocate_buffers(engine):
            inputs = []
            outputs = []
            bindings = []
            for binding in engine:
                size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(device_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    inputs.append(HostDeviceMem(host_mem, device_mem))
                else:
                    outputs.append(HostDeviceMem(host_mem, device_mem))
            return inputs, outputs, bindings

        inputs, outputs, bindings = allocate_buffers(self.engine)
        # warm run
        for i in range(warm_run_loops):
            [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]

        # real run
        logging.info('Start real run loop.')
        sum_time_data_copy = 0.
        sum_time_inference_only = 0.
        for i in range(real_run_loops):
            time_start = time.time()
            [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
            sum_time_data_copy += time.time() - time_start

            time_start = time.time()
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            sum_time_inference_only += time.time() - time_start

            time_start = time.time()
            [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
            sum_time_data_copy += time.time() - time_start

        logging.info('Total time (data transfer & inference) elapsed: %.02f ms. [%.02f ms] for each image (%.02f FPS)'
                     % ((sum_time_data_copy + sum_time_inference_only) * 1000,
                        (sum_time_data_copy + sum_time_inference_only) * 1000 / real_run_loops / self.max_batch_size,
                        real_run_loops * self.max_batch_size / (sum_time_data_copy + sum_time_inference_only))) 
Author: becauseofAI | Project: lffd-pytorch | Lines: 51 | Source: inference_speed_eval_with_tensorrt_cudnn.py
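
The example above measures transfers and inference with time.time(), which also counts Python-side overhead. For device-side timing, pycuda's CUDA events are an alternative; a minimal sketch, where the elided middle section stands in for the execute call above:

import pycuda.driver as cuda

start, end = cuda.Event(), cuda.Event()
start.record()
# ... enqueue GPU work here, e.g. the TensorRT execute call ...
end.record()
end.synchronize()                  # block until the `end` event is reached
elapsed_ms = start.time_till(end)  # milliseconds between the two events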


Note: The pycuda.driver.memcpy_dtoh method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code belongs to the original authors. Please consult the corresponding project's License before distributing or using the code. Do not reproduce this article without permission.