

Python driver.memcpy_dtoh Method Code Examples

This article collects typical usage examples of the pycuda.driver.memcpy_dtoh method in Python. If you are wondering what driver.memcpy_dtoh is for, how to call it, or where to find working examples, the curated code below should help. You can also explore further usage examples of the containing module, pycuda.driver.


The sections below present 7 code examples of the driver.memcpy_dtoh method, sorted by popularity by default.
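
Before diving into the examples, here is a minimal, self-contained sketch of what memcpy_dtoh does: it copies the contents of a device allocation back into a pre-allocated host-side NumPy array (the array size and values below are purely illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates and activates a CUDA context
import pycuda.driver as cuda

a = np.arange(16, dtype=np.float32)
a_gpu = cuda.mem_alloc(a.nbytes)   # allocate raw device memory
cuda.memcpy_htod(a_gpu, a)         # host -> device copy
result = np.empty_like(a)
cuda.memcpy_dtoh(result, a_gpu)    # device -> host copy
assert np.array_equal(a, result)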

Example 1: inference

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # async version
    # with engine.create_execution_context() as context:  # expensive to initialize on every call
    # cuda.memcpy_htod_async(in_gpu, inputs, stream)
    # context.execute_async(1, [int(in_gpu), int(out_gpu)], stream.handle, None)
    # cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
    # stream.synchronize()

    # sync version
    cuda.memcpy_htod(in_gpu, inputs)
    context.execute(1, [int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu 
Author: ahmetgunduz | Project: Real-time-GesRec | Lines: 15 | Source: speed_gpu.py
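
The inference helper above assumes the caller has already allocated every buffer it receives. A minimal preparation sketch follows; the input and output shapes are assumptions for illustration, not taken from the original project:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as cuda

inputs = np.random.randn(1, 3, 112, 112).astype(np.float32)  # assumed input shape
out_cpu = np.empty((1, 1000), dtype=np.float32)              # assumed output shape
in_gpu = cuda.mem_alloc(inputs.nbytes)
out_gpu = cuda.mem_alloc(out_cpu.nbytes)
stream = cuda.Stream()  # only used by the commented-out async variant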

Example 2: memcpy_dtoh

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def memcpy_dtoh(self, dest, src):
        """perform a device to host memory copy

        :param dest: A numpy array in host memory to store the data
        :type dest: numpy.ndarray

        :param src: A GPU memory allocation unit
        :type src: pycuda.driver.DeviceAllocation
        """
        if isinstance(src, drv.DeviceAllocation):
            drv.memcpy_dtoh(dest, src)
        else:
            # `src` already lives in host memory; copy its contents in place
            # (a bare `dest = src` would only rebind the local name and
            # leave the caller's array untouched)
            dest[:] = src
Author: benvanwerkhoven | Project: kernel_tuner | Lines: 15 | Source: cuda.py

Example 3: get_volume

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def get_volume(self):
    if self.gpu_mode:
      cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
      cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
    return self._tsdf_vol_cpu, self._color_vol_cpu 
Author: andyzeng | Project: tsdf-fusion-python | Lines: 7 | Source: fusion.py

Example 4: copy_dtoh

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def copy_dtoh(self, device_buffer, stream=None):
        if stream:
            cuda.memcpy_dtoh_async(self.ptr, device_buffer.ptr, stream)
        else:
            cuda.memcpy_dtoh(self.ptr, device_buffer.ptr)

Author: NVIDIA | Project: NeMo | Lines: 9 | Source: tensorrt_runner.py
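
The wrapper above picks between the synchronous and the stream-ordered copy. One caveat worth a sketch: memcpy_dtoh_async is only truly asynchronous when the host buffer is page-locked; with a regular NumPy array the driver falls back to a blocking copy. A minimal sketch, not from the original project:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

host = cuda.pagelocked_empty(1024, dtype=np.float32)  # pinned host memory
dev = cuda.mem_alloc(host.nbytes)
stream = cuda.Stream()

cuda.memcpy_dtoh_async(host, dev, stream)  # enqueued on the stream, returns immediately
stream.synchronize()  # wait for the copy to finish before reading `host`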

Example 5: execute

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def execute(self, batch_size):
        [
            cuda.memcpy_htod(inp.device, inp.host[:batch_size])
            for inp in self.inputs if inp.device_input is False
        ]
        self.context.execute(batch_size=batch_size, bindings=self.bindings)
        [
            cuda.memcpy_dtoh(out.host[:batch_size], out.device)
            for out in self.outputs
        ]
        return {n: v.host[:batch_size] for n, v in self.output_dict.items()} 
Author: traveller59 | Project: torch2trt | Lines: 13 | Source: inference.py

Example 6: rng_get_state

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def rng_get_state(self):
        """
        Return the current state of the on-host and on-device RNGs.

        Returns:
            (np.array, np.array): the on-host and on-device RNG state vectors,
                                  respectively
        """
        dev_state = self._get_rand_state_dev()
        dev_state_local = np.zeros(NervanaGPU._RNG_POOL_SIZE).astype(np.uint32)
        drv.memcpy_dtoh(dev_state_local, dev_state)
        return (self.rng.get_state(), dev_state_local) 
Author: NervanaSystems | Project: neon | Lines: 14 | Source: nervanagpu.py

Example 7: run_speed_eval

# Required import: from pycuda import driver [as alias]
# Or: from pycuda.driver import memcpy_dtoh [as alias]
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):

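        # NOTE: HostDeviceMem, used below, is a simple pairing of a pinned host
        # buffer with its device buffer; it is defined elsewhere in the source file.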
        def allocate_buffers(engine):
            inputs = []
            outputs = []
            bindings = []
            for binding in engine:
                size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(device_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    inputs.append(HostDeviceMem(host_mem, device_mem))
                else:
                    outputs.append(HostDeviceMem(host_mem, device_mem))
            return inputs, outputs, bindings

        inputs, outputs, bindings = allocate_buffers(self.engine)
        # warm run
        for i in range(warm_run_loops):
            [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]

        # real run
        logging.info('Start real run loop.')
        sum_time_data_copy = 0.
        sum_time_inference_only = 0.
        for i in range(real_run_loops):
            time_start = time.time()
            [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
            sum_time_data_copy += time.time() - time_start

            time_start = time.time()
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            sum_time_inference_only += time.time() - time_start

            time_start = time.time()
            [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
            sum_time_data_copy += time.time() - time_start

        logging.info('Total time (data transfer & inference) elapsed: %.02f ms. [%.02f ms] for each image (%.02f FPS)'
                     % ((sum_time_data_copy + sum_time_inference_only) * 1000,
                        (sum_time_data_copy + sum_time_inference_only) * 1000 / real_run_loops / self.max_batch_size,
                        real_run_loops * self.max_batch_size / (sum_time_data_copy + sum_time_inference_only))) 
Author: becauseofAI | Project: lffd-pytorch | Lines: 51 | Source: inference_speed_eval_with_tensorrt_cudnn.py
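
The example above measures transfers and inference with time.time(), which also counts Python-side overhead. For device-side timing, pycuda's CUDA events are an alternative; a minimal sketch, where the elided middle section stands in for the execute call above:

import pycuda.driver as cuda

start, end = cuda.Event(), cuda.Event()
start.record()
# ... enqueue GPU work here, e.g. the TensorRT execute call ...
end.record()
end.synchronize()                  # block until the `end` event is reached
elapsed_ms = start.time_till(end)  # milliseconds between the two events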


Note: The pycuda.driver.memcpy_dtoh method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code belongs to the original authors. Please consult the corresponding project's License before distributing or using the code. Do not reproduce this article without permission.