当前位置: 首页>>代码示例>>Python>>正文


Python driver.memcpy_htod方法代码示例

本文整理汇总了 Python 中 pycuda.driver.memcpy_htod 方法的典型用法代码示例。如果您正苦于以下问题:Python driver.memcpy_htod 方法的具体用法是什么?Python driver.memcpy_htod 怎么用?有哪些 Python driver.memcpy_htod 的使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 pycuda.driver 的用法示例。


下文一共展示了 driver.memcpy_htod 方法的 14 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: ready_argument_list

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def ready_argument_list(self, arguments):
        """Prepare kernel arguments, moving array arguments onto the GPU.

        Every ``numpy.ndarray`` in ``arguments`` gets a newly allocated device
        buffer of ``nbytes`` bytes. The buffer is recorded in
        ``self.allocations`` and the array contents are copied into it.
        Any other value is forwarded unchanged.

        :param arguments: List of arguments to be passed to the kernel.
            The order should match the argument list on the CUDA kernel.
            Allowed values are numpy.ndarray, and/or numpy.int32, numpy.float32, and so on.
        :type arguments: list(numpy objects)

        :returns: A list of arguments that can be passed to a CUDA kernel.
        :rtype: list( pycuda.driver.DeviceAllocation, numpy.int32, ... )
        """
        prepared = []
        for item in arguments:
            if not isinstance(item, numpy.ndarray):
                # Non-array values are passed along as-is.
                prepared.append(item)
                continue
            # Arrays need device storage before the kernel can read them.
            device_buffer = drv.mem_alloc(item.nbytes)
            self.allocations.append(device_buffer)
            prepared.append(device_buffer)
            drv.memcpy_htod(device_buffer, item)
        return prepared
开发者ID:benvanwerkhoven,项目名称:kernel_tuner,代码行数:24,代码来源:cuda.py

示例2: copy_constant_memory_args

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def copy_constant_memory_args(self, cmem_args):
        """Copy host data into global symbols of ``self.current_module``.

        For every entry of ``cmem_args`` the symbol named by the key is looked
        up with ``get_global`` on the current module, and the value is copied
        to it. The size, dtype and flags of each value are logged at debug
        level before the copy.

        :param cmem_args: A dictionary containing the data to be passed to the
            device constant memory. Each string key names the constant memory
            symbol to which the value needs to be copied. Similar to regular
            arguments, values need to be numpy objects, such as numpy.ndarray
            or numpy.int32, and so on.
        :type cmem_args: dict( string: numpy.ndarray, ... )
        """
        logging.debug('copy_constant_memory_args called')
        module_message = 'current module: ' + str(self.current_module)
        logging.debug(module_message)
        for symbol_name, host_value in cmem_args.items():
            # get_global returns a sequence; its first element is the copy target.
            symbol = self.current_module.get_global(symbol_name)[0]
            symbol_message = 'copying to symbol: ' + str(symbol)
            logging.debug(symbol_message)
            logging.debug('array to be copied: ')
            for attribute in ('nbytes', 'dtype', 'flags'):
                logging.debug(getattr(host_value, attribute))
            drv.memcpy_htod(symbol, host_value)
开发者ID:benvanwerkhoven,项目名称:kernel_tuner,代码行数:22,代码来源:cuda.py

示例3: do_inference

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def do_inference(context, h_input, d_input, h_output, d_output, iterations=None):
        """Measure the average latency of ``context.execute`` in milliseconds.

        The host input is copied to the device once, ten warm-up executions
        are run, and then ``iterations`` timed executions are performed with a
        batch size of 1.

        When ``iterations`` is ``None`` the count is calibrated first: a trial
        of 100 executions is timed and the trial size is doubled until one
        trial takes at least one second. The throughput of that last trial is
        then used to choose a count that should run for about three seconds.

        :param context: Object exposing ``execute(batch_size=..., bindings=...)``.
        :param h_input: Host buffer copied into ``d_input`` before running.
        :param d_input: Device input buffer; ``int(d_input)`` is the first binding.
        :param h_output: Not used by this function; kept for interface
            compatibility.
        :param d_output: Device output buffer; ``int(d_output)`` is the second
            binding.
        :param iterations: Number of timed executions, or ``None`` to calibrate.
        :returns: Average time per execution, in milliseconds.
        """
        # Transfer input data to the GPU.
        cuda.memcpy_htod(d_input, h_input)
        # The bindings never change, so build them once outside the timed loops.
        bindings = [int(d_input), int(d_output)]

        # warm-up
        for _ in range(10):
            context.execute(batch_size=1, bindings=bindings)

        # Calibrate the iteration count when the caller did not supply one.
        if iterations is None:
            trial_size = 100
            while True:
                t_start = time.perf_counter()
                for _ in range(trial_size):
                    context.execute(batch_size=1, bindings=bindings)
                elapsed_time = time.perf_counter() - t_start
                if elapsed_time >= 1:
                    break
                trial_size *= 2
            # Throughput comes from the trial that was actually timed, not
            # from the next (doubled) trial size.
            fps = trial_size / elapsed_time
            # Never end up with zero iterations, which would divide by zero below.
            iterations = max(1, int(fps * 3))

        # Run inference.
        t_start = time.perf_counter()
        for _ in tqdm(range(iterations)):
            context.execute(batch_size=1, bindings=bindings)
        elapsed_time = time.perf_counter() - t_start
        latency = elapsed_time / iterations * 1000
        return latency
开发者ID:TAMU-VITA,项目名称:FasterSeg,代码行数:27,代码来源:darts_utils.py

示例4: create_memory

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def create_memory(engine, name, buf, mem, batchsize, inp, inp_idx):
    """Allocate and fill the device buffer for the binding called ``name``.

    The element count is the product of the binding's C, H and W dimensions
    and ``batchsize``. Input bindings are filled from ``inp[inp_idx]``; all
    other bindings are filled with uniform random float32 values in
    [0.0, 255.0). The device address is inserted into ``buf`` at the binding
    index and the allocation itself is appended to ``mem``.

    :param engine: Engine queried for binding index, dimensions and direction.
    :param name: Name of the binding to set up.
    :param buf: List receiving ``int(device_allocation)`` at the binding index.
    :param mem: List receiving the device allocation object.
    :param batchsize: Multiplier applied to the per-item element count.
    :param inp: Sequence of host input buffers.
    :param inp_idx: Index into ``inp`` of the next unused input buffer.
    :returns: ``inp_idx + 1`` if the binding is an input, else ``inp_idx``.
    :raises AttributeError: If the engine reports binding index -1 for ``name``.
    """
    index = engine.get_binding_index(name)
    if index == -1:
        raise AttributeError("Not a valid binding")
    print("Binding: name={}, bindingIndex={}".format(name, str(index)))

    chw = engine.get_binding_dimensions(index).to_DimsCHW()
    element_count = chw.C() * chw.H() * chw.W() * batchsize

    if engine.binding_is_input(index):
        host_data, next_idx = inp[inp_idx], inp_idx + 1
    else:
        host_data = np.random.uniform(0.0, 255.0, element_count).astype(np.dtype('f4'))
        next_idx = inp_idx

    # The allocation is sized at 4 bytes per element.
    device_buffer = cuda.mem_alloc(element_count * 4)
    cuda.memcpy_htod(device_buffer, host_data)
    buf.insert(index, int(device_buffer))
    mem.append(device_buffer)
    return next_idx


#Run inference on device 
开发者ID:CUHKSZ-TQL,项目名称:EverybodyDanceNow_reproduce_pytorch,代码行数:24,代码来源:run_engine.py

示例5: _set_rand_state_dev

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def _set_rand_state_dev(self, state=None):
        """
        Upload RNG state to the device for the current context.

        The device buffer for a context is allocated the first time the
        context is seen and cached in ``self.context_rand_state_map``; later
        calls reuse the cached buffer. After the copy the context is flagged
        as alive in ``self.context_rand_state_alive``.

        Arguments:
            state (np.array or None): an array of uint32 values used to
                                      set the state of the on device LFSRs.
                                      If None, the state is obtained from
                                      ``self._gen_dev_randstate()``.
        """
        current_ctx = drv.Context.get_current()
        host_state = self._gen_dev_randstate() if state is None else state
        state_map = self.context_rand_state_map
        if current_ctx not in state_map:
            # First use in this context: allocate a buffer sized for the state.
            state_map[current_ctx] = drv.mem_alloc(host_state.nbytes)
        drv.memcpy_htod(state_map[current_ctx], host_state)
        self.context_rand_state_alive[current_ctx] = True
开发者ID:NervanaSystems,项目名称:neon,代码行数:23,代码来源:nervanagpu.py

示例6: get_batch

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def get_batch(self, names):
        """Copy the next slice of ``self.data`` to the device.

        A batch is the ``self.batch_size`` rows of ``self.data`` starting at
        ``self.current_index``, flattened with ``ravel()``. A progress message
        is printed for every tenth batch.

        :param names: Not used by this implementation.
        :returns: ``[self.device_input]`` after the copy, or ``None`` when
            fewer than ``self.batch_size`` rows remain.
        """
        batch_end = self.current_index + self.batch_size
        if batch_end > self.data.shape[0]:
            # Not enough rows left for a full batch.
            return None

        batch_number = int(self.current_index / self.batch_size)
        if batch_number % 10 == 0:
            print("Calibrating batch {:}, containing {:} images".format(batch_number, self.batch_size))

        flat_batch = self.data[self.current_index:batch_end].ravel()
        cuda.memcpy_htod(self.device_input, flat_batch)
        self.current_index = batch_end
        return [self.device_input]
开发者ID:aimuch,项目名称:iAI,代码行数:14,代码来源:calibrator.py

示例7: get_batch

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def get_batch(self, names):
        """Upload the next batch from ``self.batches`` to ``self.device_input``.

        :param names: Not used by this implementation.
        :returns: A one-element list holding ``int(self.device_input)``, or
            ``None`` once ``self.batches`` is exhausted.
        """
        try:
            # self.batches is expected to be an iterator yielding batch data,
            # and self.device_input a device buffer allocated elsewhere.
            host_batch = next(self.batches)
            cuda.memcpy_htod(self.device_input, host_batch)
            device_pointers = [int(self.device_input)]
        except StopIteration:
            # No batches left: None signals that calibration data has run out.
            return None
        return device_pointers
开发者ID:rmccorm4,项目名称:tensorrt-utils,代码行数:13,代码来源:ImagenetCalibrator.py

示例8: memcpy_htod

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def memcpy_htod(self, dest, src):
        """perform a host to device memory copy

        The copy is only issued when ``dest`` is a device allocation; for any
        other ``dest`` this method does nothing.

        :param dest: A GPU memory allocation unit
        :type dest: pycuda.driver.DeviceAllocation

        :param src: A numpy array in host memory holding the data to copy
        :type src: numpy.ndarray
        """
        if isinstance(dest, drv.DeviceAllocation):
            drv.memcpy_htod(dest, src)
        # Any other destination is deliberately ignored: assigning to the
        # local name ``dest`` here would never be visible to the caller.
开发者ID:benvanwerkhoven,项目名称:kernel_tuner,代码行数:15,代码来源:cuda.py

示例9: get_batch

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def get_batch(self, bindings, names):
        """Upload the next batch from ``self.stream`` and return the bindings.

        :param bindings: Mutable sequence of bindings; element 0 is
            overwritten with ``int(self.d_input)``.
        :param names: Binding names; ``names[0]`` is asserted to differ from
            every entry of ``self.input_layers[0]``.
        :returns: ``bindings`` after the update, or ``None`` when the next
            batch has a falsy ``size``.
        """
        next_batch = self.stream.next_batch()
        if not next_batch.size:
            return None

        cuda.memcpy_htod(self.d_input, next_batch)
        for layer in self.input_layers[0]:
            assert names[0] != layer

        bindings[0] = int(self.d_input)
        return bindings
开发者ID:modricwang,项目名称:Pytorch-Model-to-TensorRT,代码行数:13,代码来源:calib.py

示例10: inference

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    """Run one blocking inference pass and return the host output buffer.

    ``inputs`` is copied to ``in_gpu``, ``context.execute`` is called with a
    batch size of 1, and ``out_gpu`` is copied back into ``out_cpu``.

    ``engine`` and ``stream`` are not used by this synchronous version. An
    asynchronous variant would issue ``memcpy_htod_async``, ``execute_async``
    and ``memcpy_dtoh_async`` on ``stream`` and then call
    ``stream.synchronize()``.

    :returns: ``out_cpu``, filled with the inference result.
    """
    cuda.memcpy_htod(in_gpu, inputs)
    device_bindings = [int(in_gpu), int(out_gpu)]
    context.execute(1, device_bindings)
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
开发者ID:ahmetgunduz,项目名称:Real-time-GesRec,代码行数:15,代码来源:speed_gpu.py

示例11: get_batch

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def get_batch(self, bindings, names):
    """Copy the next item of ``self.batches`` into ``self.device_input``.

    ``bindings`` and ``names`` are not used by this implementation.

    :returns: ``[int(self.device_input)]`` after the copy, or ``None`` when
        ``self.batches`` has no more items.
    """
    try:
      host_data = next(self.batches)
      cuda.memcpy_htod(self.device_input, host_data)
      result = [int(self.device_input)]
    except StopIteration:
      # The iterator is exhausted, so there is nothing more to upload.
      result = None
    return result
开发者ID:PRBonn,项目名称:bonnetal,代码行数:9,代码来源:trtCalibINT8.py

示例12: copy_htod

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def copy_htod(self, np_buffer, stream=None):
        """Copy ``np_buffer`` from the host to the device memory at ``self.ptr``.

        The buffer is flattened with ``ravel()`` and made contiguous before
        the copy.

        :param np_buffer: Host array to upload.
        :param stream: If truthy, the copy is issued asynchronously on this
            stream; otherwise a blocking copy is performed.
        """
        flat = np.ascontiguousarray(np_buffer.ravel())
        if not stream:
            cuda.memcpy_htod(self.ptr, flat)
            return
        # PyCUDA requires the host buffer to be pagelocked for asynchronous memcpys.
        pinned = cuda.register_host_memory(flat)
        cuda.memcpy_htod_async(self.ptr, pinned, stream)
开发者ID:NVIDIA,项目名称:NeMo,代码行数:9,代码来源:tensorrt_runner.py

示例13: execute

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def execute(self, batch_size):
        """Run one synchronous inference and return the output host buffers.

        Host data is copied to the device for every entry of ``self.inputs``
        whose ``device_input`` attribute is ``False``. The context is then
        executed with ``self.bindings``, and every entry of ``self.outputs``
        is copied back into its host buffer.

        :param batch_size: Number of leading items of each host buffer to
            transfer; also passed to ``self.context.execute``.
        :returns: dict mapping each key of ``self.output_dict`` to the first
            ``batch_size`` items of that entry's host buffer.
        """
        # Plain loops: the copies are side effects, so no list is built.
        for inp in self.inputs:
            # Only entries explicitly flagged ``device_input is False`` are uploaded.
            if inp.device_input is False:
                cuda.memcpy_htod(inp.device, inp.host[:batch_size])

        self.context.execute(batch_size=batch_size, bindings=self.bindings)

        for out in self.outputs:
            cuda.memcpy_dtoh(out.host[:batch_size], out.device)

        return {n: v.host[:batch_size] for n, v in self.output_dict.items()}
开发者ID:traveller59,项目名称:torch2trt,代码行数:13,代码来源:inference.py

示例14: run_speed_eval

# 需要导入模块: from pycuda import driver [as 别名]
# 或者: from pycuda.driver import memcpy_htod [as 别名]
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):
        """Benchmark data transfer plus inference and log the timing summary.

        Buffers are allocated for every binding of ``self.engine``. The full
        cycle (host-to-device copy, ``self.executor.execute``, device-to-host
        copy) is run ``warm_run_loops`` times untimed, then ``real_run_loops``
        times with copy time and execute time accumulated separately. The
        total time, the time per image and the images per second are logged at
        info level.

        :param warm_run_loops: Number of untimed warm-up cycles.
        :param real_run_loops: Number of timed cycles; must be non-zero
            because the summary divides by it.
        """

        def allocate_buffers(engine):
            """Allocate a page-locked host buffer and a device buffer per binding.

            Returns ``(inputs, outputs, bindings)``: the host/device pairs for
            input bindings, the pairs for the remaining bindings, and the
            integer device addresses of all bindings in iteration order.
            """
            inputs = []
            outputs = []
            bindings = []
            for binding in engine:
                size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(device_mem))
                # Append to the appropriate list.
                target = inputs if engine.binding_is_input(binding) else outputs
                target.append(HostDeviceMem(host_mem, device_mem))
            return inputs, outputs, bindings

        inputs, outputs, bindings = allocate_buffers(self.engine)

        # warm run
        for _ in range(warm_run_loops):
            for inp in inputs:
                cuda.memcpy_htod(inp.device, inp.host)
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            for out in outputs:
                cuda.memcpy_dtoh(out.host, out.device)

        # real run
        logging.info('Start real run loop.')
        sum_time_data_copy = 0.
        sum_time_inference_only = 0.
        for _ in range(real_run_loops):
            time_start = time.time()
            for inp in inputs:
                cuda.memcpy_htod(inp.device, inp.host)
            sum_time_data_copy += time.time() - time_start

            time_start = time.time()
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            sum_time_inference_only += time.time() - time_start

            time_start = time.time()
            for out in outputs:
                cuda.memcpy_dtoh(out.host, out.device)
            sum_time_data_copy += time.time() - time_start

        total_time = sum_time_data_copy + sum_time_inference_only
        # Arguments are passed separately so logging formats the message lazily.
        logging.info('Total time (data transfer & inference) elapsed: %.02f ms. [%.02f ms] for each image (%.02f FPS)',
                     total_time * 1000,
                     total_time * 1000 / real_run_loops / self.max_batch_size,
                     real_run_loops * self.max_batch_size / total_time)
开发者ID:becauseofAI,项目名称:lffd-pytorch,代码行数:51,代码来源:inference_speed_eval_with_tensorrt_cudnn.py


注:本文中的 pycuda.driver.memcpy_htod 方法示例由纯净天空整理自 GitHub、MSDocs 等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的 License;未经允许,请勿转载。