當前位置: 首頁>>代碼示例>>Python>>正文


Python driver.memcpy_htod方法代碼示例

本文整理匯總了Python中pycuda.driver.memcpy_htod方法的典型用法代碼示例。如果您正苦於以下問題:Python driver.memcpy_htod方法的具體用法?Python driver.memcpy_htod怎麼用?Python driver.memcpy_htod使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在pycuda.driver的用法示例。


在下文中一共展示了driver.memcpy_htod方法的14個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: ready_argument_list

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def ready_argument_list(self, arguments):
        """Prepare kernel arguments, staging every numpy array in GPU memory.

        Each ``numpy.ndarray`` found in *arguments* gets a new device buffer of
        ``arg.nbytes`` bytes, the array is copied into it, and the buffer takes
        the array's place in the result. The buffer is also recorded in
        ``self.allocations``. Any other value is forwarded unchanged.

        :param arguments: Arguments in the order expected by the CUDA kernel;
            numpy arrays and/or numpy scalars such as numpy.int32.
        :type arguments: list(numpy objects)

        :returns: Arguments that can be handed to a CUDA kernel.
        :rtype: list( pycuda.driver.DeviceAllocation, numpy.int32, ... )
        """
        prepared = []
        for item in arguments:
            if not isinstance(item, numpy.ndarray):
                # anything that is not an array is passed along as-is
                prepared.append(item)
                continue
            device_buffer = drv.mem_alloc(item.nbytes)
            # track the allocation on the instance before copying into it
            self.allocations.append(device_buffer)
            prepared.append(device_buffer)
            drv.memcpy_htod(device_buffer, item)
        return prepared
開發者ID:benvanwerkhoven,項目名稱:kernel_tuner,代碼行數:24,代碼來源:cuda.py

示例2: copy_constant_memory_args

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def copy_constant_memory_args(self, cmem_args):
        """Copy host data into named global symbols of ``self.current_module``.

        For every entry the symbol is looked up with
        ``self.current_module.get_global(name)`` and the first element of the
        result is used as the destination of a host-to-device copy. Debug
        information about each value is written to the root logger.

        :param cmem_args: Maps the name of a constant memory symbol to the
            numpy object (numpy.ndarray, numpy.int32, ...) to copy into it.
        :type cmem_args: dict( string: numpy.ndarray, ... )
        """
        logging.debug('copy_constant_memory_args called')
        logging.debug('current module: ' + str(self.current_module))
        for symbol_name, host_data in cmem_args.items():
            device_symbol = self.current_module.get_global(symbol_name)[0]
            logging.debug('copying to symbol: ' + str(device_symbol))
            logging.debug('array to be copied: ')
            # log size, element type and memory-layout flags, in that order
            for attribute in ('nbytes', 'dtype', 'flags'):
                logging.debug(getattr(host_data, attribute))
            drv.memcpy_htod(device_symbol, host_data)
開發者ID:benvanwerkhoven,項目名稱:kernel_tuner,代碼行數:22,代碼來源:cuda.py

示例3: do_inference

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def do_inference(context, h_input, d_input, h_output, d_output, iterations=None):
        """Time repeated ``context.execute`` calls and return the mean latency.

        The host input is copied to the device once, ten untimed warm-up
        executions are run, and then ``iterations`` executions are timed.
        No device-to-host copy is performed here.

        :param context: Object exposing ``execute(batch_size=..., bindings=...)``.
        :param h_input: Host buffer copied to ``d_input`` before timing.
        :param d_input: Device input buffer; ``int(d_input)`` is passed as a binding.
        :param h_output: Not read by this function; kept for interface compatibility.
        :param d_output: Device output buffer; ``int(d_output)`` is passed as a binding.
        :param iterations: Number of timed executions. When ``None`` the count
            is calibrated so that the timed run lasts roughly three seconds.
        :returns: Mean wall-clock time per execution, in milliseconds.
        """
        # Transfer input data to the GPU.
        cuda.memcpy_htod(d_input, h_input)
        bindings = [int(d_input), int(d_output)]
        # warm-up
        for _ in range(10):
            context.execute(batch_size=1, bindings=bindings)
        # Calibrate the iteration count: double the trial size until one trial
        # takes at least a second, then scale to about three seconds of work.
        if iterations is None:
            trial_iterations = 100
            while True:
                t_start = time.perf_counter()
                for _ in range(trial_iterations):
                    context.execute(batch_size=1, bindings=bindings)
                elapsed_time = time.perf_counter() - t_start
                if elapsed_time >= 1:
                    break
                trial_iterations *= 2
            # Divide by the count that was actually timed; doubling it first
            # would overestimate the throughput by a factor of two.
            fps = trial_iterations / elapsed_time
            iterations = int(fps * 3)
        # Run inference.
        t_start = time.perf_counter()
        for _ in tqdm(range(iterations)):
            context.execute(batch_size=1, bindings=bindings)
        elapsed_time = time.perf_counter() - t_start
        latency = elapsed_time / iterations * 1000
        return latency
開發者ID:TAMU-VITA,項目名稱:FasterSeg,代碼行數:27,代碼來源:darts_utils.py

示例4: create_memory

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def create_memory(engine, name, buf, mem, batchsize, inp, inp_idx):
    """Allocate and fill the device buffer for one engine binding.

    The binding called ``name`` is looked up on ``engine``. Input bindings take
    their host data from ``inp[inp_idx]``; every other binding is seeded with
    uniform random float32 values drawn from [0, 255). The device pointer is
    inserted into ``buf`` at the binding index and the allocation object is
    appended to ``mem``.

    :returns: ``inp_idx + 1`` when the binding is an input, otherwise
        ``inp_idx`` unchanged.
    :raises AttributeError: if the engine reports binding index -1 for ``name``.
    """
    index = engine.get_binding_index(name)
    if index == -1:
        raise AttributeError("Not a valid binding")
    print("Binding: name={}, bindingIndex={}".format(name, str(index)))
    chw = engine.get_binding_dimensions(index).to_DimsCHW()
    element_count = chw.C() * chw.H() * chw.W() * batchsize

    is_input = engine.binding_is_input(index)
    if is_input:
        host_data = inp[inp_idx]
    else:
        host_data = np.random.uniform(0.0, 255.0, element_count).astype(np.dtype('f4'))

    # the device buffer is sized for 4 bytes per element
    device_data = cuda.mem_alloc(element_count * 4)
    cuda.memcpy_htod(device_data, host_data)
    buf.insert(index, int(device_data))
    mem.append(device_data)
    return inp_idx + 1 if is_input else inp_idx


#Run inference on device 
開發者ID:CUHKSZ-TQL,項目名稱:EverybodyDanceNow_reproduce_pytorch,代碼行數:24,代碼來源:run_engine.py

示例5: _set_rand_state_dev

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def _set_rand_state_dev(self, state=None):
        """
        Upload RNG state to the device buffer tied to the current context.

        A device buffer of ``state.nbytes`` bytes is allocated the first time a
        context is seen and cached in ``self.context_rand_state_map``; later
        calls for the same context reuse it. The context is then flagged in
        ``self.context_rand_state_alive``.

        Arguments:
            state (np.array or None): an array of uint32 values used to
                                      set the state of the on device LFSRs.
                                      When None, ``self._gen_dev_randstate()``
                                      supplies the state.
        """
        ctx = drv.Context.get_current()
        if state is None:
            state = self._gen_dev_randstate()
        if ctx not in self.context_rand_state_map:
            # first use of this context: allocate its buffer and cache it
            self.context_rand_state_map[ctx] = drv.mem_alloc(state.nbytes)
        device_state = self.context_rand_state_map[ctx]
        drv.memcpy_htod(device_state, state)
        self.context_rand_state_alive[ctx] = True
開發者ID:NervanaSystems,項目名稱:neon,代碼行數:23,代碼來源:nervanagpu.py

示例6: get_batch

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, names):
        """Copy the next calibration batch to the device and return its buffer.

        Returns ``None`` once fewer than ``self.batch_size`` rows of
        ``self.data`` remain. Otherwise the next ``batch_size`` rows are
        flattened, copied into ``self.device_input``, the cursor is advanced,
        and ``[self.device_input]`` is returned. Progress is printed for every
        tenth batch. ``names`` is not used.
        """
        start = self.current_index
        stop = start + self.batch_size
        if stop > self.data.shape[0]:
            return None

        batch_number = int(start / self.batch_size)
        if batch_number % 10 == 0:
            print("Calibrating batch {:}, containing {:} images".format(batch_number, self.batch_size))

        flat_batch = self.data[start:stop].ravel()
        cuda.memcpy_htod(self.device_input, flat_batch)
        self.current_index += self.batch_size
        return [self.device_input]
開發者ID:aimuch,項目名稱:iAI,代碼行數:14,代碼來源:calibrator.py

示例7: get_batch

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, names):
        """Upload the next calibration batch and return its device pointer.

        Assumes ``self.batches`` is an iterator yielding batch data and that
        ``self.device_input`` is a device buffer allocated elsewhere (e.g. by
        the constructor). ``names`` is not used.

        :returns: ``[int(self.device_input)]`` after the copy, or ``None`` when
            the iterator is exhausted, signalling that no calibration data
            remains.
        """
        try:
            batch = next(self.batches)
        except StopIteration:
            # Out of batches: None tells the caller there is nothing left.
            return None
        cuda.memcpy_htod(self.device_input, batch)
        return [int(self.device_input)]
開發者ID:rmccorm4,項目名稱:tensorrt-utils,代碼行數:13,代碼來源:ImagenetCalibrator.py

示例8: memcpy_htod

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def memcpy_htod(self, dest, src):
        """Copy host data to the device when *dest* is a device allocation.

        :param dest: Destination of the copy. A transfer only happens when
            this is a pycuda.driver.DeviceAllocation.
        :type dest: pycuda.driver.DeviceAllocation

        :param src: A numpy array in host memory holding the data to copy
        :type src: numpy.ndarray
        """
        if not isinstance(dest, drv.DeviceAllocation):
            # Nothing to transfer for other destination types; the caller's
            # object is left untouched.
            return
        drv.memcpy_htod(dest, src)
開發者ID:benvanwerkhoven,項目名稱:kernel_tuner,代碼行數:15,代碼來源:cuda.py

示例9: get_batch

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, bindings, names):
        """Upload the next batch from ``self.stream`` and patch the bindings.

        :param bindings: List whose first entry is overwritten with the device
            address of ``self.d_input``.
        :param names: Binding names; the first one is checked against every
            entry of ``self.input_layers[0]``.
        :returns: The updated ``bindings`` list, or ``None`` when the stream
            yields a batch whose ``size`` is falsy.
        """
        next_batch = self.stream.next_batch()
        if not next_batch.size:
            return None

        cuda.memcpy_htod(self.d_input, next_batch)
        for layer_name in self.input_layers[0]:
            assert names[0] != layer_name

        bindings[0] = int(self.d_input)
        return bindings
開發者ID:modricwang,項目名稱:Pytorch-Model-to-TensorRT,代碼行數:13,代碼來源:calib.py

示例10: inference

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    """Run one synchronous inference pass and return the host output buffer.

    ``inputs`` is copied to ``in_gpu``, ``context.execute`` is called with a
    batch size of 1 and the two device addresses, and the result is copied
    from ``out_gpu`` into ``out_cpu``. ``engine`` and ``stream`` are accepted
    but not used by this synchronous path.

    :returns: ``out_cpu`` after the device-to-host copy.
    """
    # NOTE: an asynchronous variant would use cuda.memcpy_htod_async,
    # context.execute_async(1, bindings, stream.handle, None),
    # cuda.memcpy_dtoh_async and stream.synchronize() instead.
    cuda.memcpy_htod(in_gpu, inputs)
    device_bindings = [int(in_gpu), int(out_gpu)]
    context.execute(1, device_bindings)
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
開發者ID:ahmetgunduz,項目名稱:Real-time-GesRec,代碼行數:15,代碼來源:speed_gpu.py

示例11: get_batch

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def get_batch(self, bindings, names):
    """Upload the next item of ``self.batches`` to ``self.device_input``.

    ``bindings`` and ``names`` are not used.

    :returns: ``[int(self.device_input)]`` after the copy, or ``None`` once
        ``self.batches`` is exhausted.
    """
    try:
        data = next(self.batches)
    except StopIteration:
        return None
    else:
        cuda.memcpy_htod(self.device_input, data)
        return [int(self.device_input)]
開發者ID:PRBonn,項目名稱:bonnetal,代碼行數:9,代碼來源:trtCalibINT8.py

示例12: copy_htod

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def copy_htod(self, np_buffer, stream=None):
        """Copy a numpy buffer to the device memory at ``self.ptr``.

        The buffer is flattened and made contiguous first. With a truthy
        ``stream`` the copy is issued asynchronously on that stream; otherwise
        a blocking copy is used.
        """
        host_data = np.ascontiguousarray(np_buffer.ravel())
        if not stream:
            cuda.memcpy_htod(self.ptr, host_data)
            return
        # PyCUDA requires the host buffer to be pagelocked for asynchronous memcpys.
        pagelocked = cuda.register_host_memory(host_data)
        cuda.memcpy_htod_async(self.ptr, pagelocked, stream)
開發者ID:NVIDIA,項目名稱:NeMo,代碼行數:9,代碼來源:tensorrt_runner.py

示例13: execute

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def execute(self, batch_size):
        """Run the execution context on the first ``batch_size`` items.

        Inputs whose ``device_input`` flag is exactly ``False`` are copied
        host-to-device, the context is executed with ``self.bindings``, and
        every output is copied back to its host buffer.

        :returns: dict mapping each key of ``self.output_dict`` to the first
            ``batch_size`` entries of that entry's host buffer.
        """
        for inp in self.inputs:
            if inp.device_input is False:
                cuda.memcpy_htod(inp.device, inp.host[:batch_size])

        self.context.execute(batch_size=batch_size, bindings=self.bindings)

        for out in self.outputs:
            cuda.memcpy_dtoh(out.host[:batch_size], out.device)

        return {
            name: buffer.host[:batch_size]
            for name, buffer in self.output_dict.items()
        }
開發者ID:traveller59,項目名稱:torch2trt,代碼行數:13,代碼來源:inference.py

示例14: run_speed_eval

# 需要導入模塊: from pycuda import driver [as 別名]
# 或者: from pycuda.driver import memcpy_htod [as 別名]
def run_speed_eval(self, warm_run_loops=10, real_run_loops=100):
        """Benchmark ``self.executor`` and log total time, per-image time and FPS.

        Host and device buffers are allocated for every binding of
        ``self.engine``. ``warm_run_loops`` untimed passes are followed by
        ``real_run_loops`` timed passes; host<->device copies and the
        ``execute`` call are timed separately and summed for the report.

        :param warm_run_loops: Number of untimed warm-up passes.
        :param real_run_loops: Number of timed passes.
        :returns: None; the result is written to the root logger.
        """

        def allocate_buffers(engine):
            """Create a page-locked host buffer and a device buffer per binding."""
            inputs = []
            outputs = []
            bindings = []
            for binding in engine:
                size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(device_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    inputs.append(HostDeviceMem(host_mem, device_mem))
                else:
                    outputs.append(HostDeviceMem(host_mem, device_mem))
            return inputs, outputs, bindings

        def upload():
            """Copy every input from host to device."""
            for inp in inputs:
                cuda.memcpy_htod(inp.device, inp.host)

        def download():
            """Copy every output from device to host."""
            for out in outputs:
                cuda.memcpy_dtoh(out.host, out.device)

        inputs, outputs, bindings = allocate_buffers(self.engine)
        # warm run
        for _ in range(warm_run_loops):
            upload()
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            download()

        # real run
        logging.info('Start real run loop.')
        sum_time_data_copy = 0.
        sum_time_inference_only = 0.
        for _ in range(real_run_loops):
            time_start = time.perf_counter()
            upload()
            sum_time_data_copy += time.perf_counter() - time_start

            time_start = time.perf_counter()
            self.executor.execute(batch_size=self.max_batch_size, bindings=bindings)
            sum_time_inference_only += time.perf_counter() - time_start

            time_start = time.perf_counter()
            download()
            sum_time_data_copy += time.perf_counter() - time_start

        total_time = sum_time_data_copy + sum_time_inference_only
        if real_run_loops <= 0 or total_time <= 0:
            # Nothing was measured, so the per-image time and FPS are undefined.
            logging.warning('No timed iterations were run; skipping speed report.')
            return

        logging.info('Total time (data transfer & inference) elapsed: %.02f ms. [%.02f ms] for each image (%.02f FPS)',
                     total_time * 1000,
                     total_time * 1000 / real_run_loops / self.max_batch_size,
                     real_run_loops * self.max_batch_size / total_time)
開發者ID:becauseofAI,項目名稱:lffd-pytorch,代碼行數:51,代碼來源:inference_speed_eval_with_tensorrt_cudnn.py


注:本文中的pycuda.driver.memcpy_htod方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。