This article collects typical usage examples of the Python function pycuda.driver.memcpy_htod_async. If you have been wondering what memcpy_htod_async does, or how to use it in practice, the curated examples below should help.
The following 15 code examples of memcpy_htod_async are presented, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the site recommend better Python code samples.
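Before the examples, here is a minimal sketch of the basic call pattern. It assumes a single device initialized via pycuda.autoinit; the variable names are illustrative only:

import numpy as np
import pycuda.autoinit  # creates a context on the default device
import pycuda.driver as drv

# memcpy_htod_async enqueues a host-to-device copy on a stream and
# returns immediately. The copy can only overlap other work when the
# host buffer is page-locked; a plain numpy array degrades to an
# effectively synchronous transfer.
host = drv.pagelocked_zeros((1024,), dtype=np.float32)
host[:] = np.random.randn(1024)

dev = drv.mem_alloc(host.nbytes)
stream = drv.Stream()
drv.memcpy_htod_async(dev, host, stream)  # queued, not yet complete
stream.synchronize()                      # wait for the copy to drain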
Example 1: set
def set(self, ary, device=None):
    """
    Copy host array to device.

    Arguments:
        ary: host array, needs to be contiguous
        device: device id, if not the one attached to current context

    Returns:
        self
    """
    stream = self.backend.stream
    assert ary.size == self.size
    assert self.is_contiguous, "Array in set() must be contiguous"
    if ary.dtype is not self.dtype:
        ary = ary.astype(self.dtype)
    assert ary.strides == self.strides

    if device is None:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
    else:
        # With multithreaded datasets, make a context before copying
        # and destroy it again once done.
        lctx = drv.Device(device).make_context()
        drv.memcpy_htod_async(self.gpudata, ary, stream)
        lctx.pop()
        del lctx

    return self
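The notable branch here is the device argument: a temporary context is pushed onto the target device just long enough to issue the copy. A standalone sketch of that pattern, where copy_to_device and devptr are illustrative names rather than part of the example's backend:

import pycuda.driver as drv

drv.init()

def copy_to_device(devptr, host_ary, device_id, stream=None):
    # Push a fresh context on the target device, issue the copy,
    # then pop so the caller's context becomes current again.
    ctx = drv.Device(device_id).make_context()
    try:
        drv.memcpy_htod_async(devptr, host_ary, stream)
    finally:
        ctx.pop()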
Example 2: set_refsmiles
def set_refsmiles(self, refsmilesmat, refcountsmat, reflengths, refmags=None):  # {{{
    """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
    and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
    vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

    Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
    no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
    """
    # Set up Lingo and count matrices on device #{{{
    if self.usePycudaArray:
        # Set up using PyCUDA CUDA array support
        self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat, order='C')
        self.gpu.rcounts = cuda.matrix_to_array(refcountsmat, order='C')
        self.gpu.tex2lr.set_array(self.gpu.rsmiles)
        self.gpu.tex2cr.set_array(self.gpu.rcounts)
    else:
        # Manually handle setup
        temprlmat = self._padded_array(refsmilesmat)
        if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
            raise ValueError("Error: reference matrix is not allowed to have more than 64K columns "
                             "(LINGOs) or 32K rows (molecules) (both padded to multiple of 16). "
                             "Dimensions = (%d,%d)." % temprlmat.shape)
        self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rsmiles, temprlmat, stream=self.gpu.stream)

        temprcmat = self._padded_array(refcountsmat)
        self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rcounts, temprcmat, stream=self.gpu.stream)

        descriptor = cuda.ArrayDescriptor()
        descriptor.width = temprcmat.shape[1]
        descriptor.height = temprcmat.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles, descriptor, temprlmat.strides[0])
        self.gpu.tex2cr.set_address_2d(self.gpu.rcounts, descriptor, temprcmat.strides[0])
        self.gpu.stream.synchronize()
        del temprlmat
        del temprcmat
    # }}}

    self.rlengths = reflengths
    self.rshape = refsmilesmat.shape
    self.nref = refsmilesmat.shape[0]

    # Copy reference lengths to GPU
    self.gpu.rl_gpu = cuda.to_device(reflengths)

    # Allocate buffer for reference set magnitudes
    self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
    if refmags is not None:
        cuda.memcpy_htod(self.gpu.rmag_gpu, refmags)
    else:
        # Calculate reference set magnitudes on GPU
        magthreads = 256
        self.gpu.refMagKernel(self.gpu.rmag_gpu, self.gpu.rl_gpu, numpy.int32(self.nref),
                              block=(magthreads, 1, 1), grid=(30, 1),
                              shared=magthreads * 4, texrefs=[self.gpu.tex2cr])
    return
Example 3: set_async
def set_async(self, ary, stream=None):
    assert ary.size == self.size
    assert ary.dtype == self.dtype
    assert self.flags.forc
    if not ary.flags.forc:
        raise RuntimeError("cannot asynchronously set from "
                           "non-contiguous array")
    if self.size:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
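This appears to be the method from pycuda.gpuarray.GPUArray, so it can be exercised directly. A small usage sketch with arbitrary sizes:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()

# ary.flags.forc is true for C- or Fortran-contiguous arrays; the
# host buffer must also be page-locked for the copy to be truly async.
host = drv.pagelocked_zeros((256,), np.float32)
host[:] = 1.0

dev = gpuarray.empty((256,), np.float32)
dev.set_async(host, stream=stream)
stream.synchronize()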
Example 4: set
def set(self, tensor, data):
    assert isinstance(tensor, MGPUTensor)
    if tensor.ptype == 'replica':
        for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, data, strm)
            ctx.pop()
        # tensor.copy_from(data)
    else:
        self.scatter(data, tensor)
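Unlike Example 1, the contexts here are created once and merely pushed and popped around each copy, with one stream per context so the per-device copies stay independent. A reduced sketch of that setup (ctxs, strms, and broadcast are illustrative names):

import pycuda.driver as drv

drv.init()

ctxs, strms = [], []
for i in range(drv.Device.count()):
    ctx = drv.Device(i).make_context()  # becomes the current context
    strms.append(drv.Stream())          # stream belongs to this context
    ctxs.append(ctx)
    ctx.pop()                           # deactivate, keep for later use

def broadcast(dev_ptrs, data):
    # Replicate a pinned host buffer to one device pointer per GPU.
    for ptr, strm, ctx in zip(dev_ptrs, strms, ctxs):
        ctx.push()
        drv.memcpy_htod_async(ptr, data, strm)
        ctx.pop()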
Example 5: exchange
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
    ctx1 = cuda.Device(dev1).make_context()
    a = cuda.from_device(int(a_gpu) + (nx - 2) * ny * nof, (ny,), np.float32)
    ctx1.pop()

    ctx2 = cuda.Device(dev2).make_context()
    cuda.memcpy_htod(int(b_gpu), a)
    b = cuda.from_device(int(b_gpu) + ny * nof, (ny,), np.float32)
    ctx2.pop()

    ctx1 = cuda.Device(dev1).make_context()
    cuda.memcpy_htod_async(int(a_gpu) + (nx - 1) * ny * nof, b)
    ctx1.pop()
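The exchange stages a boundary column through the host: from_device reads it out under one device's context and memcpy_htod writes it back under the other (nof is presumably the byte size of a float32). The same idea in a stripped-down helper, where copy_between_devices is an illustrative name:

import numpy as np
import pycuda.driver as cuda

cuda.init()

def copy_between_devices(src_ptr, dst_ptr, nbytes, dev_src, dev_dst):
    # Read the bytes out of the source device...
    ctx = cuda.Device(dev_src).make_context()
    staging = cuda.from_device(src_ptr, (nbytes,), np.uint8)
    ctx.pop()

    # ...and write them into the destination device's allocation.
    ctx = cuda.Device(dev_dst).make_context()
    cuda.memcpy_htod(dst_ptr, staging)
    ctx.pop()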
Example 6: kernel_write
def kernel_write(function_name, dest_devptr, dest_info, source_devptr, source_info, work_range, stream=None):
    global KD

    # initialize variables
    global tb_cnt
    tb_cnt = 0

    # dest
    cuda_args = [dest_devptr]
    cuda_args += [dest_info]

    # source
    cuda_args += [source_devptr]
    cuda_args += [source_info]

    # work_range
    cuda_args += make_cuda_list(work_range)

    # initialize model view
    eye = numpy.eye(4, dtype=numpy.float32)
    cuda.memcpy_htod_async(mmtx, eye, stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, eye, stream=stream)

    try:
        if Debug:
            print("Function name: ", function_name)
        func = mod.get_function(function_name)  # cutting function
    except:
        print("Function not found ERROR")
        print("Function name: ", function_name)
        assert False

    # set work range
    block, grid = range_to_block_grid(work_range)

    if log_type in ['time', 'all']:
        st = time.time()

    func(*cuda_args, block=block, grid=grid, stream=stream)
    # ctx.synchronize()

    KD.append((dest_info, source_info))

    if log_type in ['time', 'all']:
        bytes = make_bytes(work_range, 3)
        t = MPI.Wtime() - st
        ms = 1000 * t
        bw = bytes / GIGA / t
        log("rank%d, GPU%d, , kernel write time, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec "
            % (rank, device_number, bytes / MEGA, ms, bw), 'time', log_type)
Example 7: set_async
def set_async(self, ary, stream=None):
    assert ary.ndim <= 3
    assert ary.dtype == self.dtype
    assert ary.size == self.size
    if ary.base.__class__ != cuda.HostAllocation:
        raise TypeError("asynchronous memory transfer requires pagelocked numpy array")
    if self.size:
        if self.M == 1:
            cuda.memcpy_htod_async(self.gpudata, ary, stream)
        else:
            PitchTrans(self.shape, self.gpudata, self.ld, ary, _pd(self.shape),
                       self.dtype, async=True, stream=stream)
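PitchTrans is a helper from the example's own library for pitched (2D) transfers. In plain PyCUDA, the equivalent pitched host-to-device copy can be expressed with Memcpy2D, which also accepts a stream; a sketch assuming a C-contiguous pinned host array:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

rows, cols = 64, 100
host = drv.pagelocked_zeros((rows, cols), np.float32)

# mem_alloc_pitch pads each device row so rows start aligned.
dev, pitch = drv.mem_alloc_pitch(cols * host.dtype.itemsize, rows,
                                 host.dtype.itemsize)

cpy = drv.Memcpy2D()
cpy.set_src_host(host)
cpy.set_dst_device(dev)
cpy.src_pitch = cols * host.dtype.itemsize  # tightly packed host rows
cpy.dst_pitch = pitch                       # padded device rows
cpy.width_in_bytes = cols * host.dtype.itemsize
cpy.height = rows

stream = drv.Stream()
cpy(stream)  # asynchronous when called with a stream
stream.synchronize()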
Example 8: synchronize_isdone
def synchronize_isdone(self):
    """Complete synchronization process."""
    # Use shorter, easier names for class variables.
    bufs = self._sync_buffers
    ptrs = self._sync_ptrs
    streams = self._sync_streams
    adj = self._sync_adj
    part2_start = self._sync_part2_start
    is_done = [False, False, False, False]

    # Forward send.
    if streams[0].is_done():  # Device-to-host copy completed.
        if not part2_start[0]:  # Initialize MPI send.
            comm.Isend(bufs[0], dest=adj['forw'], tag=self._sync_tags[0])
            part2_start[0] = True
            is_done[0] = True
        else:  # No more work to do.
            is_done[0] = True

    # Backward send.
    if streams[1].is_done():  # Device-to-host copy completed.
        if not part2_start[1]:  # Initialize MPI send.
            comm.Isend(bufs[1], dest=adj['back'], tag=self._sync_tags[1])
            part2_start[1] = True
            is_done[1] = True
        else:  # No more work to do.
            is_done[1] = True

    # Forward receive.
    if self._sync_req_forw.Test():  # MPI receive completed.
        if not part2_start[2]:  # Initialize host-to-device copy.
            drv.memcpy_htod_async(ptrs['back_dest'], bufs[2],
                                  stream=streams[2])  # Host-to-device.
            part2_start[2] = True
        elif streams[2].is_done():  # Host-to-device copy completed.
            is_done[2] = True

    # Backward receive.
    if self._sync_req_back.Test():  # MPI receive completed.
        if not part2_start[3]:  # Initialize host-to-device copy.
            drv.memcpy_htod_async(ptrs['forw_dest'], bufs[3],
                                  stream=streams[3])  # Host-to-device.
            part2_start[3] = True
        elif streams[3].is_done():  # Host-to-device copy completed.
            is_done[3] = True

    # print '~', is_done[0:4],

    # Return true only when all four transfers are complete.
    return all(is_done)
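The overall shape is a poll-driven state machine: every call checks stream.is_done() and MPI Request.Test() without blocking, and starts the next leg of each transfer as soon as its predecessor finishes. The stream side of that idiom in isolation:

import pycuda.autoinit
import pycuda.driver as drv

stream = drv.Stream()
# ... enqueue an async copy or kernel on the stream here ...

# is_done() is a non-blocking query; a driver loop can poll it and
# advance its state machine once the enqueued work has drained.
while not stream.is_done():
    pass  # in real code: do other useful work instead of spinning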
Example 9: load_data_on_gpu
def load_data_on_gpu(tl_args, module):
    d_V = module.get_global('d_V')[0]
    cuda.memcpy_htod_async(d_V, tl_args.V)

    d_c = module.get_global('d_c')[0]
    cuda.memcpy_htod_async(d_c, tl_args.c)

    d_I = module.get_global('d_I')[0]
    cuda.memcpy_htod_async(d_I, tl_args.I)

    d_E = module.get_global('d_E')[0]
    cuda.memcpy_htod_async(d_E, tl_args.E)

    d_x_0 = module.get_global('d_x_0')[0]
    cuda.memcpy_htod_async(d_x_0, tl_args.x_0)
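Here the destinations are __device__ globals looked up by name; module.get_global returns a (device pointer, size-in-bytes) pair. A minimal self-contained sketch of the same pattern, with d_V as an example symbol:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__device__ float d_V[16];
""")

d_V = mod.get_global('d_V')[0]  # first element is the device pointer
V = cuda.pagelocked_zeros((16,), np.float32)

# With the stream argument omitted, the copy goes to the default stream.
cuda.memcpy_htod_async(d_V, V)
cuda.Context.synchronize()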
Example 10: set_async
def set_async(self, ary, stream=None):
    assert ary.size == self.size
    assert ary.dtype == self.dtype
    if ary.strides != self.strides:
        from warnings import warn
        warn("Setting array from one with different strides/storage order. "
             "This will cease to work in 2013.x.",
             stacklevel=2)

    assert self.flags.forc
    if not ary.flags.forc:
        raise RuntimeError("cannot asynchronously set from "
                           "non-contiguous array")

    if self.size:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
Example 11: test_register_host_memory
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2**20,), np.float64)
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()
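The point of the test is that an ordinary allocation gets pinned after the fact. A sketch of the two routes to page-locked memory, both of which yield host arrays usable with memcpy_htod_async:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

# Route 1: allocate pinned memory up front.
a = drv.pagelocked_empty((1024,), np.float32)

# Route 2: pin an existing allocation after the fact. The base
# allocation must be suitably aligned, which aligned_empty ensures.
b = drv.aligned_empty((1024,), np.float32)
b_pin = drv.register_host_memory(b)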
Example 12: run
def run(self, scomp, scopy):
    # If we are unpacking then copy the host buffer to the GPU
    if op == 'unpack':
        cuda.memcpy_htod_async(m.data, m.hdata, scopy)
        event.record(scopy)
        scomp.wait_for_event(event)

    # Call the CUDA kernel (pack or unpack)
    fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                           v.mapping, v.strides, m,
                           v.mapping.leaddim, v.strides.leaddim,
                           m.leaddim)

    # If we have been packing then copy the GPU buffer to the host
    if op == 'pack':
        event.record(scomp)
        scopy.wait_for_event(event)
        cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
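The event dance in this example is the standard way to order work across two streams: record an event on one stream, make the other wait on it. The bare mechanism, with illustrative stream names:

import pycuda.autoinit
import pycuda.driver as drv

scopy = drv.Stream()  # dedicated copy stream
scomp = drv.Stream()  # compute stream
event = drv.Event()

# ... enqueue memcpy_htod_async(..., scopy) here ...
event.record(scopy)          # marks the point where the copy finishes
scomp.wait_for_event(event)  # work enqueued on scomp after this waits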
Example 13: set
def set(self, ary):
    """
    Copy host array to device.

    Arguments:
        ary: host array, needs to be contiguous

    Returns:
        self
    """
    stream = self.backend.stream
    assert ary.size == self.size
    assert self.is_contiguous, "Array in set() must be contiguous"
    if ary.dtype is not self.dtype:
        ary = ary.astype(self.dtype)
    assert ary.strides == tuple(self.dtype.itemsize * s for s in self.strides)

    drv.memcpy_htod_async(self.gpudata, ary, stream)
    return self
Example 14: _interp
def _interp(self, rdr, gnm, dim, ts, td):
    d_acc_size = rdr.mod.get_global('acc_size')[0]
    p_dim = self.fb.pool.allocate((len(dim),), u32)
    p_dim[:] = dim
    cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

    tref = self.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    launch('interp_palette_flat', self.mod, self.stream_a,
           256, self.info_a.palette_height,
           self.fb.d_rb, self.fb.d_seeds,
           self.src_a.d_ptimes, self.src_a.d_pals,
           f32(ts), f32(td / self.info_a.palette_height))

    nts = self.info_a.ntemporal_samples
    launch('interp_iter_params', rdr.mod, self.stream_a,
           256, np.ceil(nts / 256.),
           self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
           f32(ts), f32(td / nts), i32(nts))
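self.fb.pool here appears to be a pinned-memory pool, which avoids re-pinning a fresh staging buffer on every call; PyCUDA ships one as pycuda.tools.PageLockedMemoryPool. A sketch of that piece alone:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool

pool = PageLockedMemoryPool()
stream = cuda.Stream()

# allocate() hands out a page-locked array that returns to the pool
# when freed, so repeated async uploads skip the pinning cost.
p_dim = pool.allocate((4,), np.uint32)
p_dim[:] = [1, 2, 3, 4]

d_buf = cuda.mem_alloc(p_dim.nbytes)
cuda.memcpy_htod_async(d_buf, p_dim, stream)
stream.synchronize()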
Example 15: test_streamed_kernel
def test_streamed_kernel(self):
    # This differs from the "simple_kernel" case in that *all* computation
    # and data copying is asynchronous. Observe how this necessitates the
    # use of page-locked memory.
    mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)
    multiply_them = mod.get_function("multiply_them")

    import numpy
    shape = (32, 8)
    a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    a[:] = numpy.random.randn(*shape)
    b[:] = numpy.random.randn(*shape)

    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)

    strm = drv.Stream()
    drv.memcpy_htod_async(a_gpu, a, strm)
    drv.memcpy_htod_async(b_gpu, b, strm)
    strm.synchronize()

    dest = drv.pagelocked_empty_like(a)
    multiply_them(
        drv.Out(dest), a_gpu, b_gpu,
        block=shape + (1,), stream=strm)
    strm.synchronize()

    drv.memcpy_dtoh_async(a, a_gpu, strm)
    drv.memcpy_dtoh_async(b, b_gpu, strm)
    strm.synchronize()

    assert la.norm(dest - a * b) == 0
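As a closing variation on this test, the same building blocks give the classic copy/compute overlap: alternate chunks between two streams so one chunk's upload runs while the previous chunk's kernel executes. A rough sketch; the kernel and sizes are placeholders:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= 2.0f;
}
""")
scale = mod.get_function("scale")

n_chunks, chunk = 4, 1 << 16
host = [drv.pagelocked_zeros((chunk,), np.float32) for _ in range(n_chunks)]
dev = [drv.mem_alloc(chunk * 4) for _ in range(n_chunks)]
streams = [drv.Stream(), drv.Stream()]

for i in range(n_chunks):
    s = streams[i % 2]  # alternate streams so copies overlap kernels
    drv.memcpy_htod_async(dev[i], host[i], s)
    scale(dev[i], np.int32(chunk),
          block=(256, 1, 1), grid=(chunk // 256, 1), stream=s)
    drv.memcpy_dtoh_async(host[i], dev[i], s)

for s in streams:
    s.synchronize()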