Python driver.memcpy_dtoh_async函数代码示例

本文整理汇总了Python中pycuda.driver.memcpy_dtoh_async函数的典型用法代码示例。如果您正苦于以下问题：Python memcpy_dtoh_async函数的具体用法？Python memcpy_dtoh_async怎么用？Python memcpy_dtoh_async使用的例子？那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了memcpy_dtoh_async函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: data_finder

def data_finder(u, ss, sp, gpu_direct=True):
	data_package = data_list[u][ss][sp]
	dp = data_package.copy()

	memory_type = dp.memory_type
	if memory_type == 'devptr':
		if gpu_direct:
			devptr = data_list[u][ss][sp].devptr
			return devptr, dp
		else:
			devptr = data_list[u][ss][sp].devptr
			shape = dp.data_memory_shape
			bcmd = dp.data_contents_memory_dtype
			if log_type in ['time','all']: st = time.time()

			buf = numpy.empty((shape), dtype=bcmd)
			cuda.memcpy_dtoh_async(buf, devptr, stream=stream[1])
#			buf = cuda.from_device(devptr, shape, bcmd)
			if log_type in ['time','all']:
				u = dp.unique_id
				bytes = dp.data_bytes
				t = MPI.Wtime()-st
				ms = 1000*t
				bw = bytes/GIGA/t
				log("rank%d, \"%s\", u=%d, GPU%d data transfer from GPU memory to CPU memory, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, device_number, bytes/MEGA, ms, bw),'time', log_type)
			
			dp.memory_type = 'memory'
			dp.data_dtype = type(buf)
			return buf, dp
	else:
		data = data_list[u][ss][sp].data
		return data, dp
	return None, None

开发者ID:Anukura，项目名称:Vivaldi，代码行数:33，代码来源:GPU_unit.py

示例2: synchronize_start

    def synchronize_start(self):
        """ Start the synchronization process. """

        # Use shorter, easier names for class variables.
        bufs = self._sync_buffers
        ptrs = self._sync_ptrs
        streams = self._sync_streams
        adj = self._sync_adj

        # Start the transfer operations needed.
        self._sync_tags = [mpi_tag() for k in range(2)] # Mpi message tags.

        # Forward send.
        drv.memcpy_dtoh_async(bufs[0], ptrs['forw_src'], stream=streams[0]) 

        # Backward send.
        drv.memcpy_dtoh_async(bufs[1], ptrs['back_src'], stream=streams[1]) 

        # Forward receive.
        self._sync_req_forw = comm.Irecv(bufs[2], source=adj['back'], \
                                            tag=self._sync_tags[0])

        # Backward receive.
        self._sync_req_back = comm.Irecv(bufs[3], source=adj['forw'], \
                                            tag=self._sync_tags[1])

        # Signalling variables needed to complete transfers.
        self._sync_part2_start = [False, False, False, False]

开发者ID:JesseLu，项目名称:maxwell-solver，代码行数:28，代码来源:grid.py

示例3: send

def send(data, data_package, dest=None, gpu_direct=True):
	global s_requests
	tag = 52
	dp = data_package
	# send data_package
	send_data_package(dp, dest=dest, tag=tag)

	bytes = dp.data_bytes
	memory_type = dp.memory_type
	
	if log_type in ['time','all']: st = time.time()

	flag = False
	request = None
	if memory_type == 'devptr': # data in the GPU
		if gpu_direct: # want to use GPU direct
			devptr = data
			buf = MPI.make_buffer(devptr.__int__(), bytes)
			ctx.synchronize()
			request = comm.Isend([buf, MPI.BYTE], dest=dest, tag=57)
			if VIVALDI_BLOCKING: MPI.Request.Wait(request)
			s_requests.append((request, buf, devptr))
			flag = True
		else:# not want to use GPU direct
		
			# copy to CPU
			shape = dp.data_memory_shape
			dtype = dp.data_contents_memory_dtype
			buf = numpy.empty(shape, dtype=dtype)
			cuda.memcpy_dtoh_async(buf, data, stream=stream_list[1])

			request = comm.Isend(buf, dest=dest, tag=57)
			if VIVALDI_BLOCKING: MPI.Request.Wait(request)
			s_requests.append((request, buf, None))
			
	else: # data in the CPU
		# want to use GPU direct, not exist case
		# not want to use GPU direct
		if dp.data_dtype == numpy.ndarray: 
			request = comm.Isend(data, dest=dest, tag=57)
			if VIVALDI_BLOCKING: MPI.Request.Wait(request)
			s_requests.append((request, data, None))
			
	if log_type in ['time','all']:
		u = dp.unique_id
		bytes = dp.data_bytes
		t = MPI.Wtime()-st
		ms = 1000*t
		bw = bytes/GIGA/t
	
		if flag:
			log("rank%d, \"%s\", u=%d, from rank%d to rank%d GPU direct send, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type)
		else:
			log("rank%d, \"%s\", u=%d, from rank%d to rank%d MPI data transfer, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type)
	
	return request

开发者ID:Anukura，项目名称:Vivaldi，代码行数:56，代码来源:GPU_unit.py

示例4: run

            def run(self, scomp, scopy):
                # Pack
                kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                         v.nvcol, v.basedata, v.mapping,
                                         v.cstrides or 0, v.rstrides or 0, m)

                # Copy the packed buffer to the host
                event.record(scomp)
                scopy.wait_for_event(event)
                cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

开发者ID:Aerojspark，项目名称:PyFR，代码行数:10，代码来源:packing.py

示例5: copy

 def copy(self, fb, dim, pool, stream=None):
     fmt = 'u1'
     if self.pix_fmt in ('yuv444p10', 'yuv420p10', 'yuv444p12'):
         fmt = 'u2'
     dims =  (3, dim.h, dim.w)
     if self.pix_fmt == 'yuv420p10':
         dims = (dim.h * dim.w * 6 / 4,)
     h_out = pool.allocate(dims, fmt)
     cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
     return h_out

开发者ID:stevenrobertson，项目名称:cuburn，代码行数:10，代码来源:output.py

示例6: get_async

    def get_async(self, stream=None, ary=None):
        if ary is None:
            ary = drv.pagelocked_empty(self.shape, self.dtype)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype

        if self.size:
            drv.memcpy_dtoh_async(ary, self.gpudata, stream)
        return ary

开发者ID:minrk，项目名称:PyCUDA，代码行数:10，代码来源:gpuarray.py

示例7: get

 def get(self, stream=None):
     """
     copy device array to host.
     Returns:
         the host numpy array
     """
     assert self.is_contiguous, "Array in get() must be contiguous"
     ary = np.empty(self.shape, self.dtype)
     drv.memcpy_dtoh_async(ary, self.gpudata, stream)
     return ary

开发者ID:davidoj，项目名称:nervanagpu，代码行数:10，代码来源:nervanagpu.py

示例8: cpy_back

def cpy_back(a, a_gpu, auto_init_context=True):
    """Data transfer from device to host.
    
    Asynchronous will be enabled when auto_init_context is True, otherwise
    use normal transfer.
    """

    import pycuda.driver as drv

    if auto_init_context:
        strm = drv.Stream()
        drv.memcpy_dtoh_async(a, a_gpu, strm)
        return strm
    else:
        drv.memcpy_dtoh(a, a_gpu)

开发者ID:budiaji，项目名称:anuga-cuda，代码行数:15，代码来源:utility.py

示例9: get_async

    def get_async(self, stream=None, ary=None):
        if ary is None:
            ary = drv.pagelocked_empty(self.shape, self.dtype)

            ary = _as_strided(ary, strides=self.strides)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype
            assert ary.flags.forc

        assert self.flags.forc, "Array in get() must be contiguous"

        if self.size:
            drv.memcpy_dtoh_async(ary, self.gpudata, stream)
        return ary

开发者ID:hannes-brt，项目名称:pycuda，代码行数:15，代码来源:gpuarray.py

示例10: get_async

 def get_async(self, stream = None, ary = None):
     if ary is None:
         ary = cuda.pagelocked_empty(self.shape, self.dtype)
         
     else:
         assert ary.size == self.size
         assert ary.dtype == ary.dtype
         if ary.base.__class__ != cuda.HostAllocation:
             raise TypeError("asynchronous memory trasfer requires pagelocked numpy array")
     
     if self.size:
         if self.M == 1:
             cuda.memcpy_dtoh_async(ary, self.gpudata, stream)
         else:
             PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype, async = True, stream = stream)
             
     return ary

开发者ID:bionet，项目名称:vtem，代码行数:17，代码来源:parray.py

示例11: get_host_result

 def get_host_result(self):
     if not self.gpu_finished:
         if self.gpu_finished_evt.query():
             self.gpu_finished = True
             self.copy_stream = get_stream()
             self.host_dest = self.pagelocked_allocator(
                     self.gpu_result.shape, self.gpu_result.dtype,
                     self.copy_stream)
             drv.memcpy_dtoh_async(self.host_dest,
                     self.gpu_result.gpudata,
                     self.copy_stream)
             self.copy_finished_evt = drv.Event()
             self.copy_finished_evt.record()
     else:
         if self.copy_finished_evt.query():
             STREAM_POOL.append(self.copy_stream)
             return self.host_dest

开发者ID:FreddieWitherden，项目名称:pycuda，代码行数:17，代码来源:inner.py

示例12: run

            def run(self, scomp, scopy):
                # If we are unpacking then copy the host buffer to the GPU
                if op == 'unpack':
                    cuda.memcpy_htod_async(m.data, m.hdata, scopy)
                    event.record(scopy)
                    scomp.wait_for_event(event)

                # Call the CUDA kernel (pack or unpack)
                fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                                       v.mapping, v.strides, m,
                                       v.mapping.leaddim, v.strides.leaddim,
                                       m.leaddim)

                # If we have been packing then copy the GPU buffer to the host
                if op == 'pack':
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

开发者ID:bartwozniak，项目名称:PyFR，代码行数:18，代码来源:packing.py

示例13: test_streamed_kernel

    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32,8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=shape+(1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest-a*b) == 0

开发者ID:minrk，项目名称:PyCUDA，代码行数:41，代码来源:test_driver.py

示例14: threshold_integrated

def threshold_integrated(series, value):
    global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val
        
    t = numpy.float32(value**2)
    nb = int(numpy.ceil(float(len(series))/nt/gs))
    
    if _bn is None or len(_bn) < nb:
        _bn = gpuarray.zeros(nb, dtype=numpy.uint32)
        
    if _n is None:
        _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
        ptr = numpy.intp(_n.base.get_device_pointer())
        class T():
            pass
        _dn = T()
        _dn.gpudata = ptr
        _dn.flags = _n.flags
        
    if _loc_tmp is None or len(series) > len(_loc_tmp):
        _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _val_out = gpuarray.zeros(len(series), dtype=series.dtype)
        _val = driver.pagelocked_empty((4096*256), numpy.complex64)
        _loc = driver.pagelocked_empty((4096*256), numpy.uint32)
    
    #Do the thresholding by block
    stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1))
    
    # Recombine the blocks into a final output
    stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1))
    
    # We need to get the data back now
    pycbc.scheme.mgr.state.context.synchronize()
    if _n != 0: 
        driver.memcpy_dtoh_async(_val[0:_n], _val_out.gpudata)
        driver.memcpy_dtoh_async(_loc[0:_n], _loc_out.gpudata)
        pycbc.scheme.mgr.state.context.synchronize()
    return _loc[0:_n], _val[0:_n]

开发者ID:AbhayMK，项目名称:pycbc，代码行数:38，代码来源:threshold_cuda.py

示例15: len

    if len(shape) <= 1:
        if isinstance(src, GPUArray):
            if isinstance(dst, GPUArray):
                if async:
                    drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
                else:
                    drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
            else:
                # The arrays might be contiguous in the sense of
                # having no gaps, but the axes could be transposed
                # so that the order is neither Fortran or C.
                # So, we attempt to get a contiguous view of dst.
                dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
                if async:
                    drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                else:
                    drv.memcpy_dtoh(dst, src.gpudata)
        else:
            src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
            if async:
                drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
            else:
                drv.memcpy_htod(dst.gpudata, src)
        return

    if len(shape) == 2:
        copy = drv.Memcpy2D()
    elif len(shape) == 3:
        copy = drv.Memcpy3D()
    else:

开发者ID:Benli11，项目名称:pycuda，代码行数:30，代码来源:gpuarray.py

注：本文中的pycuda.driver.memcpy_dtoh_async函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。