本文整理汇总了Python中pycuda.driver.memcpy_dtoh_async函数的典型用法代码示例。如果您正苦于以下问题：Python memcpy_dtoh_async函数的具体用法？Python memcpy_dtoh_async怎么用？Python memcpy_dtoh_async使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了memcpy_dtoh_async函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: data_finder
def data_finder(u, ss, sp, gpu_direct=True):
    """Look up a stored data package and return its payload plus a metadata copy.

    The entry is read from the module-level ``data_list[u][ss][sp]``.  The
    returned package is the result of ``copy()`` on that entry, so the
    attribute updates made below are applied to the copy.

    Args:
        u, ss, sp: Successive keys into ``data_list``.
        gpu_direct: Only relevant when the entry's ``memory_type`` is
            ``'devptr'``.  If true, the device pointer itself is returned;
            otherwise the device data is downloaded into a new host array.

    Returns:
        A ``(payload, package_copy)`` tuple.  ``payload`` is the device
        pointer, the freshly filled host numpy array, or the entry's
        ``data`` attribute, depending on the branch taken.
    """
    entry = data_list[u][ss][sp]
    dp = entry.copy()

    # Data that is not a device pointer is handed back as stored.
    if dp.memory_type != 'devptr':
        return entry.data, dp

    devptr = entry.devptr
    if gpu_direct:
        return devptr, dp

    shape = dp.data_memory_shape
    bcmd = dp.data_contents_memory_dtype
    timed = log_type in ['time', 'all']
    if timed:
        # Start and stop must use the same clock.  The elapsed time below is
        # taken with MPI.Wtime(), so the start time has to be as well.
        st = MPI.Wtime()

    buf = numpy.empty((shape), dtype=bcmd)
    # NOTE(review): buf is ordinary (not page-locked) host memory and no
    # stream synchronisation follows -- confirm the copy has completed by
    # the time buf is returned.
    cuda.memcpy_dtoh_async(buf, devptr, stream=stream[1])

    if timed:
        uid = dp.unique_id
        nbytes = dp.data_bytes
        t = MPI.Wtime() - st
        ms = 1000*t
        # Guard against a zero reading so logging can never abort the transfer.
        bw = nbytes/GIGA/t if t > 0 else float('inf')
        log("rank%d, \"%s\", u=%d, GPU%d data transfer from GPU memory to CPU memory, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, uid, device_number, nbytes/MEGA, ms, bw),'time', log_type)

    # The copy now describes host-resident data.
    dp.memory_type = 'memory'
    dp.data_dtype = type(buf)
    return buf, dp
示例2: synchronize_start
def synchronize_start(self):
    """Begin synchronization.

    Queues two asynchronous device-to-host copies (forward and backward
    send buffers), posts two non-blocking MPI receives, and resets the
    four completion flags in ``self._sync_part2_start``.
    """
    buffers = self._sync_buffers
    device_ptrs = self._sync_ptrs
    copy_streams = self._sync_streams
    neighbours = self._sync_adj

    # One fresh MPI message tag per direction.
    self._sync_tags = [mpi_tag(), mpi_tag()]

    # Forward send uses buffer/stream 0, backward send uses buffer/stream 1.
    for slot, src_key in enumerate(('forw_src', 'back_src')):
        drv.memcpy_dtoh_async(buffers[slot], device_ptrs[src_key],
                              stream=copy_streams[slot])

    forw_tag, back_tag = self._sync_tags
    # Forward receive: comes from the 'back' neighbour.
    self._sync_req_forw = comm.Irecv(buffers[2], source=neighbours['back'],
                                     tag=forw_tag)
    # Backward receive: comes from the 'forw' neighbour.
    self._sync_req_back = comm.Irecv(buffers[3], source=neighbours['forw'],
                                     tag=back_tag)

    # Signalling variables needed to complete transfers.
    self._sync_part2_start = [False] * 4
示例3: send
def send(data, data_package, dest=None, gpu_direct=True):
    """Send a data package descriptor and then its payload to rank ``dest``.

    The descriptor is sent first through ``send_data_package`` (tag 52); the
    payload follows as a non-blocking ``comm.Isend`` with tag 57.  Every
    issued request is appended to the module-level ``s_requests`` together
    with the buffer it refers to.

    Args:
        data: Device pointer when ``data_package.memory_type == 'devptr'``,
            otherwise the host data to send.
        data_package: Descriptor providing ``data_bytes``, ``memory_type``,
            ``data_memory_shape``, ``data_contents_memory_dtype``,
            ``data_dtype`` and ``unique_id``.
        dest: Destination rank.
        gpu_direct: For device data, send straight from the device pointer
            (True) or stage through a host numpy array first (False).

    Returns:
        The MPI request of the payload send, or ``None`` when no payload was
        sent (host data whose ``data_dtype`` is not ``numpy.ndarray``).
    """
    global s_requests
    tag = 52
    dp = data_package

    # send data_package
    send_data_package(dp, dest=dest, tag=tag)

    nbytes = dp.data_bytes
    timed = log_type in ['time', 'all']
    if timed:
        # Same clock as the MPI.Wtime() reading used for the elapsed time
        # below; mixing it with time.time() makes the difference meaningless.
        st = MPI.Wtime()

    used_gpu_direct = False
    request = None

    if dp.memory_type == 'devptr':  # data in the GPU
        if gpu_direct:  # want to use GPU direct
            devptr = data
            buf = MPI.make_buffer(int(devptr), nbytes)
            ctx.synchronize()
            request = comm.Isend([buf, MPI.BYTE], dest=dest, tag=57)
            if VIVALDI_BLOCKING:
                MPI.Request.Wait(request)
            s_requests.append((request, buf, devptr))
            used_gpu_direct = True
        else:  # not want to use GPU direct
            # copy to CPU
            shape = dp.data_memory_shape
            dtype = dp.data_contents_memory_dtype
            buf = numpy.empty(shape, dtype=dtype)
            # NOTE(review): no stream synchronisation happens between this
            # copy and the Isend below -- confirm buf is complete by then.
            cuda.memcpy_dtoh_async(buf, data, stream=stream_list[1])
            request = comm.Isend(buf, dest=dest, tag=57)
            if VIVALDI_BLOCKING:
                MPI.Request.Wait(request)
            s_requests.append((request, buf, None))
    elif dp.data_dtype == numpy.ndarray:  # data in the CPU
        # GPU direct does not apply to host-resident data.
        request = comm.Isend(data, dest=dest, tag=57)
        if VIVALDI_BLOCKING:
            MPI.Request.Wait(request)
        s_requests.append((request, data, None))

    if timed:
        uid = dp.unique_id
        nbytes = dp.data_bytes
        t = MPI.Wtime() - st
        ms = 1000*t
        # Guard against a zero reading so logging can never abort the send.
        bw = nbytes/GIGA/t if t > 0 else float('inf')
        if used_gpu_direct:
            log("rank%d, \"%s\", u=%d, from rank%d to rank%d GPU direct send, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, uid, rank, dest, nbytes/MEGA, ms, bw),'time', log_type)
        else:
            log("rank%d, \"%s\", u=%d, from rank%d to rank%d MPI data transfer, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, uid, rank, dest, nbytes/MEGA, ms, bw),'time', log_type)

    return request
示例4: run
def run(self, scomp, scopy):
    """Launch the pack kernel on ``scomp`` and download the result on ``scopy``.

    Args:
        scomp: Stream on which the kernel is launched.
        scopy: Stream on which the device-to-host copy is queued.
    """
    # Pack
    pack_args = (v.n, v.nvrow, v.nvcol, v.basedata, v.mapping,
                 v.cstrides or 0, v.rstrides or 0, m)
    kern.prepared_async_call(grid, block, scomp, *pack_args)

    # The copy stream waits on an event recorded after the kernel launch,
    # then copies the packed buffer to the host.
    event.record(scomp)
    scopy.wait_for_event(event)
    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
示例5: copy
def copy(self, fb, dim, pool, stream=None):
    """Download ``fb.d_back`` into a host buffer allocated from ``pool``.

    The element format is ``'u2'`` for the pixel formats ``yuv444p10``,
    ``yuv420p10`` and ``yuv444p12`` and ``'u1'`` otherwise.  The buffer is
    shaped ``(3, dim.h, dim.w)``, except for ``yuv420p10``, which uses a
    flat buffer of ``dim.h * dim.w * 6 // 4`` elements.

    Args:
        fb: Object whose ``d_back`` attribute is the device source.
        dim: Object with ``h`` and ``w`` attributes.
        pool: Allocator exposing ``allocate(dims, fmt)``.
        stream: Stream passed through to the asynchronous copy.

    Returns:
        The host buffer returned by ``pool.allocate``; the copy into it is
        only queued here, not awaited.
    """
    wide_formats = ('yuv444p10', 'yuv420p10', 'yuv444p12')
    fmt = 'u2' if self.pix_fmt in wide_formats else 'u1'

    if self.pix_fmt == 'yuv420p10':
        # Floor division keeps the element count an int on Python 3; true
        # division would yield a float, which is not a valid array shape.
        dims = (dim.h * dim.w * 6 // 4,)
    else:
        dims = (3, dim.h, dim.w)

    h_out = pool.allocate(dims, fmt)
    cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
    return h_out
示例6: get_async
def get_async(self, stream=None, ary=None):
    """Queue an asynchronous device-to-host copy of this array.

    Args:
        stream: Stream passed to ``drv.memcpy_dtoh_async``.
        ary: Optional destination; its ``size`` and ``dtype`` must equal
            this array's.  When omitted, a page-locked array of matching
            shape and dtype is allocated.

    Returns:
        The destination array.  Nothing is copied when ``self.size`` is 0.
    """
    if ary is not None:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
    else:
        ary = drv.pagelocked_empty(self.shape, self.dtype)

    # Nothing to transfer for an empty array.
    if not self.size:
        return ary

    drv.memcpy_dtoh_async(ary, self.gpudata, stream)
    return ary
示例7: get
def get(self, stream=None):
    """Copy the device array into a newly allocated host array.

    Args:
        stream: Stream passed to ``drv.memcpy_dtoh_async``.

    Returns:
        The host numpy array that receives the data.

    Raises:
        AssertionError: If ``self.is_contiguous`` is false.
    """
    assert self.is_contiguous, "Array in get() must be contiguous"
    host = np.empty(shape=self.shape, dtype=self.dtype)
    drv.memcpy_dtoh_async(host, self.gpudata, stream)
    return host
示例8: cpy_back
def cpy_back(a, a_gpu, auto_init_context=True):
    """Transfer data from device to host.

    Args:
        a: Host destination.
        a_gpu: Device source.
        auto_init_context: When true, the copy is queued asynchronously on a
            newly created stream; when false, a plain blocking copy is used.

    Returns:
        The stream carrying the asynchronous copy, or ``None`` after a
        blocking copy.
    """
    import pycuda.driver as drv

    if not auto_init_context:
        drv.memcpy_dtoh(a, a_gpu)
        return None

    copy_stream = drv.Stream()
    drv.memcpy_dtoh_async(a, a_gpu, copy_stream)
    return copy_stream
示例9: get_async
def get_async(self, stream=None, ary=None):
    """Queue an asynchronous device-to-host copy of this array.

    Args:
        stream: Stream passed to ``drv.memcpy_dtoh_async``.
        ary: Optional destination.  It must match this array's ``size`` and
            ``dtype`` and have ``flags.forc`` set.  When omitted, a
            page-locked array is allocated and re-viewed with this array's
            strides.

    Returns:
        The destination array.  Nothing is copied when ``self.size`` is 0.

    Raises:
        AssertionError: On a mismatching destination, or when this array's
            ``flags.forc`` is false.
    """
    if ary is not None:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        assert ary.flags.forc
    else:
        pinned = drv.pagelocked_empty(self.shape, self.dtype)
        ary = _as_strided(pinned, strides=self.strides)

    assert self.flags.forc, "Array in get() must be contiguous"

    # Nothing to transfer for an empty array.
    if not self.size:
        return ary

    drv.memcpy_dtoh_async(ary, self.gpudata, stream)
    return ary
示例10: get_async
def get_async(self, stream = None, ary = None):
    """Queue an asynchronous device-to-host copy of this array.

    Args:
        stream: Stream used for the transfer.
        ary: Optional destination.  It must match this array's ``size`` and
            ``dtype`` and be backed by a ``cuda.HostAllocation``.  When
            omitted, a page-locked array of matching shape and dtype is
            allocated.

    Returns:
        The destination array.  Nothing is copied when ``self.size`` is 0.

    Raises:
        AssertionError: If ``ary`` differs from this array in size or dtype.
        TypeError: If ``ary`` is not backed by a ``cuda.HostAllocation``.
    """
    if ary is None:
        ary = cuda.pagelocked_empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        # Compare against this array's dtype; the previous check compared
        # ary.dtype with itself and therefore could never fail.
        assert ary.dtype == self.dtype
        if ary.base.__class__ != cuda.HostAllocation:
            raise TypeError("asynchronous memory trasfer requires pagelocked numpy array")

    if self.size:
        if self.M == 1:
            cuda.memcpy_dtoh_async(ary, self.gpudata, stream)
        else:
            # `async` is a reserved word from Python 3.7 on, so the keyword
            # is passed through a dict; the call PitchTrans receives is
            # unchanged.
            PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata,
                       self.ld, self.dtype, stream = stream,
                       **{'async': True})
    return ary
示例11: get_host_result
def get_host_result(self):
    """Poll for the host copy of the GPU result without blocking.

    While ``self.gpu_finished`` is false, the GPU event is queried; once it
    reports completion, a stream is obtained from ``get_stream()``, a host
    destination is allocated, the device-to-host copy is queued and a new
    event is recorded.  On later calls the copy event is queried; once it
    reports completion, the stream is appended to ``STREAM_POOL`` and the
    host array is returned.

    Returns:
        ``self.host_dest`` once the copy event reports completion,
        otherwise ``None``.
    """
    if self.gpu_finished:
        # Copy already queued: hand out the result once it has landed.
        if not self.copy_finished_evt.query():
            return None
        STREAM_POOL.append(self.copy_stream)
        return self.host_dest

    if not self.gpu_finished_evt.query():
        return None

    # GPU work is done: start the download and remember how to poll it.
    self.gpu_finished = True
    self.copy_stream = get_stream()
    result = self.gpu_result
    self.host_dest = self.pagelocked_allocator(
        result.shape, result.dtype, self.copy_stream)
    drv.memcpy_dtoh_async(self.host_dest, result.gpudata, self.copy_stream)
    self.copy_finished_evt = drv.Event()
    self.copy_finished_evt.record()
    return None
示例12: run
def run(self, scomp, scopy):
    """Run the pack or unpack kernel with its accompanying host transfer.

    For ``op == 'unpack'`` the host buffer is uploaded on ``scopy`` before
    the kernel; for ``op == 'pack'`` the device buffer is downloaded on
    ``scopy`` after the kernel.  The kernel itself is launched on ``scomp``.

    Args:
        scomp: Stream on which the kernel is launched.
        scopy: Stream on which the host/device copy is queued.
    """
    def chain(first, then):
        # Record the shared event on `first` and make `then` wait for it.
        event.record(first)
        then.wait_for_event(event)

    # If we are unpacking then copy the host buffer to the GPU
    if op == 'unpack':
        cuda.memcpy_htod_async(m.data, m.hdata, scopy)
        chain(scopy, scomp)

    # Call the CUDA kernel (pack or unpack)
    kernel_args = (v.nrow, v.ncol, v.mapping, v.strides, m,
                   v.mapping.leaddim, v.strides.leaddim, m.leaddim)
    fn.prepared_async_call(grid, block, scomp, *kernel_args)

    # If we have been packing then copy the GPU buffer to the host
    if op == 'pack':
        chain(scomp, scopy)
        cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
示例13: test_streamed_kernel
def test_streamed_kernel(self):
    """Multiply two random arrays on the GPU using only stream-based calls.

    Unlike the "simple_kernel" case, all computation and data copying here
    is asynchronous, which is why every host array is page-locked.
    """
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x*blockDim.y + threadIdx.y;
dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    import numpy
    shape = (32,8)

    # Page-locked host inputs, filled with random values in order (a, b).
    a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    for host in (a, b):
        host[:] = numpy.random.randn(*shape)

    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)

    # Upload both inputs on one stream and wait for them.
    strm = drv.Stream()
    for device, host in ((a_gpu, a), (b_gpu, b)):
        drv.memcpy_htod_async(device, host, strm)
    strm.synchronize()

    # Launch the kernel on the same stream; one thread per element.
    dest = drv.pagelocked_empty_like(a)
    multiply_them(
        drv.Out(dest), a_gpu, b_gpu,
        block=shape+(1,), stream=strm)
    strm.synchronize()

    # Download the inputs again and compare against the host product.
    for host, device in ((a, a_gpu), (b, b_gpu)):
        drv.memcpy_dtoh_async(host, device, strm)
    strm.synchronize()

    assert la.norm(dest-a*b) == 0
示例14: threshold_integrated
def threshold_integrated(series, value):
    """Run the two threshold kernels over ``series`` and fetch the results.

    Module-level buffers are created on first use and re-created when
    ``series`` is longer than the existing ones.  ``stuff`` is launched per
    block, ``stuff2`` combines the per-block results, and the first ``n``
    entries of the location and value outputs are copied to the host,
    where ``n`` is the count read from the one-element host array ``_n``.

    Args:
        series: Input object providing ``data``, ``dtype`` and ``len()``.
        value: Threshold; its square (as float32) is passed to the kernel.

    Returns:
        ``(locations, values)``: the first ``n`` entries of the host
        buffers ``_loc`` and ``_val``.
    """
    global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val

    t = numpy.float32(value**2)
    nb = int(numpy.ceil(float(len(series))/nt/gs))

    if _bn is None or len(_bn) < nb:
        _bn = gpuarray.zeros(nb, dtype=numpy.uint32)

    if _n is None:
        # NOTE(review): this line uses both `driver` and `drv`; presumably
        # both names refer to pycuda.driver -- confirm against the imports.
        _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
        ptr = numpy.intp(_n.base.get_device_pointer())
        # Minimal holder exposing the `gpudata` and `flags` attributes so
        # the mapped host counter can be passed to the kernel call below.
        class T():
            pass
        _dn = T()
        _dn.gpudata = ptr
        _dn.flags = _n.flags

    if _loc_tmp is None or len(series) > len(_loc_tmp):
        _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _val_out = gpuarray.zeros(len(series), dtype=series.dtype)
        _val = driver.pagelocked_empty((4096*256), numpy.complex64)
        _loc = driver.pagelocked_empty((4096*256), numpy.uint32)

    # Do the thresholding by block
    stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1))

    # Recombine the blocks into a final output
    stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1))

    # We need to get the data back now
    pycbc.scheme.mgr.state.context.synchronize()

    # Convert the one-element counter to a plain int once: a 1-d array is
    # not accepted as a slice bound by current NumPy releases.
    count = int(_n[0])
    if count != 0:
        driver.memcpy_dtoh_async(_val[0:count], _val_out.gpudata)
        driver.memcpy_dtoh_async(_loc[0:count], _loc_out.gpudata)
        pycbc.scheme.mgr.state.context.synchronize()
    return _loc[0:count], _val[0:count]
示例15: len
if len(shape) <= 1:
if isinstance(src, GPUArray):
if isinstance(dst, GPUArray):
if async:
drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
else:
drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
else:
# The arrays might be contiguous in the sense of
# having no gaps, but the axes could be transposed
# so that the order is neither Fortran or C.
# So, we attempt to get a contiguous view of dst.
dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
if async:
drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
else:
drv.memcpy_dtoh(dst, src.gpudata)
else:
src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
if async:
drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
else:
drv.memcpy_htod(dst.gpudata, src)
return
if len(shape) == 2:
copy = drv.Memcpy2D()
elif len(shape) == 3:
copy = drv.Memcpy3D()
else: