This article collects typical usage examples of the Python function pycuda.driver.memcpy_htod_async. If you have been wondering what memcpy_htod_async does, or how to use it in practice, the curated examples below should help.
The following 15 code examples of memcpy_htod_async are presented, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the site recommend better Python code samples.
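Before the examples, here is a minimal sketch of the basic call pattern. It assumes a single device initialized via pycuda.autoinit; the variable names are illustrative only:

import numpy as np
import pycuda.autoinit  # creates a context on the default device
import pycuda.driver as drv

# memcpy_htod_async enqueues a host-to-device copy on a stream and
# returns immediately. The copy can only overlap other work when the
# host buffer is page-locked; a plain numpy array degrades to an
# effectively synchronous transfer.
host = drv.pagelocked_zeros((1024,), dtype=np.float32)
host[:] = np.random.randn(1024)

dev = drv.mem_alloc(host.nbytes)
stream = drv.Stream()
drv.memcpy_htod_async(dev, host, stream)  # queued, not yet complete
stream.synchronize()                      # wait for the copy to drain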
Example 1: set
def set(self, ary, device=None):
    """
    Copy host array to device.

    Arguments:
        ary: host array, needs to be contiguous
        device: device id, if not the one attached to current context

    Returns:
        self
    """
    stream = self.backend.stream
    assert ary.size == self.size
    assert self.is_contiguous, "Array in set() must be contiguous"
    if ary.dtype is not self.dtype:
        ary = ary.astype(self.dtype)
    assert ary.strides == self.strides

    if device is None:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
    else:
        # With multithreaded datasets, make a context before copying
        # and destroy it again once done.
        lctx = drv.Device(device).make_context()
        drv.memcpy_htod_async(self.gpudata, ary, stream)
        lctx.pop()
        del lctx

    return self
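The notable branch here is the device argument: a temporary context is pushed onto the target device just long enough to issue the copy. A standalone sketch of that pattern, where copy_to_device and devptr are illustrative names rather than part of the example's backend:

import pycuda.driver as drv

drv.init()

def copy_to_device(devptr, host_ary, device_id, stream=None):
    # Push a fresh context on the target device, issue the copy,
    # then pop so the caller's context becomes current again.
    ctx = drv.Device(device_id).make_context()
    try:
        drv.memcpy_htod_async(devptr, host_ary, stream)
    finally:
        ctx.pop()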
Example 2: set_refsmiles
def set_refsmiles(self, refsmilesmat, refcountsmat, reflengths, refmags=None):  # {{{
    """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
    and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
    vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

    Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
    no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
    """
    # Set up Lingo and count matrices on device #{{{
    if self.usePycudaArray:
        # Set up using PyCUDA CUDA array support
        self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat, order='C')
        self.gpu.rcounts = cuda.matrix_to_array(refcountsmat, order='C')
        self.gpu.tex2lr.set_array(self.gpu.rsmiles)
        self.gpu.tex2cr.set_array(self.gpu.rcounts)
    else:
        # Manually handle setup
        temprlmat = self._padded_array(refsmilesmat)
        if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
            raise ValueError("Error: reference matrix is not allowed to have more than 64K columns "
                             "(LINGOs) or 32K rows (molecules) (both padded to multiple of 16). "
                             "Dimensions = (%d,%d)." % temprlmat.shape)
        self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rsmiles, temprlmat, stream=self.gpu.stream)

        temprcmat = self._padded_array(refcountsmat)
        self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rcounts, temprcmat, stream=self.gpu.stream)

        descriptor = cuda.ArrayDescriptor()
        descriptor.width = temprcmat.shape[1]
        descriptor.height = temprcmat.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles, descriptor, temprlmat.strides[0])
        self.gpu.tex2cr.set_address_2d(self.gpu.rcounts, descriptor, temprcmat.strides[0])
        self.gpu.stream.synchronize()
        del temprlmat
        del temprcmat
    # }}}

    self.rlengths = reflengths
    self.rshape = refsmilesmat.shape
    self.nref = refsmilesmat.shape[0]

    # Copy reference lengths to GPU
    self.gpu.rl_gpu = cuda.to_device(reflengths)

    # Allocate buffer for reference set magnitudes
    self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
    if refmags is not None:
        cuda.memcpy_htod(self.gpu.rmag_gpu, refmags)
    else:
        # Calculate reference set magnitudes on GPU
        magthreads = 256
        self.gpu.refMagKernel(self.gpu.rmag_gpu, self.gpu.rl_gpu, numpy.int32(self.nref),
                              block=(magthreads, 1, 1), grid=(30, 1),
                              shared=magthreads * 4, texrefs=[self.gpu.tex2cr])
    return
Example 3: set_async
def set_async(self, ary, stream=None):
    assert ary.size == self.size
    assert ary.dtype == self.dtype
    assert self.flags.forc
    if not ary.flags.forc:
        raise RuntimeError("cannot asynchronously set from "
                           "non-contiguous array")
    if self.size:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
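This appears to be the method from pycuda.gpuarray.GPUArray, so it can be exercised directly. A small usage sketch with arbitrary sizes:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()

# ary.flags.forc is true for C- or Fortran-contiguous arrays; the
# host buffer must also be page-locked for the copy to be truly async.
host = drv.pagelocked_zeros((256,), np.float32)
host[:] = 1.0

dev = gpuarray.empty((256,), np.float32)
dev.set_async(host, stream=stream)
stream.synchronize()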
Example 4: set
def set(self, tensor, data):
    assert isinstance(tensor, MGPUTensor)
    if tensor.ptype == 'replica':
        for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, data, strm)
            ctx.pop()
        # tensor.copy_from(data)
    else:
        self.scatter(data, tensor)
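Unlike Example 1, the contexts here are created once and merely pushed and popped around each copy, with one stream per context so the per-device copies stay independent. A reduced sketch of that setup (ctxs, strms, and broadcast are illustrative names):

import pycuda.driver as drv

drv.init()

ctxs, strms = [], []
for i in range(drv.Device.count()):
    ctx = drv.Device(i).make_context()  # becomes the current context
    strms.append(drv.Stream())          # stream belongs to this context
    ctxs.append(ctx)
    ctx.pop()                           # deactivate, keep for later use

def broadcast(dev_ptrs, data):
    # Replicate a pinned host buffer to one device pointer per GPU.
    for ptr, strm, ctx in zip(dev_ptrs, strms, ctxs):
        ctx.push()
        drv.memcpy_htod_async(ptr, data, strm)
        ctx.pop()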
Example 5: exchange
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
    ctx1 = cuda.Device(dev1).make_context()
    a = cuda.from_device(int(a_gpu) + (nx - 2) * ny * nof, (ny,), np.float32)
    ctx1.pop()

    ctx2 = cuda.Device(dev2).make_context()
    cuda.memcpy_htod(int(b_gpu), a)
    b = cuda.from_device(int(b_gpu) + ny * nof, (ny,), np.float32)
    ctx2.pop()

    ctx1 = cuda.Device(dev1).make_context()
    cuda.memcpy_htod_async(int(a_gpu) + (nx - 1) * ny * nof, b)
    ctx1.pop()
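The exchange stages a boundary column through the host: from_device reads it out under one device's context and memcpy_htod writes it back under the other (nof is presumably the byte size of a float32). The same idea in a stripped-down helper, where copy_between_devices is an illustrative name:

import numpy as np
import pycuda.driver as cuda

cuda.init()

def copy_between_devices(src_ptr, dst_ptr, nbytes, dev_src, dev_dst):
    # Read the bytes out of the source device...
    ctx = cuda.Device(dev_src).make_context()
    staging = cuda.from_device(src_ptr, (nbytes,), np.uint8)
    ctx.pop()

    # ...and write them into the destination device's allocation.
    ctx = cuda.Device(dev_dst).make_context()
    cuda.memcpy_htod(dst_ptr, staging)
    ctx.pop()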
Example 6: kernel_write
def kernel_write(function_name, dest_devptr, dest_info, source_devptr, source_info, work_range, stream=None):
    global KD

    # initialize variables
    global tb_cnt
    tb_cnt = 0

    # dest
    cuda_args = [dest_devptr]
    cuda_args += [dest_info]

    # source
    cuda_args += [source_devptr]
    cuda_args += [source_info]

    # work_range
    cuda_args += make_cuda_list(work_range)

    # initialize model view
    eye = numpy.eye(4, dtype=numpy.float32)
    cuda.memcpy_htod_async(mmtx, eye, stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, eye, stream=stream)

    try:
        if Debug:
            print("Function name: ", function_name)
        func = mod.get_function(function_name)  # cutting function
    except:
        print("Function not found ERROR")
        print("Function name: ", function_name)
        assert False

    # set work range
    block, grid = range_to_block_grid(work_range)

    if log_type in ['time', 'all']:
        st = time.time()

    func(*cuda_args, block=block, grid=grid, stream=stream)
    # ctx.synchronize()

    KD.append((dest_info, source_info))

    if log_type in ['time', 'all']:
        bytes = make_bytes(work_range, 3)
        t = MPI.Wtime() - st
        ms = 1000 * t
        bw = bytes / GIGA / t
        log("rank%d, GPU%d, , kernel write time, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec "
            % (rank, device_number, bytes / MEGA, ms, bw), 'time', log_type)
Example 7: set_async
def set_async(self, ary, stream=None):
    assert ary.ndim <= 3
    assert ary.dtype == self.dtype
    assert ary.size == self.size
    if ary.base.__class__ != cuda.HostAllocation:
        raise TypeError("asynchronous memory transfer requires pagelocked numpy array")
    if self.size:
        if self.M == 1:
            cuda.memcpy_htod_async(self.gpudata, ary, stream)
        else:
            PitchTrans(self.shape, self.gpudata, self.ld, ary, _pd(self.shape),
                       self.dtype, async=True, stream=stream)
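PitchTrans is a helper from the example's own library for pitched (2D) transfers. In plain PyCUDA, the equivalent pitched host-to-device copy can be expressed with Memcpy2D, which also accepts a stream; a sketch assuming a C-contiguous pinned host array:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

rows, cols = 64, 100
host = drv.pagelocked_zeros((rows, cols), np.float32)

# mem_alloc_pitch pads each device row so rows start aligned.
dev, pitch = drv.mem_alloc_pitch(cols * host.dtype.itemsize, rows,
                                 host.dtype.itemsize)

cpy = drv.Memcpy2D()
cpy.set_src_host(host)
cpy.set_dst_device(dev)
cpy.src_pitch = cols * host.dtype.itemsize  # tightly packed host rows
cpy.dst_pitch = pitch                       # padded device rows
cpy.width_in_bytes = cols * host.dtype.itemsize
cpy.height = rows

stream = drv.Stream()
cpy(stream)  # asynchronous when called with a stream
stream.synchronize()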
Example 8: synchronize_isdone
def synchronize_isdone(self):
    """Complete synchronization process."""
    # Use shorter, easier names for class variables.
    bufs = self._sync_buffers
    ptrs = self._sync_ptrs
    streams = self._sync_streams
    adj = self._sync_adj
    part2_start = self._sync_part2_start
    is_done = [False, False, False, False]

    # Forward send.
    if streams[0].is_done():  # Device-to-host copy completed.
        if not part2_start[0]:  # Initialize MPI send.
            comm.Isend(bufs[0], dest=adj['forw'], tag=self._sync_tags[0])
            part2_start[0] = True
            is_done[0] = True
        else:  # No more work to do.
            is_done[0] = True

    # Backward send.
    if streams[1].is_done():  # Device-to-host copy completed.
        if not part2_start[1]:  # Initialize MPI send.
            comm.Isend(bufs[1], dest=adj['back'], tag=self._sync_tags[1])
            part2_start[1] = True
            is_done[1] = True
        else:  # No more work to do.
            is_done[1] = True

    # Forward receive.
    if self._sync_req_forw.Test():  # MPI receive completed.
        if not part2_start[2]:  # Initialize host-to-device copy.
            drv.memcpy_htod_async(ptrs['back_dest'], bufs[2],
                                  stream=streams[2])  # Host-to-device.
            part2_start[2] = True
        elif streams[2].is_done():  # Host-to-device copy completed.
            is_done[2] = True

    # Backward receive.
    if self._sync_req_back.Test():  # MPI receive completed.
        if not part2_start[3]:  # Initialize host-to-device copy.
            drv.memcpy_htod_async(ptrs['forw_dest'], bufs[3],
                                  stream=streams[3])  # Host-to-device.
            part2_start[3] = True
        elif streams[3].is_done():  # Host-to-device copy completed.
            is_done[3] = True

    # print '~', is_done[0:4],

    # Return true only when all four transfers are complete.
    return all(is_done)
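The overall shape is a poll-driven state machine: every call checks stream.is_done() and MPI Request.Test() without blocking, and starts the next leg of each transfer as soon as its predecessor finishes. The stream side of that idiom in isolation:

import pycuda.autoinit
import pycuda.driver as drv

stream = drv.Stream()
# ... enqueue an async copy or kernel on the stream here ...

# is_done() is a non-blocking query; a driver loop can poll it and
# advance its state machine once the enqueued work has drained.
while not stream.is_done():
    pass  # in real code: do other useful work instead of spinning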
Example 9: load_data_on_gpu
def load_data_on_gpu(tl_args, module):
    d_V = module.get_global('d_V')[0]
    cuda.memcpy_htod_async(d_V, tl_args.V)

    d_c = module.get_global('d_c')[0]
    cuda.memcpy_htod_async(d_c, tl_args.c)

    d_I = module.get_global('d_I')[0]
    cuda.memcpy_htod_async(d_I, tl_args.I)

    d_E = module.get_global('d_E')[0]
    cuda.memcpy_htod_async(d_E, tl_args.E)

    d_x_0 = module.get_global('d_x_0')[0]
    cuda.memcpy_htod_async(d_x_0, tl_args.x_0)
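Here the destinations are __device__ globals looked up by name; module.get_global returns a (device pointer, size-in-bytes) pair. A minimal self-contained sketch of the same pattern, with d_V as an example symbol:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__device__ float d_V[16];
""")

d_V = mod.get_global('d_V')[0]  # first element is the device pointer
V = cuda.pagelocked_zeros((16,), np.float32)

# With the stream argument omitted, the copy goes to the default stream.
cuda.memcpy_htod_async(d_V, V)
cuda.Context.synchronize()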
Example 10: set_async
def set_async(self, ary, stream=None):
    assert ary.size == self.size
    assert ary.dtype == self.dtype
    if ary.strides != self.strides:
        from warnings import warn
        warn("Setting array from one with different strides/storage order. "
             "This will cease to work in 2013.x.",
             stacklevel=2)

    assert self.flags.forc
    if not ary.flags.forc:
        raise RuntimeError("cannot asynchronously set from "
                           "non-contiguous array")

    if self.size:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
Example 11: test_register_host_memory
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2**20,), np.float64)
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()
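The point of the test is that an ordinary allocation gets pinned after the fact. A sketch of the two routes to page-locked memory, both of which yield host arrays usable with memcpy_htod_async:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

# Route 1: allocate pinned memory up front.
a = drv.pagelocked_empty((1024,), np.float32)

# Route 2: pin an existing allocation after the fact. The base
# allocation must be suitably aligned, which aligned_empty ensures.
b = drv.aligned_empty((1024,), np.float32)
b_pin = drv.register_host_memory(b)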
Example 12: run
def run(self, scomp, scopy):
    # If we are unpacking then copy the host buffer to the GPU
    if op == 'unpack':
        cuda.memcpy_htod_async(m.data, m.hdata, scopy)
        event.record(scopy)
        scomp.wait_for_event(event)

    # Call the CUDA kernel (pack or unpack)
    fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                           v.mapping, v.strides, m,
                           v.mapping.leaddim, v.strides.leaddim,
                           m.leaddim)

    # If we have been packing then copy the GPU buffer to the host
    if op == 'pack':
        event.record(scomp)
        scopy.wait_for_event(event)
        cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
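The event dance in this example is the standard way to order work across two streams: record an event on one stream, make the other wait on it. The bare mechanism, with illustrative stream names:

import pycuda.autoinit
import pycuda.driver as drv

scopy = drv.Stream()  # dedicated copy stream
scomp = drv.Stream()  # compute stream
event = drv.Event()

# ... enqueue memcpy_htod_async(..., scopy) here ...
event.record(scopy)          # marks the point where the copy finishes
scomp.wait_for_event(event)  # work enqueued on scomp after this waits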
Example 13: set
def set(self, ary):
    """
    Copy host array to device.

    Arguments:
        ary: host array, needs to be contiguous

    Returns:
        self
    """
    stream = self.backend.stream
    assert ary.size == self.size
    assert self.is_contiguous, "Array in set() must be contiguous"
    if ary.dtype is not self.dtype:
        ary = ary.astype(self.dtype)
    assert ary.strides == tuple(self.dtype.itemsize * s for s in self.strides)

    drv.memcpy_htod_async(self.gpudata, ary, stream)
    return self
Example 14: _interp
def _interp(self, rdr, gnm, dim, ts, td):
    d_acc_size = rdr.mod.get_global('acc_size')[0]
    p_dim = self.fb.pool.allocate((len(dim),), u32)
    p_dim[:] = dim
    cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

    tref = self.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    launch('interp_palette_flat', self.mod, self.stream_a,
           256, self.info_a.palette_height,
           self.fb.d_rb, self.fb.d_seeds,
           self.src_a.d_ptimes, self.src_a.d_pals,
           f32(ts), f32(td / self.info_a.palette_height))

    nts = self.info_a.ntemporal_samples
    launch('interp_iter_params', rdr.mod, self.stream_a,
           256, np.ceil(nts / 256.),
           self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
           f32(ts), f32(td / nts), i32(nts))
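self.fb.pool here appears to be a pinned-memory pool, which avoids re-pinning a fresh staging buffer on every call; PyCUDA ships one as pycuda.tools.PageLockedMemoryPool. A sketch of that piece alone:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool

pool = PageLockedMemoryPool()
stream = cuda.Stream()

# allocate() hands out a page-locked array that returns to the pool
# when freed, so repeated async uploads skip the pinning cost.
p_dim = pool.allocate((4,), np.uint32)
p_dim[:] = [1, 2, 3, 4]

d_buf = cuda.mem_alloc(p_dim.nbytes)
cuda.memcpy_htod_async(d_buf, p_dim, stream)
stream.synchronize()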
Example 15: test_streamed_kernel
def test_streamed_kernel(self):
    # This differs from the "simple_kernel" case in that *all* computation
    # and data copying is asynchronous. Observe how this necessitates the
    # use of page-locked memory.
    mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)
    multiply_them = mod.get_function("multiply_them")

    import numpy
    shape = (32, 8)
    a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    a[:] = numpy.random.randn(*shape)
    b[:] = numpy.random.randn(*shape)

    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)

    strm = drv.Stream()
    drv.memcpy_htod_async(a_gpu, a, strm)
    drv.memcpy_htod_async(b_gpu, b, strm)
    strm.synchronize()

    dest = drv.pagelocked_empty_like(a)
    multiply_them(
        drv.Out(dest), a_gpu, b_gpu,
        block=shape + (1,), stream=strm)
    strm.synchronize()

    drv.memcpy_dtoh_async(a, a_gpu, strm)
    drv.memcpy_dtoh_async(b, b_gpu, strm)
    strm.synchronize()

    assert la.norm(dest - a * b) == 0
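As a closing variation on this test, the same building blocks give the classic copy/compute overlap: alternate chunks between two streams so one chunk's upload runs while the previous chunk's kernel executes. A rough sketch; the kernel and sizes are placeholders:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= 2.0f;
}
""")
scale = mod.get_function("scale")

n_chunks, chunk = 4, 1 << 16
host = [drv.pagelocked_zeros((chunk,), np.float32) for _ in range(n_chunks)]
dev = [drv.mem_alloc(chunk * 4) for _ in range(n_chunks)]
streams = [drv.Stream(), drv.Stream()]

for i in range(n_chunks):
    s = streams[i % 2]  # alternate streams so copies overlap kernels
    drv.memcpy_htod_async(dev[i], host[i], s)
    scale(dev[i], np.int32(chunk),
          block=(256, 1, 1), grid=(chunk // 256, 1), stream=s)
    drv.memcpy_dtoh_async(host[i], dev[i], s)

for s in streams:
    s.synchronize()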