This article collects typical usage examples of the pyopencl.enqueue_copy function in Python. If you have been wondering what enqueue_copy does, how to call it, or what real code that uses it looks like, the hand-picked examples here should help.
The following 15 code examples of enqueue_copy are shown, sorted by popularity by default.
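Before the examples, here is a minimal sketch of the typical enqueue_copy round trip (host array to device buffer, run a kernel, copy the result back), assuming only standard PyOpenCL and NumPy; the kernel and variable names are illustrative and do not come from any of the examples below.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

a_host = np.arange(16, dtype=np.float32)
# Host -> device: COPY_HOST_PTR uploads a_host when the buffer is created
a_dev = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_host)
out_dev = cl.Buffer(ctx, mf.WRITE_ONLY, a_host.nbytes)

prg = cl.Program(ctx, """
__kernel void twice(__global const float *a, __global float *out) {
    int gid = get_global_id(0);
    out[gid] = 2.0f * a[gid];
}
""").build()
prg.twice(queue, a_host.shape, None, a_dev, out_dev)

# Device -> host: enqueue_copy(queue, destination, source); with a host
# ndarray destination it blocks by default, so out_host is ready afterwards
out_host = np.empty_like(a_host)
cl.enqueue_copy(queue, out_host, out_dev)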
Example 1: fromGpu
def fromGpu(self, gpu_buf, target_shape, target_dtype):
    import numpy
    import pyopencl as cl
    # Allocate a host array with the requested shape and dtype to receive the device data
    data = numpy.empty(target_shape, target_dtype)
    queue = self._createQueue()
    cl.enqueue_copy(queue, data, gpu_buf, is_blocking=True)
    return data
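For contrast, a hypothetical host-to-device counterpart could look like the following; toGpu is not part of the original example, and the direction of the transfer simply follows enqueue_copy's (queue, destination, source) argument order.

def toGpu(self, data):
    # Hypothetical sketch, assuming the same _createQueue helper as fromGpu
    import pyopencl as cl
    queue = self._createQueue()
    gpu_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, data.nbytes)
    # Host -> device: data is the source, gpu_buf the destination
    cl.enqueue_copy(queue, gpu_buf, data, is_blocking=True)
    return gpu_buf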
Example 2: _set
def _set(self, ary):
    # Allocate a new buffer with suitable padding and assign
    buf = np.zeros(self.datashape, dtype=self.dtype)
    buf[..., :self.ioshape[-1]] = ary
    # Copy
    cl.enqueue_copy(self.backend.qdflt, self.data, buf)
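A hypothetical _get counterpart (not in the original example, assuming the same self.datashape, self.ioshape, self.dtype, self.data and self.backend.qdflt attributes) would copy the padded device buffer back and strip the padding:

def _get(self):
    # Device -> host into a padded staging array, then drop the padding columns
    buf = np.empty(self.datashape, dtype=self.dtype)
    cl.enqueue_copy(self.backend.qdflt, buf, self.data, is_blocking=True)
    return buf[..., :self.ioshape[-1]]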
Example 3: run
def run(self, kernel, shape, *args):
    kargs = []
    for arg in args:
        if isinstance(arg, np.ndarray):
            if id(arg) in self.buffers:
                buf = self.buffers[id(arg)]
                cl.enqueue_copy(self.runtime.queues[0], buf, arg)
            else:
                flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
                buf = cl.Buffer(self.runtime.context, flags, arg.nbytes, hostbuf=arg)
                self.buffers[id(arg)] = buf
            kargs.append(buf)
        else:
            kargs.append(np.float32(arg))
    # TODO: use user-supplied information if necessary
    first_np_array = [a for a in args if isinstance(a, np.ndarray)][0]
    workspace = shape if shape else first_np_array.shape
    if self.output is None:
        self.output = np.empty(workspace).astype(np.float32)
        out_buffer = cl.Buffer(self.runtime.context, cl.mem_flags.WRITE_ONLY, self.output.nbytes)
        self.buffers[id(self.output)] = out_buffer
    else:
        out_buffer = self.buffers[id(self.output)]
    kargs.append(out_buffer)
    start = time.time()
    kernel(self.runtime.queues[0], workspace, None, *kargs)
    # enqueue_copy with a host ndarray destination blocks by default,
    # so the elapsed time includes both the kernel and the transfer.
    cl.enqueue_copy(self.runtime.queues[0], self.output, out_buffer)
    self.time = time.time() - start
    return self.output
Example 4: calc_range
def calc_range(start, num, perexec):
    """Calculate the otp-md5 of the 64-bit numbers in range(start, start + num),
    with the OTP sequence of rounds, processing perexec numbers per kernel launch."""
    assert(num % perexec == 0)
    # Boilerplate OpenCL stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    # Read the program source and compile
    sourcecode = open("otpmd5.cl").read()
    prg = cl.Program(ctx, sourcecode).build()
    for i in xrange(num / perexec):
        offset = start + (perexec * i)
        host_input = numpy.arange(offset, offset+perexec, dtype=numpy.uint64)
        result = numpy.empty_like(host_input)
        dev_input = cl.Buffer(ctx, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=host_input)
        dev_output = cl.Buffer(ctx, mf.READ_WRITE, size=result.size * result.itemsize)
        prg.get_otpmd5_64k_rounds(queue, host_input.shape, None, dev_input, dev_output).wait()
        cl.enqueue_copy(queue, result, dev_output).wait()
        send_output(host_input, result)
Example 5: generate
def generate(self, chunk_array, ctx, queue, heightmap_kernel):
    assert isinstance(chunk_array, ChunkArray)
    hmap = self._generate_hmap()
    x_bounds = (0, 8)
    y_bounds = (0, 8)
    for z in range(1):
        chunk_array.allocate_layer(z, x_bounds, y_bounds)
        for x in range(8):
            for y in range(8):
                chunk_array.allocate_chunk(x, y, z, level=0)
    print("allocated!")
    ihmap = numpy.empty((256, 256), dtype=numpy.int32)
    for x in range(256):
        for y in range(256):
            height = hmap[x, y]
            ihmap[x, y] = int(max(min(height*7.4 + 8, 32), 0))
    """for x in range(256):
        print(x)
        for y in range(256):
            height = hmap[x, y]
            z_max = int(max(min(height + 8, 32), 0))
            for z in range(32):
                voxel = chunk_array.get_voxel(x, y, z)
                voxel['flags'] = 0 if z_max < z else 1"""
    chunk_array.upload_buffers()
    buffer = pyopencl.Buffer(ctx, pyopencl.mem_flags.READ_ONLY|pyopencl.mem_flags.COPY_HOST_PTR, hostbuf=ihmap)
    #pyopencl.enqueue_copy(queue, buffer, hmap)
    heightmap_kernel(queue, (255, 255, 32), None, chunk_array.array_buffer._d_buffer, buffer)
    pyopencl.enqueue_copy(queue, chunk_array.voxel_data.level_buffers[0]._h_buffer, chunk_array.voxel_data.level_buffers[0]._d_buffer)
    chunk_array.upload_buffers()
Example 6: final
def final(config, ctx, queue, program, buffers, debug=False):
    matrixSize = config['matrixSize']
    bandwidth = config['bandwidth']
    partitionNumber = config['partitionNumber']
    partitionSize = config['partitionSize']
    offdiagonalSize = config['offdiagonalSize']
    rhsSize = config['rhsSize']
    xo = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)
    tmp = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)
    mf = cl.mem_flags
    xo_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=xo)
    tmp_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=tmp)
    kernel = program.reconstruct
    kernel.set_scalar_arg_dtypes([None, None, None, None, np.int32, np.int32, np.int32])
    cl.enqueue_barrier(queue)
    kernel(
        queue,
        (partitionNumber,),
        None,
        buffers[1],  # Avwg buffer from factor; check that it is also readable and still valid
        buffers[3],  # x buffer from solve; check that it is still valid
        xo_buf,
        tmp_buf,
        np.int32(partitionSize),
        np.int32(offdiagonalSize),
        np.int32(rhsSize)
    )
    xtb = np.ones((partitionNumber * 2 * offdiagonalSize, rhsSize), dtype=np.float32)
    cl.enqueue_copy(queue, xtb, buffers[3])
    if (debug) :
        print "X(t,b):"
        print xtb
    cl.enqueue_copy(queue, xo, xo_buf)
    if (debug) :
        print "X':"
        print xo
    xtb = sparse.csr_matrix(xtb)
    xo = sparse.csr_matrix(xo)
    x = []
    for i in range(0, partitionNumber) :
        t = i * (2 * offdiagonalSize)
        b = (i + 1) * (2 * offdiagonalSize)
        u = i * (partitionSize - 2 * offdiagonalSize)
        v = (i + 1) * (partitionSize - 2 * offdiagonalSize)
        x.append(xtb[t : t + offdiagonalSize, 0 : rhsSize])
        x.append(xo[u : v, 0 : rhsSize])
        x.append(xtb[b - offdiagonalSize : b, 0 : rhsSize])
    return sp.sparse.vstack(x)
Example 7: lnlikelihood_ocl
def lnlikelihood_ocl(self, pv):
    self._lnl2d(pv)
    self.prg_lnl.lnl1d_chunked(self.cl_queue, [self.lnl2d.shape[0], self.cl_lnl_chunks], None,
                               uint32(self.lnl2d.shape[1]), self._b_lnl2d, self._b_lnl1d)
    cl.enqueue_copy(self.cl_queue, self.lnl1d, self._b_lnl1d)
    lnl = self.lnl1d.astype('d').sum(1)
    return lnl
Example 8: run
def run(self):
    for ii in range(0,10):
        for jj in range(0,10):
            r = np.random.random([self.nsample,3])
            r[:,0]=(r[:,0]+ii)*0.1
            r[:,1]=(r[:,1]+jj)*0.1
            self.X = np.zeros((self.nsample,4), dtype = np.float32)
            self.X[:,0:3] = r
            self.X[:,3] = 1.
            self.I = np.zeros((self.nsample,4), dtype = np.float32)
            self.I[:,0:3] = 1.
            #self.I[:,3] = 0.
            cl.enqueue_acquire_gl_objects(self.queue, [self.X_cl,self.I_cl])
            cl.enqueue_copy(self.queue, self.X_cl, self.X)
            cl.enqueue_copy(self.queue, self.I_cl, self.I)
            self.program.Solve(self.queue, (self.nsample, self.na), None, self.A_cl, self.X_cl, self.I_cl, self.alpha)
            cl.enqueue_release_gl_objects(self.queue, [self.X_cl,self.I_cl])
            self.queue.finish()
            self.draw()
    self.scrnData = np.zeros((self.width,self.height), dtype = np.float32)
    glReadPixels(0, 0, self.width, self.height, GL_ALPHA, GL_FLOAT, self.scrnData)
    print np.max(self.scrnData)
    scipy.misc.imsave('render.png', np.flipud(self.scrnData))
Example 9: execute
def execute(self):
    kernel = self.program.fact
    self.event = kernel(self.queue,[self.a_dim],None,self.d_a_buf,self.d_c_buf)
    self.event.wait()
    cl.enqueue_copy(self.queue, self.h_c, self.d_c_buf)
    print "a", self.h_a
    print "ris", self.h_c
Example 10: get_edges
def get_edges(clctx, features, reductions, blurs, buf_in, summarise=True):
    """
    Using the *features* and *reductions* programs, and the *blurs* program
    with sigma=2.0, find all edge pixels in *buf_in*; return the edge count,
    or the full edge image if *summarise* is False.
    """
    gs, wgs = clctx.gs, clctx.wgs
    bufa = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))
    bufb = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))
    bufc = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))
    blurs.convolve_x(clctx.queue, (gs, gs), (wgs, wgs), buf_in, bufb)
    blurs.convolve_y(clctx.queue, (gs, gs), (wgs, wgs), bufb, bufa)
    blurs.convolve_x(clctx.queue, (gs, gs), (wgs, wgs), bufa, bufc)
    blurs.convolve_y(clctx.queue, (gs, gs), (wgs, wgs), bufc, bufb)
    features.subtract(clctx.queue, (gs, gs), (wgs, wgs), bufb, bufa, bufc)
    features.edges(clctx.queue, (gs, gs), (wgs, wgs), bufc, bufa)
    counts = reduction.run_reduction(clctx, reductions.reduction_sum, bufa)
    if not summarise:
        edges = np.empty((gs, gs, 4), np.float32)
        cl.enqueue_copy(clctx.queue, edges, bufa,
                        origin=(0, 0), region=(gs, gs))
    bufa.release()
    bufb.release()
    bufc.release()
    if summarise:
        return counts
    else:
        return edges
Example 11: __init__
def __init__(self):
    t_np = np.arange(0, 100000000, dtype=np.float32)
    self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue(self.ctx)
    self.mf = cl.mem_flags
    self.t_g = cl.Buffer(
        self.ctx,
        self.mf.READ_ONLY | self.mf.COPY_HOST_PTR,
        hostbuf=t_np)
    f = open("ex.cl", "r")
    fstr = "".join(f.readlines())
    f.close()
    self.prg = cl.Program(self.ctx, fstr).build()
    self.res_g = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, t_np.nbytes)
    self.prg.proc(self.queue, t_np.shape, None, self.t_g, self.res_g)
    res_np = np.empty_like(t_np)
    cl.enqueue_copy(self.queue, res_np, self.res_g)
    # Check on CPU with Numpy:
    print(res_np)
    print(np.amax(res_np))
Example 12: to_host
def to_host(queue, data, dtype, start, shape, elemstrides):
    """Copy memory off the device, into a Numpy array"""
    m, n = shape
    Sm, Sn = elemstrides
    if m * n == 0:
        return np.zeros(shape, dtype=dtype)
    if min(elemstrides) < 0:
        raise NotImplementedError()
    itemsize = dtype.itemsize
    bytestart = itemsize * start
    # -- TODO: is there an extra element transferred here?
    byteend = bytestart + itemsize * ((m-1) * Sm + (n-1) * Sn + 1)
    temp_buf = np.zeros((byteend - bytestart), dtype=np.int8)
    cl.enqueue_copy(queue, temp_buf, data,
                    device_offset=bytestart, is_blocking=True)
    bytestrides = (itemsize * Sm, itemsize * Sn)
    try:
        view = np.ndarray(
            shape=(m, n),
            dtype=dtype,
            buffer=temp_buf.data,
            offset=0,
            strides=bytestrides)
    except:
        raise
    return view
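As a usage sketch: for a contiguous row-major float32 matrix of shape (m, n), the element strides are (n, 1) and the start offset is 0. The queue and buffer below are assumptions for illustration, not part of the original example.

# Hypothetical call; dtype must be an np.dtype instance (it needs .itemsize)
m, n = 4, 8
host = np.arange(m * n, dtype=np.float32).reshape(m, n)
dev = cl.Buffer(queue.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=host)
view = to_host(queue, dev, np.dtype('float32'),
               start=0, shape=(m, n), elemstrides=(n, 1))
assert np.array_equal(view, host)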
Example 13: get_color
def get_color(self, img):
    # OpenCL only supports RGBA images, not RGB, so add an alpha channel
    src = np.array(img.convert('RGBA'))
    src.shape = w, h, _ = img.width, img.height, 4
    w = int(w * self.SCALE_FACTOR)
    h = int(h * self.SCALE_FACTOR)
    local_size = self.max_work_item_sizes
    global_size = (math.ceil(h / local_size[0]), math.ceil(w / local_size[1]))
    total_work_groups = global_size[0] * global_size[1]
    mf = cl.mem_flags
    src_buf = cl.image_from_array(self.ctx, src, 4, norm_int=True)
    out = np.zeros(4 * total_work_groups, dtype=np.int32)
    out_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=out.itemsize * 4 * total_work_groups)
    kernel = self.prg.get_color
    kernel.set_scalar_arg_dtypes([None, None, np.uint32, np.uint32])
    kernel(self.queue, global_size, local_size, src_buf, out_buf, w, h, g_times_l=True)
    cl.enqueue_copy(self.queue, dest=out, src=out_buf, is_blocking=True)
    # this sum takes .1 ms at 3440x1440, don't even bother OpenCL-ifying it
    resized_out = np.reshape(out, (out.shape[0] / 4, 4))
    summed_out = np.sum(resized_out, axis=0)
    avg_out = (summed_out / summed_out[3])[:3].astype(int)
    return avg_out
Example 14: likelihood
def likelihood(self, outcomes, modelparams, expparams):
    # By calling the superclass implementation, we can consolidate
    # call counting there.
    super(AcceleratedPrecessionModel, self).likelihood(outcomes, modelparams, expparams)
    # Possibly add a second axis to modelparams.
    if len(modelparams.shape) == 1:
        modelparams = modelparams[..., np.newaxis]
    # Convert to float32 if needed.
    mps = modelparams.astype(np.float32)
    eps = expparams.astype(np.float32)
    # Allocating a buffer for the pr0 returns.
    pr0 = np.empty((mps.shape[0], eps.shape[0]), dtype=mps.dtype)
    # Move buffers to the GPU.
    mf = cl.mem_flags
    mps_buf = cl.Buffer(self._ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mps)
    eps_buf = cl.Buffer(self._ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=eps)
    dest_buf = cl.Buffer(self._ctx, mf.WRITE_ONLY, pr0.nbytes)
    # Run the kernel with global worksize (n_models, n_experiments).
    self._prg.cos_model(self._queue, pr0.shape, None, np.int32(eps.shape[0]), mps_buf, eps_buf, dest_buf)
    # Copy the buffer back from the GPU and free memory there.
    cl.enqueue_copy(self._queue, pr0, dest_buf)
    mps_buf.release()
    eps_buf.release()
    dest_buf.release()
    # Now we concatenate over outcomes.
    return FiniteOutcomeModel.pr0_to_likelihood_array(outcomes, pr0)
Example 15: eval
def eval(self, pars):
    _ctx, queue = card()
    radius, length = \
        [GaussianDispersion(int(pars[base+'_pd_n']), pars[base+'_pd'], pars[base+'_pd_nsigma'])
         for base in OneDGpuCylinder.PD_PARS]
    #Get the weights for each
    radius.value, radius.weight = radius.get_weights(pars['radius'], 0, 10000, True)
    length.value, length.weight = length.get_weights(pars['length'], 0, 10000, True)
    #Perform the computation, with all weight points
    sum, norm, vol = 0.0, 0.0, 0.0,
    sub = pars['sldCyl'] - pars['sldSolv']
    real = np.float32 if self.q.dtype == np.dtype('float32') else np.float64
    #Loop over radius, length, theta, phi weight points
    for r in xrange(len(radius.weight)):
        for l in xrange(len(length.weight)):
            self.prg.OneDCylKernel(queue, self.q.shape, None, self.q_b, self.res_b, real(sub),
                                   real(length.value[l]), real(radius.value[r]), real(pars['scale']),
                                   np.uint32(self.q.size), real(pars['uplim']), real(pars['bolim']))
            cl.enqueue_copy(queue, self.res, self.res_b)
            sum += radius.weight[r]*length.weight[l]*self.res*pow(radius.value[r],2)*length.value[l]
            vol += radius.weight[r]*length.weight[l] *pow(radius.value[r],2)*length.value[l]
            norm += radius.weight[r]*length.weight[l]
    if vol != 0.0 and norm != 0.0:
        sum *= norm/vol
    return sum/norm + pars['background']