本文整理汇总了Python中pyopencl.enqueue_nd_range_kernel函数的典型用法代码示例。如果您正苦于以下问题：Python enqueue_nd_range_kernel函数的具体用法？Python enqueue_nd_range_kernel怎么用？Python enqueue_nd_range_kernel使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了enqueue_nd_range_kernel函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: do_opencl_pow
def do_opencl_pow(hash, target):
    """Search for a proof-of-work value with the ``kernel_sha512`` OpenCL kernel.

    Work is submitted in batches of ``work-group size * 2000`` work-items.
    Before every batch the running ``progress`` counter is passed to the
    kernel as a little-endian unsigned 64-bit integer (kernel argument 2).
    The loop stops as soon as the kernel has written a non-zero value into
    the output buffer.

    Args:
        hash: Hex string.  Sixteen ``"0"`` characters are prepended and the
            result is hex-decoded into the ``'v'`` field of the input record.
        target: Value stored in the ``'target'`` field of the input record.

    Returns:
        The non-zero ``numpy.uint64`` produced by the kernel, or ``0`` when
        the module-level ``ctx`` compares equal to ``False``.
    """
    # Local import: ``unhexlify`` gives the same bytes as the Python-2-only
    # ``str.decode("hex")`` codec but also exists on Python 3.
    from binascii import unhexlify

    # Plain scalar field.  The ``('v', numpy.uint64, 1)`` spelling is
    # deprecated by NumPy and is slated to mean a shape-(1,) sub-array.
    output = numpy.zeros(1, dtype=[('v', numpy.uint64)])
    # NOTE(review): presumably ``ctx`` is set to False elsewhere when OpenCL
    # could not be initialised -- confirm against the module setup code.
    if ctx == False:  # noqa: E712 - equality test kept deliberately
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order='C')
    data[0]['v'] = unhexlify("0000000000000000" + hash)
    data[0]['target'] = target

    hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    # Ask for the work-group size on the device the queue actually runs on,
    # instead of a hard-coded "second device of the first platform", which
    # raises IndexError on single-device machines and may be a different
    # device from the one executing the kernel.
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)
    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    start = time.time()
    progress = 0
    globamt = worksize * 2000
    while output[0][0] == 0:
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        # ``enqueue_copy`` replaces the deprecated ``enqueue_read_buffer``.
        cl.enqueue_copy(queue, output, dest_buf)
        queue.finish()
        progress += globamt
        sofar = time.time() - start
        # Guard against a zero elapsed time on coarse clocks.
        rate = progress / sofar if sofar > 0 else float("inf")
        print("%s %s hashes/sec" % (sofar, rate))
    taken = time.time() - start
    print("%s %s" % (progress, taken))
    return output[0][0]
示例2: do_opencl_pow
def do_opencl_pow(hash, target):
    """Search for a proof-of-work value with the ``kernel_sha512`` OpenCL kernel.

    Batches of ``work-group size * 2000`` work-items are launched until the
    kernel writes a non-zero value into the output buffer.  The number of
    work-items already tried is handed to the kernel before each batch as a
    little-endian unsigned 64-bit integer (kernel argument 2).

    Args:
        hash: Hex string.  Sixteen ``"0"`` characters are prepended and the
            result is hex-decoded into the ``'v'`` field of the input record.
        target: Value stored in the ``'target'`` field of the input record.

    Returns:
        The non-zero ``numpy.uint64`` produced by the kernel, or ``0`` when
        the module-level ``ctx`` compares equal to ``False``.
    """
    # Local import: same bytes as the Python-2-only ``str.decode("hex")``,
    # but available on Python 3 as well.
    from binascii import unhexlify

    # No ``global`` statement is needed: the module-level names (ctx, queue,
    # program, gpus, hash_dt) are only read here, never rebound.

    # Plain scalar field; ``("v", numpy.uint64, 1)`` is a deprecated spelling.
    output = numpy.zeros(1, dtype=[("v", numpy.uint64)])
    if ctx == False:  # noqa: E712 - equality test kept deliberately
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order="C")
    data[0]["v"] = unhexlify("0000000000000000" + hash)
    data[0]["target"] = target

    hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])
    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    # NOTE: the timing values that used to be computed here only fed
    # debug-log calls that were commented out, so they were removed.
    progress = 0
    globamt = worksize * 2000
    while output[0][0] == 0:
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        # ``enqueue_copy`` replaces the deprecated ``enqueue_read_buffer``.
        cl.enqueue_copy(queue, output, dest_buf)
        queue.finish()
        progress += globamt
    return output[0][0]
示例3: do_opencl_pow
def do_opencl_pow(hash, target):
    """Search for a proof-of-work value on the first enabled GPU.

    Batches of ``work-group size * 2000`` work-items are launched until the
    kernel writes a non-zero value into the output buffer or the module-level
    ``shutdown`` flag becomes non-zero.

    Args:
        hash: Hex string.  Sixteen ``"0"`` characters are prepended and the
            result is hex-decoded into the ``'v'`` field of the input record.
        target: Value stored in the ``'target'`` field of the input record.

    Returns:
        The non-zero ``numpy.uint64`` produced by the kernel, or ``0`` when
        ``enabledGpus`` is empty.

    Raises:
        Exception: ``"Interrupted"`` when ``shutdown`` is non-zero once the
            search loop has ended (this is checked even if a result was
            found in the last batch).
    """
    # Local import: same bytes as the Python-2-only ``str.decode("hex")``,
    # but available on Python 3 as well.
    from binascii import unhexlify

    # Plain scalar field; ``('v', numpy.uint64, 1)`` is a deprecated spelling.
    output = numpy.zeros(1, dtype=[('v', numpy.uint64)])
    if len(enabledGpus) == 0:
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order='C')
    data[0]['v'] = unhexlify("0000000000000000" + hash)
    data[0]['target'] = target

    hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])
    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    # NOTE: the timing values that used to be computed here only fed
    # debug-log calls that were commented out, so they were removed.
    progress = 0
    globamt = worksize * 2000
    while output[0][0] == 0 and shutdown == 0:
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        # ``enqueue_copy`` replaces the deprecated ``enqueue_read_buffer``.
        cl.enqueue_copy(queue, output, dest_buf)
        queue.finish()
        progress += globamt
    if shutdown != 0:
        raise Exception("Interrupted")
    return output[0][0]
示例4: max_length_real4
def max_length_real4(ipt):
    """Run the ``_lengthkern_real4`` kernel over ``ipt`` and max-reduce its output.

    A ``CLReal`` of the same length as ``ipt`` receives the kernel output;
    the kernel is launched with one work-item per element and the result of
    ``max_reduce`` on that output is returned.
    """
    count = len(ipt)
    lengths = CLReal(count)
    kernel = _lengthkern_real4.kern
    # Argument 0 is the input buffer, argument 1 the output buffer.
    for slot, buf in enumerate((ipt._buffer, lengths._buffer)):
        kernel.set_arg(slot, buf)
    cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, kernel, (count,), None)
    return max_reduce(lengths)
示例5: __call__
def __call__(self, thread_count, work_group_size, *args):
    """Compile the kernel, bind ``args`` positionally and run it to completion.

    The launch uses a one-dimensional global size of ``thread_count`` and a
    local size of ``work_group_size``; the call blocks until the kernel's
    event has completed.  The launch and wait happen inside the
    ``"ParLoop kernel"`` timed region.
    """
    kernel = self.compile()
    for position, value in enumerate(args):
        kernel.set_arg(position, value)
    with timed_region("ParLoop kernel"):
        launch = cl.enqueue_nd_range_kernel(
            _queue, kernel, (thread_count,), (work_group_size,),
            g_times_l=False)
        launch.wait()
示例6: filterPrepare
def filterPrepare(self, e, data, keys, ndata, events):
    """Enqueue the ``filterPrepare`` kernel for ``data`` / ``keys``.

    Host arrays are uploaded into read-only device buffers; arguments that
    already are ``cl.Buffer`` objects are used as they are.  A read/write
    filter buffer with one boolean-sized slot per element is allocated for
    the kernel output.

    Args:
        e: Events the kernel launch must wait for, or ``None``.
        data: Host array or ``cl.Buffer`` holding the data.
        keys: Host array or ``cl.Buffer`` holding the keys; must report the
            same ``.size`` as ``data``.
        ndata: Ignored -- the element count is always taken from
            ``data.size`` (kept for interface compatibility).
        events: Sequence that the launch event is appended to with ``+=``.

    Returns:
        ``(e, data_buf, keys_buf, filt_buf)`` where ``e`` is a one-element
        list holding the kernel launch event.

    Raises:
        ValueError: if ``keys.size`` differs from ``data.size``.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    # NOTE(review): for a ``cl.Buffer`` argument ``.size`` is presumably the
    # size in bytes rather than an element count -- confirm with callers.
    ndata = data.size
    if keys.size != ndata:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(
            "data and keys differ in size: %s != %s" % (ndata, keys.size))

    # ``np.bool8`` was an alias of ``np.bool_`` that NumPy 2.0 removed.
    filtbytes = np.bool_(False).nbytes * ndata

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    if isinstance(keys, cl.Buffer):
        keys_buf = keys
    else:
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)
    filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filtbytes)

    kernel = self.prg.filterPrepare
    kernel.set_args(data_buf, keys_buf, np.uint64(ndata), np.uint8(33), np.uint8(66), filt_buf)
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("filterPrepare")

    # ``wait_for=None`` is the default, so a single call covers both the
    # "no dependencies" and the "wait for e" case.
    e = [cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e)]
    events += e
    return (e, data_buf, keys_buf, filt_buf)
示例7: prefixSumUp
def prefixSumUp(self, e, data, ndata, data2, ndata2, events):
    """Enqueue the ``prefixSumUp`` kernel on ``data`` and ``data2``.

    Host arrays are uploaded into read/write device buffers; arguments that
    already are ``cl.Buffer`` objects are used as they are.

    Args:
        e: Events the kernel launch must wait for, or ``None``.
        data: Host array or ``cl.Buffer`` passed as the first kernel argument.
        ndata: Passed to the kernel as ``uint64``; also used to derive the
            launch grid.
        data2: Host array or ``cl.Buffer`` passed as the third kernel argument.
        ndata2: Passed to the kernel as ``uint64``.
        events: Sequence that the launch event is appended to with ``+=``.

    Returns:
        ``(e, data_buf, data2_buf)`` where ``e`` is a one-element tuple
        holding the kernel launch event.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)
    if isinstance(data2, cl.Buffer):
        data2_buf = data2
    else:
        data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data2)

    kernel = self.prg.prefixSumUp
    kernel.set_args(data_buf, np.uint64(ndata), data2_buf, np.uint64(ndata2))
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("prefixSumUp")

    # Bug fix: the branches used to be inverted (``wait_for=e`` only when
    # ``e`` was None), so dependencies handed in by the caller were silently
    # dropped.  ``wait_for=None`` is the default, so always forwarding ``e``
    # handles both cases.
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e),)
    events += e
    return (e, data_buf, data2_buf)
示例8: exec_lsz_safe
def exec_lsz_safe(self, localsize):
    """Execute the kernel once with a specific local size.

    Safe also for kernels with local variables.

    ``self.localsize`` is set to ``localsize`` for the launch, the queue is
    drained with ``finish()``, and the previous local size is put back
    afterwards -- also when the launch or the ``finish()`` call raises.
    """
    # NOTE(review): the old value is read from ``_localsize`` but written
    # through ``localsize`` -- presumably a property; confirm on the class.
    oldloc = int(self._localsize)
    self.localsize = localsize
    try:
        cl.enqueue_nd_range_kernel(self._solverobj.clqueue, self._clkernel, (self.globalsize,), (self.localsize,))
        self._solverobj.clqueue.finish()
    finally:
        # Never leave the temporary local size behind on failure.
        self.localsize = oldloc
示例9: test_algorithm
def test_algorithm(self):
    """Check every kernel in ``self.kernelname`` against CSV test vectors.

    For each kernel, six test cases are read from
    ``test_bench_pbrs_input.csv`` / ``test_bench_pbrs_output.csv``.  Each
    case is eight CSV lines joined with commas and parsed as ``uint8``
    values.  The input is copied to the device, the kernel is run with a
    global and local size of 8, and the bytes read back are compared with
    the reference bytes.

    Returns:
        ``True`` when ``passed == 6`` after the kernel loop, else ``False``.

    NOTE(review): the indentation of this snippet was lost; the nesting
    below (final pass/fail check placed after the ``for k`` loop, so it
    reflects only the last kernel) is a reconstruction -- confirm against
    the original file.
    NOTE(review): the two CSV files are not closed if an exception is raised
    in the middle of a run.
    """
    print "\n**************************"
    print "test_pbrs:"
    passed = 0
    buffersize_in = 188*8
    buffersize_out = 188*8
    # device input buffer, sized as buffersize_in * 4 bytes
    self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=buffersize_in*4)
    # device output buffer, sized as buffersize_out * 4 bytes
    self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=buffersize_out*4)
    for k in self.kernelname:
        kernel = self.load_kernel(self.filename, k)
        # the pass counter restarts for every kernel
        passed = 0
        self.fd_input = open('test_bench_pbrs_input.csv', 'r')
        self.fd_output = open('test_bench_pbrs_output.csv', 'r')
        for j in range(0,6):
            # host array receiving the kernel output
            # (buffersize_out/4 elements; integer division under Python 2)
            encoded_data = numpy.array(numpy.zeros(buffersize_out/4), dtype=numpy.uint32)
            # first line of this test case, newline removed
            data_to_encode = string.replace(self.fd_input.readline(),'\n','')
            reference_data = string.replace(self.fd_output.readline(),'\n','')
            # append seven more lines, so each test case spans eight CSV lines
            for i in range(0,7):
                data_to_encode = "%s,%s" % (data_to_encode, string.replace(self.fd_input.readline(),'\n',''))
                reference_data = "%s,%s" % (reference_data, string.replace(self.fd_output.readline(),'\n',''))
            # parse as uint8 values, then reinterpret the raw bytes as uint32
            data_to_encode = numpy.fromstring(numpy.fromstring(data_to_encode, dtype=numpy.uint8, sep=",").tostring(), dtype=numpy.uint32)
            reference_data = numpy.fromstring(reference_data, dtype=numpy.uint8, sep=",")
            # host -> device, run the kernel, device -> host; each step is waited on
            cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode).wait()
            kernel.set_args(self.inputbuffer, self.outputbuffer)
            cl.enqueue_nd_range_kernel(self.queue,kernel,(8,),(8,),None ).wait()
            cl.enqueue_copy(self.queue, encoded_data, self.outputbuffer).wait()
            # view the result as bytes so it can be compared with the reference
            encoded_data = (numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
            if encoded_data.tostring() == reference_data.tostring():
                passed += 1
                print "Test %d PASSED" % (j+1)
            else:
                # dump input, output, reference and their difference for debugging
                print "Test %d FAILED" % (j+1)
                print "input data:"
                print numpy.fromstring(data_to_encode.tostring(), dtype=numpy.uint8)
                print "encoded data:"
                print numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8)
                print "reference data:"
                print reference_data
                print "error data:"
                print (reference_data - numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
        print "%d pass out of 6" % passed
        self.fd_input.close()
        self.fd_output.close()
    if passed == 6:
        print "All pbrs tests PASS\n"
        return True
    else:
        print "at least one pbrs test FAILED\n"
        return False
示例10: max_reduce_real4
def max_reduce_real4(ipt):
    """Run the ``_splitkern_real4`` kernel on ``ipt`` and max-reduce each output.

    Three ``CLReal`` arrays of the same length as ``ipt`` receive the kernel
    output (kernel arguments 1 to 3); the kernel is launched with one
    work-item per element.

    Returns:
        A 3-tuple with the ``max_reduce`` result of each output array, in
        argument order.
    """
    count = len(ipt)
    parts = [CLReal(count) for _ in range(3)]
    kernel = _splitkern_real4.kern
    kernel.set_arg(0, ipt._buffer)
    # The output buffers occupy argument slots 1, 2 and 3.
    for slot, part in enumerate(parts, 1):
        kernel.set_arg(slot, part._buffer)
    cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, kernel, (count,), None)
    return tuple(max_reduce(part) for part in parts)
示例11: prefixSum
def prefixSum(self, e, data, keys, ndata, low, hi, events):
    """Enqueue the ``prefixSumDown`` kernel, then the follow-up up-sweep.

    Host arrays are uploaded into read-only device buffers; arguments that
    already are ``cl.Buffer`` objects are used as they are.  Three
    read/write buffers are allocated for the kernel: ``psum_buf`` (one
    ``uint64`` per element), ``bsum_buf`` (one ``uint64`` per grid cell)
    and ``nbsum_buf`` (a single ``uint64``).

    After the kernel, ``nbsum_buf`` is read back.  If its value is greater
    than 1, ``prefixSumDownInplace`` is run on ``bsum_buf``; otherwise the
    first ``uint64`` of ``bsum_buf`` is read back as ``ndata2``.  Finally
    ``prefixSumUp`` is called on ``psum_buf`` / ``bsum_buf``.

    Args:
        e: Events the kernel launch must wait for, or ``None``.
        data: Host array or ``cl.Buffer`` with the data.
        keys: Host array or ``cl.Buffer`` with the keys.
        ndata: Element count; passed to the kernel as ``uint64``.
        low: Converted with ``PrefixSum.HOST_TYPE_KEYS`` and passed to the kernel.
        hi: Converted with ``PrefixSum.HOST_TYPE_KEYS`` and passed to the kernel.
        events: Sequence that enqueued events are appended to with ``+=``.

    Returns:
        ``(e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)``.

    NOTE(review): the indentation of this snippet was lost; the extent of
    the ``else`` branch below is a reconstruction -- confirm against the
    original file.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    if isinstance(keys, cl.Buffer):
        keys_buf = keys
    else:
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)

    grid_dims = self.get_grid_dims(ndata)
    psumbytes = ndata * np.uint64(0).nbytes
    bsumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
    nbsumbytes = np.uint64(0).nbytes
    psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
    bsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, bsumbytes)
    nbsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, nbsumbytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)
    kernel = self.prg.prefixSumDown
    kernel.set_args(data_buf, keys_buf, np.uint64(ndata), low, hi, psum_buf, bsum_buf, nbsum_buf)
    global_dims = self.get_global(grid_dims)
    print("prefixSumDown %s %s" % (str(global_dims), str(self.localDims)))

    # Bug fix: the branches used to be inverted (``wait_for=e`` only when
    # ``e`` was None), so dependencies handed in by the caller were silently
    # dropped.  ``wait_for=None`` is the default, so always forwarding ``e``
    # handles both cases.
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e),)
    events += e

    nbsum = np.zeros(1, dtype=np.uint64)
    events += (cl.enqueue_copy(self.queue, nbsum, nbsum_buf, wait_for=e),)
    if nbsum > 1:
        (e, bsum_buf, bsum1_buf, nbsum1_buf, ndata2) = self.prefixSumDownInplace(e, bsum_buf, nbsum.item(), events)
    else:
        ndata2 = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, ndata2, bsum_buf, wait_for=e),)
        ndata2 = ndata2.item()
    print(ndata2)
    # NOTE(review): ``nbsum`` is handed over as a one-element array and the
    # events returned by prefixSumUp are not used -- confirm this is intended.
    self.prefixSumUp(e, psum_buf, ndata, bsum_buf, nbsum, events)
    return (e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)
示例12: solve
def solve(self,puzzle,simulations = 16384, iterations = 35, workGroupSize = 128):
    """Run the "montecarlo" kernel on ``puzzle`` and return the best solution.

    Args:
        puzzle: Mapping with at least the keys ``'width'`` and ``'height'``;
            it is also handed to ``self.initBuffers``.
        simulations: Global work size of the kernel launch.
        iterations: Passed to the kernel as ``int32``.
        workGroupSize: Local work size of the kernel launch.

    Returns:
        A list of dicts with the keys ``'X'``, ``'Y'`` and ``'Size'``, one
        per cell of the chosen solution whose value is not ``-1``, visited
        row by row.

    NOTE(review): the host arrays are obtained from the device buffers and
    the buffers are released before the arrays are read -- confirm that the
    host arrays stay valid after ``release()``.
    """
    self.simulations = simulations
    self.iterations = iterations
    self.workGroupSize = workGroupSize
    self.workGroups = int(self.simulations / self.workGroupSize)
    self.width = np.int8(puzzle['width'])
    self.height = np.int8(puzzle['height'])
    # initialise buffers
    self.initBuffers(puzzle)
    # create kernel and bind its arguments
    self.kernel = cl.Kernel(self.program,"montecarlo")
    self.kernel.set_args(self.lengthsBuffer,self.groupLengthsBuffer,self.puzzlesBuffer,self.solutionsBuffer,self.height,self.width,np.int32(self.iterations))
    # launch the kernel: one work-item per simulation
    cl.enqueue_nd_range_kernel(self.queue,self.kernel,(self.simulations,),(self.workGroupSize,))
    # map the group-lengths buffer into host memory (the value returned by the
    # map call is discarded; the host array is fetched via get_host_array)
    cl.enqueue_map_buffer(self.queue,self.groupLengthsBuffer,cl.map_flags.WRITE,0,self.groupLengths.shape,self.groupLengths.dtype)
    self.groupLengths = self.groupLengthsBuffer.get_host_array(self.groupLengths.shape,dtype=self.groupLengths.dtype)
    # map the solutions buffer into host memory; note the map call uses the
    # flattened shape while get_host_array uses self.solutions.shape
    cl.enqueue_map_buffer(self.queue,self.solutionsBuffer,cl.map_flags.WRITE,0,self.solutionsFlattened.shape,self.solutions.dtype)
    self.solutions = self.solutionsBuffer.get_host_array(self.solutions.shape,dtype=self.solutions.dtype)
    # release buffers
    self.lengthsBuffer.release()
    self.groupLengthsBuffer.release()
    self.puzzlesBuffer.release()
    self.solutionsBuffer.release()
    # get the best solution: the entry with the smallest group length
    i = self.groupLengths.argmin()
    # copy it, because the cells are overwritten with -1 below
    bestSolution = np.array(self.solutions[i])
    # convert solution to list format used by challenge
    solution = []
    for row in range(0,puzzle['height']):
        for col in range(0,puzzle['width']):
            if bestSolution[row][col]!=-1:
                s = bestSolution[row][col]
                # add to solution list
                solution.append({'X': int(col),'Y': int(row),'Size':int(s)})
                # clear the s-by-s block of cells starting at (row, col) so it
                # is not reported again (this reuses the name ``i`` from above)
                for i in range(0,s):
                    for j in range(0,s):
                        bestSolution[row+i][col+j]=-1
    return solution
示例13: filter
def filter(self, data, keys, low, hi, events):
    """Run ``prefixSum`` followed by the ``filter`` kernel and read back the results.

    Args:
        data: Host array with the data; ``data.size`` is the element count.
        keys: Keys handed to ``prefixSum``.
        low: Converted with ``PrefixSum.HOST_TYPE_KEYS`` and passed to the kernel.
        hi: Converted with ``PrefixSum.HOST_TYPE_KEYS`` and passed to the kernel.
        events: Sequence that enqueued events are appended to with ``+=``.

    Returns:
        ``(filt, indices, data2, keys2)`` host arrays.  ``filt`` is only
        filled from the device when ``PrefixSum.RETURN_FILTER == 1``;
        otherwise it is returned as allocated (all ``False``).

    NOTE(review): the indentation of this snippet was lost; the placement
    of the ``data2.nbytes`` debug print outside the first ``if`` is a
    reconstruction -- confirm against the original file.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    ndata = data.size
    (e, data_buf, keys_buf, indices_buf, bsum_buf, nbsum_buf, ndata2) = self.prefixSum(None, data, keys, ndata, low, hi, events)

    # ``np.bool8`` was an alias of ``np.bool_`` that NumPy 2.0 removed.
    filt = np.zeros(ndata, dtype=np.bool_)
    indices = np.zeros(ndata, dtype=np.uint64)
    data2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_DATA)
    keys2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_KEYS)
    ndata2bytes = np.uint64(0).nbytes

    return_filter = PrefixSum.RETURN_FILTER == 1
    if return_filter:
        filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filt.nbytes)
    print(data2.nbytes)
    data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, data2.nbytes)
    keys2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, keys2.nbytes)
    ndata2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, ndata2bytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)
    kernel = self.prg.filter
    if return_filter:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi, filt_buf, data2_buf, keys2_buf, ndata2_buf)
    else:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi, data2_buf, keys2_buf, ndata2_buf)
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("filter")

    # Bug fix: the branches used to be inverted (``wait_for=e`` only when
    # ``e`` was None), so the events returned by prefixSum were never waited
    # on.  ``wait_for=None`` is the default, so always forwarding ``e``
    # handles both cases.
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e),)
    events += e

    # Device -> host copies, in the same order as before.
    readbacks = [(indices, indices_buf), (data2, data2_buf), (keys2, keys2_buf)]
    if return_filter:
        readbacks.insert(0, (filt, filt_buf))
    events += tuple(
        cl.enqueue_copy(self.queue, host, device, wait_for=e)
        for host, device in readbacks)
    return (filt, indices, data2, keys2)
示例14: _exec_chunked_unsafe
def _exec_chunked_unsafe(self, chunksize=0):
    """Run the kernel chunk by chunk over the leading variable.

    Unsafe for kernels with local variables.

    When ``chunksize`` is positive, ``_prep_chunked_exec`` is called with it
    first.  The current chunk index is stored on the solver object under the
    attribute named by ``self._cnk_name`` before every launch; the last
    launch is shortened to the remainder when the length is not a multiple
    of the chunk size.

    NOTE(review): the indentation of this snippet was lost; waiting on the
    queue after every chunk (rather than once after the loop) is a
    reconstruction -- confirm against the original file.
    """
    if chunksize > 0:
        self._prep_chunked_exec(chunksize)
    total = self.leadingvar.length
    chunk_count = int(ceil(float(total) / float(self._cnksz)))
    work_items = self._cnksz
    leftover = total % work_items
    final_index = chunk_count - 1
    for index in range(chunk_count):
        # Only the final chunk may be shorter than the regular chunk size.
        if index == final_index and leftover != 0:
            work_items = leftover
        setattr(self._solverobj, self._cnk_name, index)
        cl.enqueue_nd_range_kernel(
            self._solverobj.clqueue, self._clkernel, (work_items,), None)
        self._solverobj.clqueue.finish()
示例15: change_display
def change_display(image):
    """Run the ``add`` kernel from ``image`` into a GL-shared buffer.

    ``image`` is uploaded into a read-only device buffer.  The GL buffer is
    acquired for OpenCL, the kernel is launched with ``image.shape`` as the
    global size, the GL buffer is released again, and both the OpenCL queue
    and the GL pipeline are flushed.
    """
    source = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=image)
    # NOTE(review): the GL buffer object is passed as ``numpy.float32(buf)``;
    # an integer GL buffer name would be expected here -- confirm.
    gl_target = cl.GLBuffer(ctx, mf.WRITE_ONLY, numpy.float32(buf))
    shared = [gl_target]

    cl.enqueue_acquire_gl_objects(queue, shared)
    kernel = prog.add
    kernel.set_args(source, gl_target)
    cl.enqueue_nd_range_kernel(queue, kernel, image.shape, None)
    cl.enqueue_release_gl_objects(queue, shared)

    queue.finish()
    glFlush()