本文整理汇总了C++中cl::CommandQueue::enqueueNDRangeKernel方法的典型用法代码示例。如果您正苦于以下问题:C++ CommandQueue::enqueueNDRangeKernel方法的具体用法?C++ CommandQueue::enqueueNDRangeKernel怎么用?C++ CommandQueue::enqueueNDRangeKernel使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类cl::CommandQueue
的用法示例。
在下文中一共展示了CommandQueue::enqueueNDRangeKernel方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: EnqueueAdvancePathsKernel
void PathOCLRenderThread::EnqueueAdvancePathsKernel(cl::CommandQueue &oclQueue) {
	PathOCLRenderEngine *engine = (PathOCLRenderEngine *)renderEngine;
	const u_int taskCount = engine->taskCount;

	// Micro kernels version: run every stage of the path-advance pipeline,
	// in submission order, over the full task range.
	cl::Kernel *const microKernels[] = {
		advancePathsKernel_MK_RT_NEXT_VERTEX,
		advancePathsKernel_MK_HIT_NOTHING,
		advancePathsKernel_MK_HIT_OBJECT,
		advancePathsKernel_MK_RT_DL,
		advancePathsKernel_MK_DL_ILLUMINATE,
		advancePathsKernel_MK_DL_SAMPLE_BSDF,
		advancePathsKernel_MK_GENERATE_NEXT_VERTEX_RAY,
		advancePathsKernel_MK_SPLAT_SAMPLE,
		advancePathsKernel_MK_NEXT_SAMPLE,
		advancePathsKernel_MK_GENERATE_CAMERA_RAY
	};

	const cl::NDRange globalRange(taskCount);
	const cl::NDRange localRange(advancePathsWorkGroupSize);
	for (cl::Kernel *const microKernel : microKernels)
		oclQueue.enqueueNDRangeKernel(*microKernel, cl::NullRange, globalRange, localRange);
}
示例2: run_kernel
float clPeak::run_kernel(cl::CommandQueue &queue, cl::Kernel &kernel, cl::NDRange &globalSize, cl::NDRange &localSize, int iters)
{
    // Two warm-up launches so the measurement below excludes one-time
    // startup costs (JIT, first-touch allocations, ...).
    for (int warmup = 0; warmup < 2; warmup++)
        queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
    queue.finish();

    float elapsed = 0;
    if (useEventTimer)
    {
        // Time each launch individually via OpenCL profiling events.
        for (int i = 0; i < iters; i++)
        {
            cl::Event timeEvent;
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
            queue.finish();
            elapsed += timeInUS(timeEvent);
        }
    }
    else
    {
        // Wall-clock the whole batch with the host-side timer.
        Timer timer;
        timer.start();
        for (int i = 0; i < iters; i++)
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
        queue.finish();
        elapsed = timer.stopAndTime();
    }

    // Average time per launch, in the timer's unit (us for the event path).
    return (elapsed / iters);
}
示例3: runKernelLatency
// Measures kernel launch latency (time from enqueue to start of execution)
// in microseconds, averaged over devInfo.kernelLatencyIters launches.
// Returns 0 on success, -1 if the test failed and was skipped.
int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
    if(!isKernelLatency)
        return 0;

    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
    cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI;
    cl::NDRange globalSize = (numItems / FETCH_PER_WI);
    cl::NDRange localSize = devInfo.maxWGSize;
    int iters = devInfo.kernelLatencyIters;
    float latency;
    try
    {
        log->print(NEWLINE TAB TAB "Kernel launch latency : ");
        log->xmlOpenTag("kernel_launch_latency");
        log->xmlAppendAttribs("unit", "us");

        cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
        cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));

        cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset");
        kernel_v1.setArg(0, inputBuf), kernel_v1.setArg(1, outputBuf);

        // Dummy calls to absorb first-launch overhead.
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.finish();

        latency = 0;
        for(int i=0; i<iters; i++)
        {
            cl::Event timeEvent;

            queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
            queue.finish();
            // Launch latency = QUEUED -> START, converted from ns to us.
            cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000;
            cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
            // BUG FIX: subtract as unsigned 64-bit before converting to float;
            // the previous (int) casts truncated the 64-bit timestamps and
            // could wrap to a bogus (even negative) latency.
            latency += (float)(end - start);
        }
        latency /= iters;

        log->print(latency); log->print(" us" NEWLINE);
        log->xmlSetContent(latency);
        log->xmlCloseTag();
    }
    catch(cl::Error error)
    {
        // BUG FIX: `error.err() + NEWLINE` was int + const char* pointer
        // arithmetic on the string literal (printing garbage, potentially UB).
        // Print the numeric error code and the newline separately.
        // NOTE(review): assumes the logger has a numeric print overload
        // (it is already called with a float above) — confirm.
        log->print(error.err()); log->print(NEWLINE);
        log->print(TAB TAB "Tests skipped" NEWLINE);
        return -1;
    }
    return 0;
}
示例4: helper
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice)
{
int set_size=8;
try {
cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
isize*sizeof(cl_uchar), in, NULL);
cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
cl::Buffer bufferOut2= cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
gNV21Kernel.setArg(2,w);
gNV21Kernel.setArg(3,h);
gNV21Kernel.setArg(1,bufferIn);
gNV21Kernel.setArg(0,bufferOut);
gQueue.enqueueNDRangeKernel(gNV21Kernel,
cl::NullRange,
cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
cl::NDRange(set_size,set_size),
NULL,
NULL);
if (choice==1) {
gLaplacianK.setArg(2,w);
gLaplacianK.setArg(3,h);
gLaplacianK.setArg(1,bufferOut);
gLaplacianK.setArg(0,bufferOut2);
gQueue.enqueueNDRangeKernel(gLaplacianK,
cl::NullRange,
cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
cl::NDRange(set_size,set_size),
NULL,
NULL);
}
else if (choice>1) {
gNegative.setArg(2,w);
gNegative.setArg(3,h);
gNegative.setArg(1,bufferOut);
gNegative.setArg(0,bufferOut2);
gQueue.enqueueNDRangeKernel(gNegative,
cl::NullRange,
cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
cl::NDRange(set_size,set_size),
NULL,
NULL);
}
gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize*sizeof(cl_uchar4), out);
}
catch (cl::Error e) {
LOGI("@oclDecoder: %s %d \n",e.what(),e.err());
}
}
示例5: updateParticles
void updateParticles(float timeDelta)
{
try
{
vector<cl::Memory> glBuffers;
glBuffers.push_back(m_positions);
glBuffers.push_back(m_colors);
//this will update our system by calculating new velocity and updating the positions of our particles
//Make sure OpenGL is done using our VBOs
glFinish();
// map OpenGL buffer object for writing from OpenCL
// this passes in the vector of VBO buffer objects (position and color)
m_queue.enqueueAcquireGLObjects(&glBuffers);
m_particleKernel.setArg(5, timeDelta); //pass in the timestep
//execute the kernel
m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange, cl::NDRange(m_numParticles),
cl::NullRange);
//Release the VBOs so OpenGL can play with them
m_queue.enqueueReleaseGLObjects(&glBuffers, NULL);
m_queue.finish();
}
catch(cl::Error &error)
{
LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")";
}
}
示例6: simulationStep
// Run one simulation step on the device and refresh the GL texture from
// the updated CPU-side visualization buffer.
void simulationStep() {
    // RGBA bytes for the whole field.
    const size_t bufferBytes = sizeof(unsigned char) * 4 * fieldWidth * fieldHeight;
    try {
        // Upload the current state into a fresh device buffer.
        auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY, bufferBytes, nullptr, nullptr);
        queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, bufferBytes,
                                 visualizationBufferCPU, NULL, NULL);

        // One work item per cell.
        stepKernel.setArg(2, buffer);
        cl::NDRange global((size_t) (fieldWidth * fieldHeight));
        queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange);

        // Download the updated field for rendering (blocking read).
        queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0, bufferBytes,
                                visualizationBufferCPU, NULL, NULL);
        queue.finish();
    } catch (cl::Error err) {
        std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl;
        exit(3);
    }
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE,
                 visualizationBufferCPU);
}
示例7: procOCL_I2I
// Applies the "Laplacian" OpenCL kernel to GL texture texIn, writing the
// result into GL texture texOut (both w x h). Every stage is timed and
// logged; the intermediate finish() calls exist to make those per-stage
// timings meaningful, not for correctness.
void procOCL_I2I(int texIn, int texOut, int w, int h)
{
// No-op if OpenCL initialization failed earlier.
if(!haveOpenCL) return;
LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h);
// Wrap the GL textures as CL images (GL/CL interop, no host copy).
cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, texIn);
cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
std::vector < cl::Memory > images;
images.push_back(imgIn);
images.push_back(imgOut);
int64_t t = getTimeMs();
// Transfer ownership of the textures from OpenGL to OpenCL.
theQueue.enqueueAcquireGLObjects(&images);
theQueue.finish();
LOGD("enqueueAcquireGLObjects() costs %d ms", getTimeInterval(t));
t = getTimeMs();
// Kernel is (re)created on every call — see TODO.
cl::Kernel Laplacian(theProgI2I, "Laplacian"); //TODO: may be done once
Laplacian.setArg(0, imgIn);
Laplacian.setArg(1, imgOut);
theQueue.finish();
LOGD("Kernel() costs %d ms", getTimeInterval(t));
t = getTimeMs();
// One work item per pixel; the driver chooses the work-group size.
theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange);
theQueue.finish();
LOGD("enqueueNDRangeKernel() costs %d ms", getTimeInterval(t));
t = getTimeMs();
// Hand the textures back to OpenGL.
theQueue.enqueueReleaseGLObjects(&images);
theQueue.finish();
LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t));
}
示例8: runKernel
// Enqueues `kernel` after all events recorded so far, appends its completion
// event to `events` (so later launches can depend on it), and returns it.
cl::Event runKernel(const cl::CommandQueue& queue, const cl::Kernel& kernel, const cl::NDRange& globalSize, const cl::NDRange& groupSize, std::vector<cl::Event>& events)
{
    cl::Event completion;
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, groupSize, &events, &completion);
    events.push_back(completion);
    return completion;
}
示例9: kernel
// Renders one frame of the Julia fractal into devOut. The program and
// kernel are compiled lazily, exactly once, on the first call.
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag compileFlag;
    static cl::Program prog;
    static cl::Kernel kern;
    std::call_once(compileFlag,
                   [queue]() {
                       prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fractal_ocl_kernel, true);
                       kern = cl::Kernel(prog, "julia");
                   });

    // 8x8 work groups; round the global size up to a whole number of groups.
    static const NDRange local(8, 8);
    NDRange global(local[0] * divup(DIMX, local[0]),
                   local[1] * divup(DIMY, local[1]));

    kern.setArg(0, devOut);
    kern.setArg(1, DIMX);
    kern.setArg(2, DIMY);
    queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local);
}
示例10: draw
// Per-frame draw: rebuilds the pinhole camera basis from the UI camera,
// runs the path-tracing kernel over the shared GL image, reports the
// profiled kernel time, and blits the result texture to the window.
void PTWeekend::draw()
{
    /*
     * BEGIN - each frame part
     */

    /* Build the pinhole camera frame (origin, lower-left corner, span vectors) */
    glm::vec3 origin, lower_left, hor, ver;

    float theta = camera.getFov() * M_PI / 180.0f;
    float half_height = tan(theta / 2.0f);
    float half_width = camera.getAspectRatio() * half_height;

    origin = camera.getEyePoint();

    glm::vec3 u, v, w;
    w = -glm::normalize(camera.getViewDirection()); //odd...
    u = glm::normalize(glm::cross(glm::vec3(0,1,0), w));
    v = glm::cross(w, u);

    lower_left = origin - half_width * u - half_height * v - w;
    hor = 2.0f * half_width * u;
    ver = 2.0f * half_height * v;

    pt_assert(cl_set_pinhole_cam_arg(origin, lower_left, hor, ver, cam_buffer, cmd_queue), "Could not fill camera buffer");

    clStatus = cmd_queue.enqueueAcquireGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not acquire gl objects");

    cl::Event profiling_evt;

    clStatus = cmd_queue.enqueueNDRangeKernel(kernel,
                                              cl::NDRange(0,0),
                                              cl::NDRange(img_width, img_height),
                                              cl::NDRange(local_width,local_height),
                                              NULL,
                                              &profiling_evt);
    // BUG FIX: check the enqueue status *before* waiting on the event. The
    // original called profiling_evt.wait() first; if the enqueue failed the
    // event was never set and waiting on it is undefined behavior.
    pt_assert(clStatus, "Could not enqueue the kernel");
    profiling_evt.wait();

    clStatus = cmd_queue.enqueueReleaseGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not release gl objects");

    cmd_queue.finish();

    // Report device-side kernel execution time (profiling info is in ns).
    cl_ulong time_start = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong time_end = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = time_end - time_start;
    std::cout << "Total time: " << total_time * 0.001 * 0.001 << " ms \n";

    /*
     * END - each frame part
     */

    gl::draw(imgTex, Rectf(0, 0, getWindowWidth(), getWindowHeight()));
}
示例11: sumTest
// Launches the "sum_test" kernel once: a single work group of up to 512
// threads consumes the parallel queue and writes its sum to device_result.
void sumTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
             cl::Buffer& device_result, int iterations,
             ProgramCache& cache,
             cl::CommandQueue& queue)
{
    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();

    // Fetch (or build) the program containing the test kernel.
    std::vector<std::string> sourceNames;
    sourceNames.push_back("ParallelQueue");
    sourceNames.push_back("ParallelQueueTests");
    cl::Program& prog = cache.getProgram(sourceNames);
    cl::Kernel sumKernel(prog, "sum_test");

    // Report the preferred work-group size multiple (warp/wavefront size).
    cl::Device dev = queue.getInfo<CL_QUEUE_DEVICE>();
    int warp_size = sumKernel
            .getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(dev);
    std::cout << "warp size: " << warp_size << std::endl;

    // 512 threads per group, clamped to the device limit.
    int maxGroupSize = dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
    int numThreads = (512 > maxGroupSize) ? maxGroupSize : 512;

    // Local-memory scratch space used by the queue implementation.
    cl::LocalSpaceArg localQueue = cl::__local(sizeof(int) * numThreads * 2);
    cl::LocalSpaceArg reductionBuffer = cl::__local(sizeof(int) * numThreads);
    cl::LocalSpaceArg gotWork = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefixSumInput = cl::__local(sizeof(int) * numThreads);
    cl::LocalSpaceArg prefixSumOutput = cl::__local(sizeof(int) * numThreads);

    sumKernel.setArg(0, queue_data);
    sumKernel.setArg(1, queue_metadata);
    sumKernel.setArg(2, device_result);
    sumKernel.setArg(3, iterations);
    sumKernel.setArg(4, localQueue);
    sumKernel.setArg(5, reductionBuffer);
    sumKernel.setArg(6, gotWork);
    sumKernel.setArg(7, prefixSumInput);
    sumKernel.setArg(8, prefixSumOutput);

    // Single work group: global size == local size.
    cl::NDRange noOffset;
    cl::NDRange globalSize(numThreads, 1);
    cl::NDRange localSize(numThreads, 1);
    cl_int status = queue.enqueueNDRangeKernel(sumKernel,
                                               noOffset, globalSize, localSize);
}
示例12: findMinSeamVert
// Enqueues the findMinSeamVert kernel: scans the bottom row of the energy
// matrix and writes the minimum seam energy and its column index to
// vertMinEnergy / vertMinIdx. `event` signals completion; `deps` are the
// events this launch must wait on.
void findMinSeamVert(cl::Context &ctx,
                     cl::CommandQueue &cmdQueue,
                     cl::Event &event,
                     std::vector<cl::Event> &deps,
                     cl::Buffer &energyMatrix,
                     cl::Buffer &vertMinEnergy,
                     cl::Buffer &vertMinIdx,
                     int width,
                     int height,
                     int pitch,
                     int colsRemoved) {
    // Accumulate setArg errors; any non-CL_SUCCESS bit makes the OR non-zero.
    cl_int err = findMinSeamVertKernel.setArg(0, energyMatrix);
    err |= findMinSeamVertKernel.setArg(1, vertMinEnergy);
    err |= findMinSeamVertKernel.setArg(2, vertMinIdx);
    err |= findMinSeamVertKernel.setArg(3, cl::__local(256 * sizeof(float)));
    err |= findMinSeamVertKernel.setArg(4, cl::__local(256 * sizeof(float)));
    err |= findMinSeamVertKernel.setArg(5, width);
    err |= findMinSeamVertKernel.setArg(6, height);
    err |= findMinSeamVertKernel.setArg(7, pitch);
    err |= findMinSeamVertKernel.setArg(8, colsRemoved);
    if (err != CL_SUCCESS) {
        std::cerr << "Error setting findMinSeamVert arguments." << std::endl;
        exit(-1);
    }

    // This kernel could be written to use more than one work group, but its probably not worth it.
    // Single 256-thread work group (global == local).
    cl::NDRange zeroOffset = cl::NDRange(0);
    cl::NDRange groupSize = cl::NDRange(256);
    cl::NDRange totalSize = cl::NDRange(256);
    err = cmdQueue.enqueueNDRangeKernel(findMinSeamVertKernel,
                                        zeroOffset,
                                        totalSize,
                                        groupSize,
                                        &deps,
                                        &event);
    if (err != CL_SUCCESS) {
        std::cerr << "Error enqueuing computeSeams kernel for execution." << std::endl;
        exit(-1);
    }
}
示例13: enqueue
// Enqueues a reduction over inBuffer[first, first + elements), writing the
// single reduced value to outBuffer[outPosition]. `events` are dependencies
// to wait on; on return *event (if non-NULL) signals completion of the
// reduction kernel.
void Reduce::enqueue(
const cl::CommandQueue &commandQueue,
const cl::Buffer &inBuffer,
const cl::Buffer &outBuffer,
::size_t first,
::size_t elements,
::size_t outPosition,
const VECTOR_CLASS<cl::Event> *events,
cl::Event *event)
{
/* Validate parameters */
if (first + elements < first)
{
// Only happens if first + elements overflows. size_t is unsigned so behaviour
// is well-defined.
throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: range out of input buffer bounds");
}
// The requested range must fit within the input buffer (in element units).
if (inBuffer.getInfo<CL_MEM_SIZE>() / elementSize < first + elements)
throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: range out of input buffer bounds");
// The output slot must fit within the output buffer.
if (outBuffer.getInfo<CL_MEM_SIZE>() / elementSize <= outPosition)
throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: output position out of buffer bounds");
// The kernel reads inBuffer and writes outBuffer; reject buffers whose
// access flags forbid that.
if (!(inBuffer.getInfo<CL_MEM_FLAGS>() & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY)))
{
throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: input buffer is not readable");
}
if (!(outBuffer.getInfo<CL_MEM_FLAGS>() & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY)))
{
throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: output buffer is not writable");
}
if (elements == 0)
throw cl::Error(CL_INVALID_GLOBAL_WORK_SIZE, "clogs::Reduce::enqueue: elements is zero");
// Split the range into reduceBlocks blocks, each a multiple of the
// work-group size, so every work group processes one contiguous block.
const ::size_t blockSize = roundUp(elements, reduceWorkGroupSize * reduceBlocks) / reduceBlocks;
// Arg 0 and arg 6 are presumably bound elsewhere (persistent scratch
// state set at construction) — only the per-call args are set here.
reduceKernel.setArg(1, outBuffer);
reduceKernel.setArg(2, (cl_uint) outPosition);
reduceKernel.setArg(3, inBuffer);
reduceKernel.setArg(4, (cl_uint) first);
reduceKernel.setArg(5, (cl_uint) elements);
reduceKernel.setArg(7, (cl_uint) blockSize);
cl::Event reduceEvent;
commandQueue.enqueueNDRangeKernel(
reduceKernel,
cl::NullRange,
cl::NDRange(reduceWorkGroupSize * reduceBlocks),
cl::NDRange(reduceWorkGroupSize),
events, &reduceEvent);
doEventCallback(reduceEvent);
if (event != NULL)
*event = reduceEvent;
}
开发者ID:ResearchDaniel,项目名称:Correlated-Photon-Mapping-for-Interactive-Global-Illumination-of-Time-Varying-Volumetric-Data,代码行数:54,代码来源:reduce.cpp
示例14: zeroBuffer
// Fills `buffer` with zeros by launching the "zeroMemory" kernel with one
// work item per element (`size`). The launch waits on `events` (may be
// NULL) and `event` signals completion — callers synchronize on it.
void zeroBuffer(::cl::Context &context, ::cl::CommandQueue &commandQueue, ::cl::Buffer &buffer, size_t size, std::vector<::cl::Event> *events, ::cl::Event &event)
{
    ::cl::Kernel kernel = getKernel(context, "zeroMemory", "utils.cl", utils_cl);
    // BUG FIX (cleanup): the original stored both setArg and enqueue return
    // codes in a local `status` that was never read (the first assignment
    // was silently overwritten, the second discarded). The dead stores are
    // removed; error reporting is left to the caller via the CL event/queue.
    kernel.setArg(0, buffer);

    ::cl::NDRange globalThreads(size);
    commandQueue.enqueueNDRangeKernel(kernel, ::cl::NullRange, globalThreads, ::cl::NullRange, events, &event);
}
示例15: backtrack
// Enqueues the backtrack kernel: starting from the minimum-energy column
// (vertMinIdx), walks the energy matrix bottom-up and records the chosen
// column per row in vertSeamPath. `event` signals completion; `deps` are
// the events this launch waits on.
void backtrack(cl::Context &ctx,
               cl::CommandQueue &cmdQueue,
               cl::Event &event,
               std::vector<cl::Event> &deps,
               cl::Buffer &energyMatrix,
               cl::Buffer &vertSeamPath,
               cl::Buffer &vertMinIdx,
               int width,
               int height,
               int pitch,
               int colsRemoved) {
    // Accumulate setArg errors; any failure makes the OR non-zero.
    cl_int err = backtrackKernel.setArg(0, energyMatrix);
    err |= backtrackKernel.setArg(1, vertSeamPath);
    err |= backtrackKernel.setArg(2, vertMinIdx);
    err |= backtrackKernel.setArg(3, width);
    err |= backtrackKernel.setArg(4, height);
    err |= backtrackKernel.setArg(5, pitch);
    err |= backtrackKernel.setArg(6, colsRemoved);
    if (err != CL_SUCCESS) {
        std::cerr << "Error setting backtrack kernel arguments." << std::endl;
        exit(-1);
    }

    // NOTE(review): 256 global work items with a local size of 1 launches
    // 256 single-thread work groups (unlike findMinSeamVert's 256/256) —
    // confirm the kernel is written for this configuration.
    cl::NDRange zeroOffset = cl::NDRange(0);
    cl::NDRange groupSize = cl::NDRange(1);
    cl::NDRange totalSize = cl::NDRange(256);
    err = cmdQueue.enqueueNDRangeKernel(backtrackKernel,
                                        zeroOffset,
                                        totalSize,
                                        groupSize,
                                        &deps,
                                        &event);
    if (err != CL_SUCCESS) {
        std::cerr << "Error enqueueing backTrack kernel for execution." << std::endl;
        exit(-1);
    }
}