当前位置: 首页>>代码示例>>C++>>正文


C++ clEnqueueNDRangeKernel函数代码示例

本文整理汇总了C++中clEnqueueNDRangeKernel函数的典型用法代码示例。如果您正苦于以下问题:C++ clEnqueueNDRangeKernel函数的具体用法?C++ clEnqueueNDRangeKernel怎么用?C++ clEnqueueNDRangeKernel使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了clEnqueueNDRangeKernel函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: clSetKernelArg

void AdvancedMaxPoolingLayer::BackPropagate() {
#ifdef BUILD_OPENCL_MAX
  input_->delta.MoveToGPU(true);
  output_->delta.MoveToGPU();
  maximum_mask_.MoveToGPU();
  
  cl_uint error = 0;
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 0, sizeof (cl_mem), &input_->delta.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 1, sizeof (cl_mem), &maximum_mask_.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 2, sizeof (cl_mem), &output_->delta.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 3, sizeof (unsigned int), &input_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 4, sizeof (unsigned int), &input_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 5, sizeof (unsigned int), &maps_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 6, sizeof (unsigned int), &output_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 7, sizeof (unsigned int), &output_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 8, sizeof (unsigned int), &region_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 9, sizeof (unsigned int), &region_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 10, sizeof (unsigned int), &stride_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 11, sizeof (unsigned int), &stride_height_);
  if (error != CL_SUCCESS) {
    FATAL ("Error setting kernel args: " << (signed int) error);
  }

  size_t global_work_size[] = { input_width_, input_height_, maps_* input_->data.samples() };

  error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_amaximumBackward, 3, NULL,
                                  global_work_size, NULL, 0, NULL, NULL);
  if (error != CL_SUCCESS) {
    FATAL ("Error enqueueing kernel: " << (signed int) error);
  }

#ifdef BRUTAL_FINISH
  error = clFinish (CLHelper::queue);
  if (error != CL_SUCCESS) {
    FATAL ("Error finishing command queue: " << (signed int) error);
  }
#endif

#else
  
#define MP_HELPER_MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
  
#pragma omp parallel for default(shared)
  for(std::size_t sample = 0; sample < input_->data.samples(); sample++) {
    for (unsigned int map = 0; map < maps_; map++) {
      for (unsigned int ix = 0; ix < input_width_; ix++) {
        for(unsigned int iy = 0; iy < input_width_; iy++) {
          const unsigned int mask_index = ix + input_width_ * iy;
          const unsigned int oxstart = (ix < region_width_) ? 
            0 : (ix - region_width_) / stride_width_+ 1;
          const unsigned int oxend = MP_HELPER_MIN(ix / stride_width_ + 1, output_width_);
          
          const unsigned int oystart = (iy < region_height_) ? 
            0 : (iy - region_height_) / stride_height_ + 1;
          const unsigned int oyend = MP_HELPER_MIN(iy / stride_height_ + 1, output_height_);
          
          datum sum = 0.0;
          for (unsigned int oy = oystart; oy < oyend; oy++) {
            for (unsigned int ox = oxstart; ox < oxend; ox++) {
              if(*maximum_mask_.data_ptr_const(ox, oy, map, sample) == mask_index)
                sum += *output_->delta.data_ptr_const(ox, oy, map, sample);
            }
          }
          *(input_->delta.data_ptr(ix, iy, map, sample)) = sum;
        }
      }
    }
  }
#endif
}
开发者ID:cnndabbler,项目名称:cn24,代码行数:70,代码来源:AdvancedMaxPoolingLayer.cpp

示例2: task

int task(cl_context context, cl_device_id device, cl_command_queue queue, void* data_)
{
  const TaskData* data = (const TaskData*) data_;
  cl_int err;

  if (data->points % data->points_per_work_item)
    check_error(CLQMC_INVALID_VALUE, "points must be a multiple of points_per_work_item");

  if (data->replications % data->replications_per_work_item)
    check_error(CLQMC_INVALID_VALUE, "replications must be a multiple of replications_per_work_item");


  // Lattice buffer

  size_t pointset_size;
  // gen_vec is given in common.c
  clqmcLatticeRule* pointset = clqmcLatticeRuleCreate(data->points, DIMENSION, gen_vec, &pointset_size, &err);
  check_error(err, NULL);

  cl_mem pointset_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
      pointset_size, pointset, &err);
  check_error(err, "cannot create point set buffer");


  // Shifts buffer
  
  clqmc_fptype* shifts = (clqmc_fptype*) malloc(data->replications * DIMENSION * sizeof(clqmc_fptype));

  // populate random shifts using a random stream
  clrngMrg31k3pStream* stream = clrngMrg31k3pCreateStreams(NULL, 1, NULL, &err);
  check_error(err, NULL);
  for (cl_uint i = 0; i < data->replications; i++)
      for (cl_uint j = 0; j < DIMENSION; j++)
          shifts[i * DIMENSION + j] = clrngMrg31k3pRandomU01(stream);
  err = clrngMrg31k3pDestroyStreams(stream);
  check_error(err, NULL);

  cl_mem shifts_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
      data->replications * DIMENSION * sizeof(clqmc_fptype), shifts, &err);
  check_error(err, "cannot create shifts buffer");


  // Output buffer

  size_t points_block_count = data->points / data->points_per_work_item;
  cl_mem output_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, 
      data->replications * points_block_count * sizeof(clqmc_fptype), NULL, &err);
  check_error(err, "cannot create output buffer");


  // OpenCL kernel

  cl_program program = build_program_from_file(context, device,
      "client/DocsTutorial/example4_kernel.cl",
      NULL);
  check_error(err, NULL);
  cl_kernel kernel = clCreateKernel(program, "simulateWithRQMC", &err);
  check_error(err, "cannot create kernel");

  int iarg = 0;
  err  = clSetKernelArg(kernel, iarg++, sizeof(pointset_buf), &pointset_buf);
  err |= clSetKernelArg(kernel, iarg++, sizeof(shifts_buf), &shifts_buf);
  err |= clSetKernelArg(kernel, iarg++, sizeof(data->points_per_work_item), &data->points_per_work_item);
  err |= clSetKernelArg(kernel, iarg++, sizeof(data->replications), &data->replications);
  err |= clSetKernelArg(kernel, iarg++, sizeof(output_buf), &output_buf);
  check_error(err, "cannot set kernel arguments");


  // Execution

  cl_event ev;
  size_t global_size = (data->replications / data->replications_per_work_item) * points_block_count;
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &ev);
  check_error(err, "cannot enqueue kernel");

  err = clWaitForEvents(1, &ev);
  check_error(err, "error waiting for events");

  clqmc_fptype* output = (clqmc_fptype*) malloc(data->replications * points_block_count * sizeof(clqmc_fptype));
  err = clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0,
      data->replications * points_block_count * sizeof(clqmc_fptype), output, 0, NULL, NULL);
  check_error(err, "cannot read output buffer");

  printf("\nAdvanced randomized quasi-Monte Carlo integration:\n\n");

  err = clqmcLatticeRuleWriteInfo(pointset, stdout);
  check_error(err, NULL);
  printf("\n");

  rqmcReport(data->replications, data->points, points_block_count, output);


  // Clean up

  clReleaseEvent(ev);
  clReleaseMemObject(output_buf);
  clReleaseMemObject(pointset_buf);
  clReleaseKernel(kernel);
  clReleaseProgram(program);

//.........这里部分代码省略.........
开发者ID:umontreal-simul,项目名称:clQMC,代码行数:101,代码来源:example4.c

示例3: main


//.........这里部分代码省略.........
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_int16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_int16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
开发者ID:xianggong,项目名称:m2c-llvm-devtools-host,代码行数:67,代码来源:relational_greater_than_or_equal_to_ulong16ulong16_src.c

示例4: opencl_scanhash

static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
				int64_t __maybe_unused max_nonce)
{
	const int thr_id = thr->id;
	struct opencl_thread_data *thrdata = thr->cgpu_data;
	struct cgpu_info *gpu = thr->cgpu;
	_clState *clState = clStates[thr_id];
	const cl_kernel *kernel = &clState->kernel;
	const int dynamic_us = opt_dynamic_interval * 1000;

	cl_int status;
	size_t globalThreads[1];
	size_t localThreads[1] = { clState->wsize };
	int64_t hashes;

	/* Windows' timer resolution is only 15ms so oversample 5x */
	if (gpu->dynamic && (++gpu->intervals * dynamic_us) > 70000) {
		struct timeval tv_gpuend;
		double gpu_us;

		gettimeofday(&tv_gpuend, NULL);
		gpu_us = us_tdiff(&tv_gpuend, &gpu->tv_gpustart) / gpu->intervals;
		if (gpu_us > dynamic_us) {
			if (gpu->intensity > MIN_INTENSITY)
				--gpu->intensity;
		} else if (gpu_us < dynamic_us / 2) {
			if (gpu->intensity < MAX_INTENSITY)
				++gpu->intensity;
		}
		memcpy(&(gpu->tv_gpustart), &tv_gpuend, sizeof(struct timeval));
		gpu->intervals = 0;
	}

	set_threads_hashes(clState->vwidth, &hashes, globalThreads, localThreads[0], &gpu->intensity);
	if (hashes > gpu->max_hashes)
		gpu->max_hashes = hashes;

	status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
		return -1;
	}

	if (clState->goffset) {
		size_t global_work_offset[1];

		global_work_offset[0] = work->blk.nonce;
		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset,
						globalThreads, localThreads, 0,  NULL, NULL);
	} else
		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
						globalThreads, localThreads, 0,  NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
		return -1;
	}

	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
			BUFFERSIZE, thrdata->res, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status);
		return -1;
	}

	/* The amount of work scanned can fluctuate when intensity changes
	 * and since we do this one cycle behind, we increment the work more
	 * than enough to prevent repeating work */
	work->blk.nonce += gpu->max_hashes;

	/* This finish flushes the readbuffer set with CL_FALSE in clEnqueueReadBuffer */
	clFinish(clState->commandQueue);

	/* FOUND entry is used as a counter to say how many nonces exist */
	if (thrdata->res[FOUND]) {
		/* Clear the buffer again */
		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
				BUFFERSIZE, blank_res, 0, NULL, NULL);
		if (unlikely(status != CL_SUCCESS)) {
			applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
			return -1;
		}
		applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id);
		postcalc_hash_async(thr, work, thrdata->res);
		memset(thrdata->res, 0, BUFFERSIZE);
		/* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */
		clFinish(clState->commandQueue);
	}

	return hashes;
}
开发者ID:poorleno,项目名称:cgminer,代码行数:90,代码来源:driver-opencl.c

示例5: kernel_gpu_opencl_wrapper_2


//.........这里部分代码省略.........
	clSetKernelArg(	kernel, 
					4, 
					sizeof(cl_mem), 
					(void *) &offsetD);
	clSetKernelArg(	kernel, 
					5, 
					sizeof(cl_mem), 
					(void *) &lastKnodeD);
	clSetKernelArg(	kernel, 
					6, 
					sizeof(cl_mem), 
					(void *) &offset_2D);
	clSetKernelArg(	kernel, 
					7, 
					sizeof(cl_mem), 
					(void *) &startD);
	clSetKernelArg(	kernel, 
					8, 
					sizeof(cl_mem), 
					(void *) &endD);
	clSetKernelArg(	kernel, 
					9, 
					sizeof(cl_mem), 
					(void *) &ansDStart);
	clSetKernelArg(	kernel, 
					10, 
					sizeof(cl_mem), 
					(void *) &ansDLength);

	//====================================================================================================100
	//	Kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
	error = clFinish(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time4 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY (CONTD.)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansDStart
开发者ID:briansp2020,项目名称:HIP-Examples,代码行数:67,代码来源:kernel_gpu_opencl_wrapper_2.c

示例6: clCreateFromGLTexture3D

void OpenCLExecuter::ocl_filter_shared(void)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	cl_sampler sampler;					// OpenCL sampler
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	// set Local work size dimensions
//	size_t local_threads[3] ={256,256,64};
	// set Global work size dimensions
//	size_t global_threads[3] ={roundup((int) volobj->texwidth/local_threads[0], 0)*local_threads[0], roundup((int) volobj->texheight/local_threads[1], 0)*local_threads[1], roundup((int) volobj->texdepth/local_threads[2], 0)*local_threads[2]};

	// set Global work size dimensions
	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateFromGLTexture3D (ocl_wrapper->context, CL_MEM_READ_WRITE, GL_TEXTURE_3D, 0, volobj->TEXTURE3D_RED, &err);
	printf("OPENCL: clCreateFromGLTexture3D: %s\n", ocl_wrapper->get_error(err));

	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// create a sampler object
	sampler = clCreateSampler(ocl_wrapper->context, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &err);
	printf("OPENCL: clCreateSampler: %s\n", ocl_wrapper->get_error(err));
 
    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(sampler), (void*)&sampler);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);

	// grab input data from OpenGL, compute, copy the results back to OpenGL
	// Runs asynchronous to host, up until blocking clFinish at the end

	glFinish();
	glFlush();
	
	// grab the OpenGL texture object for read/writing from OpenCL
	err = clEnqueueAcquireGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,NULL,NULL);
	printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));

	// Execute a kernel
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	/*
	// Blocking read of results from GPU to Host
	int size = volobj->texwidth*volobj->texheight*volobj->texdepth;
	unsigned char* result = new unsigned char[size];
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, result, 0, NULL, NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
	for(int i=0; i<size; i++) volobj->texture3d[3*i+0] = result[i];
	delete[] result;
	*/

	// copy OpenCL buffer to OpenGl texture
	size_t corigin[3] = {0,0,0};
	size_t cdimensions[3] = {(unsigned int)volobj->texwidth, (unsigned int)volobj->texheight, (unsigned int)volobj->texdepth};
	err = clEnqueueCopyBufferToImage(ocl_wrapper->commandQue , dst_buffer, src_buffer, 0, corigin, cdimensions, 0, NULL, NULL);
	printf("OPENCL: clEnqueueCopyBufferToImage: %s\n", ocl_wrapper->get_error(err));

	//make sure we block until we are done.
	//err = clFinish(ocl_wrapper->commandQue);
	//printf("OPENCL: clFinish: %s\n", ocl_wrapper->get_error(err));
	
	//release opengl objects now
	err = clEnqueueReleaseGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,0,0);
	printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(dst_buffer)clReleaseMemObject(dst_buffer);
}
开发者ID:ut666,项目名称:VolViewer,代码行数:94,代码来源:OpenCLExecuter.cpp

示例7: roundup

void OpenCLExecuter::ocl_parrallelReduction(void)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem tmp_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	size_t numWorkGroups;
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = 65536; //65536 // Length of float arrays to process

	// set Local work size dimensions
	szLocalWorkSize = 512;
	// set Global work size dimensions
	szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize;  
	//szGlobalWorkSize = iNumElements;
	numWorkGroups = (float)szGlobalWorkSize/(float)szLocalWorkSize;
	printf("OPENCL: number of elements: %d\n", (int)iNumElements);
	printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize);
	printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize);
	printf("OPENCL: work groups: %d\n", (int)(numWorkGroups));
	
	//temp array
	int* data = new int[iNumElements];

	for(int i=0; i<iNumElements; i++)
		data[i] = randomFloat(1.0, (float)iNumElements);

	data[iNumElements/2] = -100.0;

	//for(int i=0; i<iNumElements; i++)
	//	printf("data: %d\n", data[i]);

	size_t global_threads[1] ={iNumElements};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the temp buffer memory object
	tmp_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
	
	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "min_reduce", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(int)*szLocalWorkSize, NULL);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&iNumElements);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end
	
	int numb_iterations = sqrt((float)numWorkGroups);
	numb_iterations=0;
	bool cont = true;

	Timer timer;
	timer.startTimer();
	//for(int i=0; i<numb_iterations; i++)
	while(cont)
	{
		// Write data from host to GPU
		err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL);
		printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
		// Launch kernel 
		err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
		printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

		// Blocking read of results from GPU to Host
		err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(int) * iNumElements, data, 0, NULL,  NULL);
		printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
		
		numb_iterations++;
		if(data[1]==0) cont = false;

		//printf("min: %d\n", data[0]);
		for(int i=0; i<numWorkGroups; i++)
			printf("min: %d\n", data[i]);
	}
	timer.endTimer("GPU find min");

	timer.startTimer();
	int min=iNumElements;
	for(int i=0; i<iNumElements; i++)
//.........这里部分代码省略.........
开发者ID:ut666,项目名称:VolViewer,代码行数:101,代码来源:OpenCLExecuter.cpp

示例8: main

int main()
{
	// Initiating opencl
	cl_device_id device_id;
	cl_int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in device."<<std::endl;
        return EXIT_FAILURE;
    }
    cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  	if (!context)
    {
        std::cout<<"Error in context."<<std::endl;
        return EXIT_FAILURE;
    }
    cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        std::cout<<"Error in command queue."<<std::endl;
        return EXIT_FAILURE;
    }
	std::ifstream in("transpMatrix.cl");
	std::string contents((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
    const char* kernelSource = contents.c_str();
    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    if (!program)
    {
        std::cout<<"Error in program."<<std::endl;
        return EXIT_FAILURE;
    }
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        std::cout<<"Error in compiling the opencl program."<<std::endl;
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        std::cout<<buffer<<std::endl;
        return EXIT_FAILURE;
    }
    cl_kernel kernel = clCreateKernel(program, "simplecl", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        std::cout<<"Error in kernel "<<err<<std::endl;
        return EXIT_FAILURE;
    }

    // Data to compute
    float* data =  new float[count*count];

    for(int i = 0; i < count; ++i)
    {
        for(int j = 0; j < count; ++j)
        {
            data[i*count+j] = rand()%10;
            std::cout<<data[i*count+j]<<" ";
        }
        std::cout<<std::endl;
    }
    std::cout<<std::endl;
    // Creating communication buffers
    cl_mem input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count*count, NULL, NULL);
    cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count*count, NULL, NULL);

	if (!input || !output)
    {
        std::cout<<"Error in allocation."<<std::endl;
        return EXIT_FAILURE;
    }  

    // Copy data to input buffer
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in copy."<<std::endl;
        return EXIT_FAILURE;
    }

 	err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(int), &count);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in argument."<<std::endl;
        return EXIT_FAILURE;
    }
    size_t local[] = {1,1};
    size_t global[] = {10,10};
    // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    // if (err != CL_SUCCESS)
    // {
    //     std::cout<<"Error in getting loal."<<std::endl;
    //     return EXIT_FAILURE;
    // }
    err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in pushing to queue "<<err<<std::endl;
//.........这里部分代码省略.........
开发者ID:AnisB,项目名称:Misc,代码行数:101,代码来源:transpMatrix.cpp

示例9: main

int main(void) {
    // se crea los 2 vectores de entrada
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }
 
    // cargamos el kernel en source_str
    FILE *fp;
    char *source_str;
    size_t source_size;
 
    fp = fopen("vector_add_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );
 
    // obtenemos las plataformas y informacion de los devices
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, 
            &device_id, &ret_num_devices);
 
    // creamos un contexto OpenCL
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
 
    // creamos la cola de comandos
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
    // creamos el buffer de memoria en el device para cada vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
 
    // copiamos los vectores A y B a sus respectivas memorias buffer
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
 
    // creamos un programa para el kernel 
    cl_program program = clCreateProgramWithSource(context, 1, 
            (const char **)&source_str, (const size_t *)&source_size, &ret);
 
    // generamos el programa
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
    // creamos el kernel 
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
 
    // establecemos los argumentos del kernel 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
 
    // ejecutamos el kernel de la lista
    size_t global_item_size = LIST_SIZE; 
    size_t local_item_size = 64; // dividimos los work items en grupos de 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);
 
    // copiamos la memoria buffer C del device hacia la variable local C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
 
    // muestra el resultado
    for(i = 0; i < LIST_SIZE; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);
 
    
    free(A);
    free(B);
    free(C);
    return 0;
}
开发者ID:pioh123,项目名称:parallel,代码行数:90,代码来源:vector.c

示例10: main


//.........这里部分代码省略.........
     chk(status, "clCreatebuffer");

     // perform computing on GPU
     // copy data from host to device
     status = clEnqueueWriteBuffer(cmdQueue, u_d, CL_FALSE, 0, size, u_h, 0, NULL, NULL);
     chk(status,"ClEnqueueWriteBuffer");
     status = clEnqueueWriteBuffer(cmdQueue, f_d, CL_FALSE, 0, size1, f_h, 0, NULL, NULL);
     chk(status, "clEnqueueWriteBuffer");
     
     // create program with source code
     cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource, NULL, &status);
     chk(status, "clCreateProgramWithSource");

     // Compile program for the device
     status = clBuildProgram(program, numDevices, devices, NULL, NULL,NULL);
      // chk(status, "ClBuildProgram");
      if(status != CL_SUCCESS){
      printf("clBuildProgram failed (%d) \n", status);
      size_t log_size;
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
      
      char *log = (char *) malloc(log_size);
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
      printf("%s\n", log);
      exit(-1);
     }
      printf("successfully built program \n");
      
     // Create lattice-boltzman kernel
     cl_kernel kernel, kernel1;
     kernel = clCreateKernel(program, "lbiteration", &status);
     kernel1 = clCreateKernel(program, "Denrho", &status);
     chk(status, "clCreateKernel");
      printf("successfully create kernel \n");
     
     // Associate the input and output buffers with the kernel
     status = clSetKernelArg(kernel,0, sizeof(cl_mem), &f_d);
     status |= clSetKernelArg(kernel1,0, sizeof(cl_mem), &u_d);
     status |= clSetKernelArg(kernel1,1, sizeof(cl_mem), &f_d);
     status |= clSetKernelArg(kernel, 1, sizeof(int), &ArraySizeX);
     status |= clSetKernelArg(kernel1,2, sizeof(int), &ArraySizeX);
     status |= clSetKernelArg(kernel, 2, sizeof(int), &ArraySizeY);
     status |= clSetKernelArg(kernel1,3, sizeof(int),&ArraySizeY);
     chk(status, "clSerKernelArg");
    
     // set the work dimensions
     size_t localworksize[2] = {BLOCK_SIZE_X,BLOCK_SIZE_Y};
     int nBLOCKSX = (ArraySizeX-2)/(BLOCK_SIZE_X -2);
     int nBLOCKSY = (ArraySizeY-2)/(BLOCK_SIZE_Y -2);
     size_t globalworksize[2] = {nBLOCKSX*BLOCK_SIZE_X,nBLOCKSY*BLOCK_SIZE_Y};

     // loop the kernel
     for( nsteps = 0; nsteps < 100; nsteps++){
     status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalworksize,localworksize,0,NULL,&event);
     clWaitForEvents(1 , &event);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
     total_time += time_end - time_start;
     }
     printf("Good so far \n");
     status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 2, NULL, globalworksize,localworksize,0,NULL,&event);
     chk(status, "clEnqueueNDR");
     clWaitForEvents(1 , &event);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
     total_time += time_end - time_start;
     printf("running time is %0.3f \n",(total_time/1000000000.0));
     // retrieve data from device
     status = clEnqueueReadBuffer(cmdQueue, u_d, CL_TRUE, 0, size, u_h, 0, NULL, NULL);
     chk(status, "clEnqueueReadBuffer");

     // Output results
     fp = fopen("SolutionCL.txt", "wt");
     for(i= 0;i<ArraySizeX;i++){
       for(j=0;j<ArraySizeY;j++)
         fprintf(fp, " %f", u_h[i*ArraySizeY+j]);
        fprintf(fp, "\n");
     } 
     fclose(fp);

     //cleanup
     clReleaseKernel(kernel);
     clReleaseKernel(kernel1);
     clReleaseProgram(program);
     clReleaseCommandQueue(cmdQueue);
     clReleaseMemObject(u_d);
     clReleaseMemObject(f_d);
     clReleaseContext(context);

     free(u_h);
     free(f_h);
     free(platforms);
     free(devices);
     
     return 0;
}
开发者ID:hietwll,项目名称:parallel-lattice-Boltzmann,代码行数:101,代码来源:Lattice_BoltzmannOpenCL.c

示例11: main


//.........这里部分代码省略.........



   d_C = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_B, h_B, &errcode);


   FILE* fp = fopen("hw2.cl", "r");
   fseek (fp , 0 , SEEK_END);
   const size_t lSize = ftell(fp);
   rewind(fp);
   unsigned char* buffer;
   buffer = (unsigned char*) malloc (lSize);
   fread(buffer, 1, lSize, fp);
   fclose(fp);

   cl_int status;
   clProgram = clCreateProgramWithBinary(clGPUContext,
                1, (const cl_device_id *)clDevices,
                &lSize, (const unsigned char**)&buffer,
                &status, &errcode);
   errcode = clBuildProgram(clProgram, 0, NULL, NULL,
                NULL, NULL);


   errcode = clBuildProgram(clProgram, 0,
              NULL, NULL, NULL, NULL);


   clKernel = clCreateKernel(clProgram,
               "MM", &errcode);




   size_t globalWorkSize[2];

   int wA = WA;
   int wC = WC;
   errcode = clSetKernelArg(clKernel, 0,
              sizeof(cl_mem), (void *)&d_C);
   errcode |= clSetKernelArg(clKernel, 1,
              sizeof(cl_mem), (void *)&d_A);
   errcode |= clSetKernelArg(clKernel, 2,
              sizeof(cl_mem), (void *)&d_B);
   errcode |= clSetKernelArg(clKernel, 3,
              sizeof(int), (void *)&wA);
   errcode |= clSetKernelArg(clKernel, 4,
              sizeof(int), (void *)&wC);



   globalWorkSize[0] = 16;
   globalWorkSize[1] = 16;

   cl_ulong time_start, time_end, total_time = 0;

   errcode = clEnqueueNDRangeKernel(clCommandQue,
              clKernel, 2, NULL, globalWorkSize,
              NULL, 0, NULL, &mm);
   printf("Average time = %lu\n");
   clFinish(clCommandQue);

         clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START,
              sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END,
               sizeof(time_end), &time_end, NULL);
         total_time += time_end - time_start;


         printf("Average time = %lu\n", total_time);
   errcode = clEnqueueReadBuffer(clCommandQue,
              d_C, CL_TRUE, 0, mem_size_C,
              h_C, 0, NULL, NULL);



   free(h_A);
   free(h_B);
   free(h_C);

   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);

   free(clDevices);

   clReleaseContext(clGPUContext);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseCommandQueue(clCommandQue);

}
开发者ID:hemantjp,项目名称:HW2,代码行数:101,代码来源:hw2.c

示例12: main

int main(int argc, char *argv[]) 
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, 
                                    localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);
//.........这里部分代码省略.........
开发者ID:NatTuck,项目名称:cakemark,代码行数:101,代码来源:ep.c

示例13: clCreateBuffer

void OpenCLExecuter::ocl_filterBoundingBox(int channel, int window_size)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem bbmin_buffer;				// OpenCL device source buffer
	cl_mem bbmax_buffer;				// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	cl_int4 minbb;
	cl_int4 maxbb;

	minbb.s[0] = minbb.s[1] = minbb.s[2] = 8192;
	maxbb.s[0] = maxbb.s[1] = maxbb.s[2] = -8192;

	int iNumElements = 3*volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the destination buffer memory object
	bbmin_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(cl_int4), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

 	bbmax_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(cl_int4), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&bbmin_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&bbmax_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&channel);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(bbmin_buffer)clReleaseMemObject(bbmin_buffer);
    if(bbmax_buffer)clReleaseMemObject(bbmax_buffer);
	
	maxbb.s[0] += (float)window_size/2.0;
	maxbb.s[1] += (float)window_size/2.0;
	maxbb.s[2] += (float)window_size/2.0;

	minbb.s[0] -= (float)window_size/2.0;
	minbb.s[1] -= (float)window_size/2.0;
	minbb.s[2] -= (float)window_size/2.0;

	maxbb.s[0] += 2;
	maxbb.s[1] += 2;
	maxbb.s[2] += 2;

	minbb.s[0] -= 2;
	minbb.s[1] -= 2;
	minbb.s[2] -= 2;	
//.........这里部分代码省略.........
开发者ID:ut666,项目名称:VolViewer,代码行数:101,代码来源:OpenCLExecuter.cpp

示例14: main


//.........这里部分代码省略.........
	ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE);
	ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR);
	ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE);
	// Bake the plan. 
	ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL);
	// Create temporary buffer. 
	cl_mem tmpBufferu = 0;
	cl_mem tmpBufferv = 0;
	// Size of temp buffer. 
	size_t tmpBufferSize = 0;
	status = clfftGetTmpBufSize(planHandle, &tmpBufferSize);
	if ((status == 0) && (tmpBufferSize > 0)) {
		tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret);
		tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret);
		if (ret != CL_SUCCESS)
			printf("Error with tmpBuffer clCreateBuffer\n");
	}
//kernel grid
    	fp = fopen("./grid.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); }
    	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );
	
	p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL);
        grid = clCreateKernel(p_grid, "grid", &ret);
//first x
	cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret);
        ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx);
	size_t global_work_size_x[3] = {Nx, 0, 0};
        ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL);
	ret = clFinish(command_queue);
//then y
	cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret);	
	ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny);
	size_t global_work_size_y[3] = {Ny, 0, 0};

	ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL);
	ret = clFinish(command_queue);

//last z
	cl_z = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret);
	ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz);
	size_t global_work_size_z[3] = {Nz, 0, 0};
	ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
	ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL);
	ret = clFinish(command_queue);
    	ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid);

//kernel initial data
    	fp = fopen("./initialdata.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); }
	free(source_str);    	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
开发者ID:MichaelQuell,项目名称:GrayScott-OpenCl,代码行数:67,代码来源:grayscottOpenCLs.c

示例15: main


//.........这里部分代码省略.........
        printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Now that we know the size of the work-groups, we can set the number of
    // work-groups, the actual number of steps, and the step size
    nwork_groups = in_nsteps/(work_group_size*niters);

    if (nwork_groups < 1)
    {
        err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL);
        work_group_size = in_nsteps / (nwork_groups * niters);
    }

    nsteps = work_group_size * niters * nwork_groups;
    step_size = 1.0f/(float)nsteps;
    h_psum = calloc(sizeof(float), nwork_groups);
    if (!h_psum)
    {
        printf("Error: could not allocate host memory for h_psum\n");
        return EXIT_FAILURE;
    }

    printf(" %ld work-groups of size %ld. %d Integration steps\n",
            nwork_groups,
            work_group_size,
            nsteps);

    d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // Set kernel arguments
    err  = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters);
    err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size);
    err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL);
    err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments!\n");
        return EXIT_FAILURE;
    }

    // Execute the kernel over the entire range of our 1D input data set
    // using the maximum number of work items for this device
    size_t global = nwork_groups * work_group_size;
    size_t local = work_group_size;
    double rtime = wtime();
    err = clEnqueueNDRangeKernel(
        commands,
        kernel_pi,
        1, NULL,
        &global,
        &local,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to execute kernel\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }


    err = clEnqueueReadBuffer(
        commands,
        d_partial_sums,
        CL_TRUE,
        0,
        sizeof(float) * nwork_groups,
        h_psum,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // complete the sum and compute the final integral value on the host
    pi_res = 0.0f;
    for (unsigned int i = 0; i < nwork_groups; i++)
    {
        pi_res += h_psum[i];
    }
    pi_res *= step_size;

    rtime = wtime() - rtime;

    printf("\nThe calculation ran in %lf seconds\n", rtime);
    printf(" pi = %f for %d steps\n", pi_res, nsteps);

    // clean up
    clReleaseMemObject(d_partial_sums);
    clReleaseProgram(program);
    clReleaseKernel(kernel_pi);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    free(kernelsource);
    free(h_psum);
}
开发者ID:Aestey,项目名称:Exercises-Solutions,代码行数:101,代码来源:pi_ocl.c


注:本文中的clEnqueueNDRangeKernel函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。