当前位置: 首页>>代码示例>>C++>>正文


C++ clSetKernelArg函数代码示例

本文整理汇总了C++中clSetKernelArg函数的典型用法代码示例。如果您正苦于以下问题:C++ clSetKernelArg函数的具体用法?C++ clSetKernelArg怎么用?C++ clSetKernelArg使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了clSetKernelArg函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: SetKernelArguments

/*
 * Binds the values held in the global `ocl` state to arguments 0..7 of
 * ocl.kernel, in this order: Lights, LightCount, Shapes, ShapeCount,
 * sampleCount, width, height, cam.
 *
 * Returns CL_SUCCESS when every argument was accepted.  On the first
 * clSetKernelArg failure it prints which argument was rejected (together
 * with the translated OpenCL error) and returns that error code immediately.
 *
 * Note: the declared return type is cl_uint while OpenCL error codes are
 * negative cl_int values, so callers should compare the result against
 * CL_SUCCESS rather than test its sign.
 */
cl_uint SetKernelArguments()
{
    cl_int err = CL_SUCCESS;

    err = clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void *)&ocl.Lights);
    if (CL_SUCCESS != err)
    {
        printf("error: Failed to set argument Lights, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 1, sizeof(cl_uint), (void *)&ocl.LightCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument LightCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void *)&ocl.Shapes);
    if (CL_SUCCESS != err)
    {
        printf("error: Failed to set argument Shapes, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 3, sizeof(cl_uint), (void *)&ocl.ShapeCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    /* Each message below names the argument that actually failed, so a
     * rejected argument can be located from the log alone. */
    err = clSetKernelArg(ocl.kernel, 4, sizeof(cl_uint), (void *)&ocl.sampleCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument sampleCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 5, sizeof(cl_uint), (void *)&ocl.width);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument width, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 6, sizeof(cl_uint), (void *)&ocl.height);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument height, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 7, sizeof(cl_mem), (void *)&ocl.cam);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument cam, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    return err;
}
开发者ID:DennisJung,项目名称:SCWS2016,代码行数:62,代码来源:refer.cpp

示例2: main


//.........这里部分代码省略.........
                src_1_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_short16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_short16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_short16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_short16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
开发者ID:xianggong,项目名称:m2c-llvm-devtools-host,代码行数:67,代码来源:sub_sat_short16short16_src.c

示例3: clSetKernelArg

int lcs::GPUExclusiveScanForInt(int workGroupSize, int numOfBanks,
				cl_kernel scanKernel, cl_kernel reverseUpdateKernel, cl_mem d_arr, cl_int length,
				cl_command_queue commandQueue) {
	cl_int err;

	// Get the work group size
	size_t localWorkSize = workGroupSize;

	// Up-sweep and down-sweep	
	clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr);
	clSetKernelArg(scanKernel, 1, sizeof(cl_int), &length);
	clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / numOfBanks + 1), NULL);

	static int records[10];
	
	int problemSize = length;
	int numOfRecords = 0;

	cl_int d_step = 1;

	/// DEBUG ///
	printf("length = %d\n", length);

	for (; problemSize > 1; problemSize = (problemSize - 1) / (localWorkSize * 2) + 1) {
		if (numOfRecords) d_step *= localWorkSize * 2;
		records[numOfRecords++] = problemSize;
		clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);

		size_t globalWorkSize = ((problemSize - 1) / (localWorkSize * 2) + 1) * localWorkSize;

		err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize,
					     0, NULL, NULL);
		if (err) lcs::Error("Fail to enqueue scan");

		/// DEBUG ///
		err = clFinish(commandQueue);

		printf("err = %d\n", err);

		if (err) lcs::Error("Non-zero err in pre-scan");
	}

	int zero = 0, sum;
	err = clEnqueueReadBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &sum, 0, NULL, NULL);
	if (err) lcs::Error("Fail to read d_arr[0]");

	err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL);
	if (err) lcs::Error("Fail to clean d_arr[0]");

	// Reverse updates
	clSetKernelArg(reverseUpdateKernel, 0, sizeof(cl_mem), &d_arr);
	clSetKernelArg(reverseUpdateKernel, 1, sizeof(cl_int), &length);

	size_t globalWorkSize;

	for (int i = numOfRecords - 1; i >= 0; i--, d_step /= localWorkSize * 2) {
		clSetKernelArg(reverseUpdateKernel, 2, sizeof(cl_int), &d_step);
		globalWorkSize = ((records[i] - 1) / (localWorkSize * 2) + 1) * localWorkSize;
		err = clEnqueueNDRangeKernel(commandQueue, reverseUpdateKernel, 1, NULL, &globalWorkSize, &localWorkSize,
					     0, NULL, NULL);
		if (err) lcs::Error("Fail to enqueue scan");
		clFinish(commandQueue);
	}

	return sum;
}
开发者ID:linyufly,项目名称:OpenCLTracer,代码行数:66,代码来源:lcsUtility.cpp

示例4: main


//.........这里部分代码省略.........
    printf("maxWorkGroupSize = %d\n", maxWorkGroupSize);

    err = clGetKernelWorkGroupInfo(reverseUpdateKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(size_t), &maxWorkGroupSize, NULL);
    printf("maxWorkGroupSize = %d\n", maxWorkGroupSize);

    // Set work group size to 512

    int workGroupSize = 512;

    int length = 2048000;
    int *arr = new int [length];
    for (int i = 0; i < length; i++)
        arr[i] = rand() % 100;

    int *prefixSum = new int [length];
    prefixSum[0] = 0;

    int t0 = clock();

    for (int i = 1; i < length; i++)
        prefixSum[i] = prefixSum[i - 1] + arr[i - 1];

    int t1 = clock();

    printf("time1: %lf\n", (t1 - t0) * 1.0 / CLOCKS_PER_SEC);

    cl_mem d_arr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * length, NULL, &err);
    if (err) Error("Fail to create d_arr");

    err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, arr, 0, NULL, NULL);
    if (err) Error("Fail to write d_arr");

    clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr);
    cl_int d_length = length;
    clSetKernelArg(scanKernel, 1, sizeof(cl_int), &d_length);
    cl_int d_step = 1;
    clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);
    clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / 16 + 1), NULL);

    int problemSize = length;
    int records[10];
    int num = 0;

    int t2 = clock();

    for (; problemSize > 1; problemSize = (problemSize - 1) / (workGroupSize * 2) + 1) {

        if (num) d_step *= workGroupSize * 2;

        printf("d_step = %d\n", d_step);

        records[num++] = problemSize;

        printf("problemSize = %d\n", problemSize);

        clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);

        size_t globalWorkSize = ((problemSize - 1) / (workGroupSize * 2) + 1) * workGroupSize;
        size_t localWorkSize = workGroupSize;

        err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize,
                                     0, NULL, NULL);
        if (err) Error("Fail to enqueue scan");
        clFinish(commandQueue);
    }
开发者ID:linyufly,项目名称:FlowVC,代码行数:67,代码来源:ExclusiveScan.cpp

示例5: sum_gpu

/*
 * Copies n long long values from `in` to the first GPU device, runs the
 * "matrix_mult" kernel built from KERNEL_SRC over them, and copies the first
 * element of the device buffer back into *out.
 *
 * in   host array of n values uploaded to the device
 * out  receives a single long long read from offset 0 of the device buffer
 *      (presumably the reduced sum -- confirm against the kernel source)
 * n    number of elements in `in`
 *
 * The kernel's execution time, taken from the event profiling counters, is
 * printed to stderr.  Every OpenCL object created here is released before
 * returning.
 *
 * NOTE(review): clErrorHandling is assumed to inspect the local `err` after
 * each call -- confirm against its definition.
 */
void sum_gpu(long long *in, long long *out, unsigned int n)
{
	size_t global_size;
	size_t local_size;

	char *kernel_src;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	cl_uint max_compute_units;
	size_t max_workgroup_size;

	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;
	cl_mem d_array;

	cl_event event;
	/* Zeroed so a failed profiling query cannot print indeterminate values. */
	cl_ulong start = 0, end = 0;

	/* start OpenCL */
	err = clGetPlatformIDs(1, &platform_id,NULL);
	clErrorHandling("clGetPlatformIDs");

	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	clErrorHandling("clGetDeviceIDs");

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	clErrorHandling("clCreateContext");

	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	clErrorHandling("clCreateCommandQueue");

	/* create kernel */
	kernel_src = file_to_string(KERNEL_SRC);
	program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err);
	/* The source text is no longer needed once the program object exists. */
	free(kernel_src);
	clErrorHandling("clCreateProgramWithSource");

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	clErrorHandling("clBuildProgram");

	kernel = clCreateKernel(program, "matrix_mult", &err);
	clErrorHandling("clCreateKernel");

	/* allocate memory and send to gpu */
	d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err);
	clErrorHandling("clCreateBuffer");

	err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL);
	clErrorHandling("clEnqueueWriteBuffer");

	err  = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
	err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL);
	clErrorHandling("clGetDeviceInfo");

	/* prepare kernel args */
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array);
	err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n);
	clErrorHandling("clSetKernelArg");

	/* execute */
	local_size = n / max_compute_units / 8;
	if (local_size > max_workgroup_size)
		local_size = max_workgroup_size;
	/*
	 * For n < 8 * max_compute_units the division above yields 0, which is
	 * not a valid work-group size and would make the round-up below
	 * meaningless; fall back to one work-item per group.
	 */
	if (local_size == 0)
		local_size = 1;

	/*
	 *	Usually it would be
	 *	global_size = local_size * max_compute_units;
	 *	but that would only be valid if local_size = n / max_compute_units;
	 *	local_size is n / max_compute_units / 8 because that gave the highest performance.
	 *
	 *	global_size is therefore n rounded up to the next multiple of
	 *	local_size, and never less than one full work-group.
	 */
	global_size = (n / local_size) * local_size;
	if (global_size < n)
		global_size += local_size;
	if (global_size == 0)
		global_size = local_size;

	err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event);
	clErrorHandling("clEnqueueNDRangeKernel");

	clWaitForEvents(1, &event);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
	/* Profiling counters are in nanoseconds. */
	fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0);

	err = clFinish(commands);
	clErrorHandling("clFinish");

	/* transfer back */
	err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long
	clErrorHandling("clEnqueueReadBuffer");

	/* cleanup */
	clReleaseEvent(event);
	clReleaseMemObject(d_array);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
}
开发者ID:Exceltior,项目名称:dei-hpc,代码行数:98,代码来源:sum.c

示例6: InitOpenCL

void InitOpenCL()
{
    // 1. Get a platform.
    cl_platform_id platform;
    
    clGetPlatformIDs( 1, &platform, NULL );
    // 2. Find a gpu device.
    cl_device_id device;
    
    clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU,
                   1,
                   &device,
                   NULL);
    // 3. Create a context and command queue on that device.
    cl_context context = clCreateContext( NULL,
                                         1,
                                         &device,
                                         NULL, NULL, NULL);
    queue = clCreateCommandQueue( context,
                                 device,
                                 0, NULL );
    // 4. Perform runtime source compilation, and obtain kernel entry point.
    std::ifstream file("scene.cl");
    std::string source;
    if (file){
    while(!file.eof()){
        char line[256];
        file.getline(line,255);
        source += std::string(line) + "\n";
    }
    }
    if (source.length()==0)
    {
        std::string err = "fail to load shader";
    }
    
    cl_ulong maxSize;
    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), &maxSize, 0);
    
    const char* str = source.c_str();
    cl_program program = clCreateProgramWithSource( context,
                                                   1,
                                                   &str,
                                                   NULL, NULL );
    cl_int result = clBuildProgram( program, 1, &device, NULL, NULL, NULL );
    if ( result ){
        char* build_log;
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        build_log = new char[log_size+1];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
        build_log[log_size] = '\0';
        if( log_size > 2 ) {
            std::cout << "build log: " << build_log << std::endl;
        }
        delete[] build_log;
        std::cout << "Error during compilation! (" << result << ")" << std::endl;
    }
    kernel = clCreateKernel( program, "tracekernel", NULL );
    // 5. Create a data buffer.
    buffer        = clCreateBuffer( context,
                                   CL_MEM_WRITE_ONLY,
                                   kWidth * kHeight *sizeof(cl_float4),
                                   NULL, 0 );
    viewTransform = clCreateBuffer( context,
                                   CL_MEM_READ_WRITE,
                                   16 *sizeof(cl_float),
                                   NULL, 0 );
    
    worldTransforms = clCreateBuffer( context,
                                     CL_MEM_READ_WRITE,
                                     16 *sizeof(cl_float)*2,
                                     NULL, 0 );
    
    clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 3, sizeof(viewTransform), (void*) &viewTransform);
    clSetKernelArg(kernel, 4, sizeof(worldTransforms), (void*) &worldTransforms);
}
开发者ID:autosquid,项目名称:CLRay,代码行数:80,代码来源:raytracer.cpp

示例7: simpleExample

int simpleExample()
{
    
    /* Create device and determine local size */
    device = create_device();
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);	
    if(err < 0) {
        perror("Couldn't obtain device information");
        exit(1);   
    }

    /* Create a context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }
    
    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);
    
    
    /* Create data buffer */
    data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
    sum_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float), NULL, &err);
    if(err < 0) {
        perror("Couldn't create a buffer");
        exit(1);   
    };

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if(err < 0) {
        perror("Couldn't create a command queue");
        exit(1);   
    };
    
    /* Create kernels */
    vector_kernel = clCreateKernel(program, KERNEL_1, &err);
    complete_kernel = clCreateKernel(program, KERNEL_2, &err);
    if(err < 0) {
        perror("Couldn't create a kernel");
        exit(1);
    };

    /* Set arguments for vector kernel */
    err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);
    err |= clSetKernelArg(vector_kernel, 1, local_size * 4 * sizeof(float), NULL);

    /* Set arguments for complete kernel */
    err = clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);
    err |= clSetKernelArg(complete_kernel, 1, local_size * 4 * sizeof(float), NULL);
    err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);
    if(err < 0) {
        perror("Couldn't create a kernel argument");
        exit(1);   
    }
    
    
    /* Enqueue kernels */
    global_size = ARRAY_SIZE/4;
    err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, &start_event);
    if(err < 0) {
        perror("Couldn't enqueue the kernel");
        exit(1);   
    }
    printf("Global size = %lu\n", global_size);

    /* Perform successive stages of the reduction */
    while(global_size/local_size > local_size) {
        global_size = global_size/local_size;
        err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
        printf("Global size = %lu\n", global_size);
        if(err < 0) {
            perror("Couldn't enqueue the kernel");
            exit(1);   
        }
    }
    global_size = global_size/local_size;
    err = clEnqueueNDRangeKernel(queue, complete_kernel, 1, NULL, &global_size, NULL, 0, NULL, &end_event);
    printf("Global size = %lu\n", global_size);
    
    
    /* Finish processing the queue and get profiling information */
    clFinish(queue);
    clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    clGetEventProfilingInfo(end_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    total_time = time_end - time_start;
 
    /* Read the result */
    err = clEnqueueReadBuffer(queue, sum_buffer, CL_TRUE, 0, sizeof(float), &sum, 0, NULL, NULL);
    if(err < 0) {
        perror("Couldn't read the buffer");
        exit(1);   
    }    
    
    
    /* Check result */
    actual_sum = 1.0f * (ARRAY_SIZE/2)*(ARRAY_SIZE-1);
    if(fabs(sum - actual_sum) > 0.01*fabs(sum))
//.........这里部分代码省略.........
开发者ID:abednego1979,项目名称:tipster,代码行数:101,代码来源:device_check.c

示例8: ops_par_loop_PdV_kernel_nopredict


//.........这里部分代码省略.........
           args[15].dat->size[0] * 1 * args[15].dat->size[1] * 1 *
               (start[2] * args[15].stencil->stride[2] - args[15].dat->base[2] -
                d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[16].dat->d_m[d] + OPS_sub_dat_list[args[16].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[16].dat->d_m[d];
#endif
  int base16 = 1 * 1 * (start[0] * args[16].stencil->stride[0] -
                        args[16].dat->base[0] - d_m[0]);
  base16 = base16 +
           args[16].dat->size[0] * 1 * (start[1] * args[16].stencil->stride[1] -
                                        args[16].dat->base[1] - d_m[1]);
  base16 = base16 +
           args[16].dat->size[0] * 1 * args[16].dat->size[1] * 1 *
               (start[2] * args[16].stencil->stride[2] - args[16].dat->base[2] -
                d_m[2]);

  ops_H_D_exchanges_device(args, 17);
  ops_halo_exchanges(args, 17, range);
  ops_H_D_exchanges_device(args, 17);

  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[103].mpi_time += t2 - t1;
  }

  if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) {

    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 0, sizeof(cl_mem),
                              (void *)&arg0.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 1, sizeof(cl_mem),
                              (void *)&arg1.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 2, sizeof(cl_mem),
                              (void *)&arg2.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 3, sizeof(cl_mem),
                              (void *)&arg3.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 4, sizeof(cl_mem),
                              (void *)&arg4.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 5, sizeof(cl_mem),
                              (void *)&arg5.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 6, sizeof(cl_mem),
                              (void *)&arg6.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 7, sizeof(cl_mem),
                              (void *)&arg7.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 8, sizeof(cl_mem),
                              (void *)&arg8.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 9, sizeof(cl_mem),
                              (void *)&arg9.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 10, sizeof(cl_mem),
                              (void *)&arg10.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 11, sizeof(cl_mem),
                              (void *)&arg11.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 12, sizeof(cl_mem),
                              (void *)&arg12.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 13, sizeof(cl_mem),
                              (void *)&arg13.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 14, sizeof(cl_mem),
                              (void *)&arg14.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 15, sizeof(cl_mem),
                              (void *)&arg15.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 16, sizeof(cl_mem),
开发者ID:gihanmudalige,项目名称:OPS,代码行数:67,代码来源:PdV_kernel_nopredict_opencl_kernel.cpp

示例9: main

int main(int argc, char* argv[])
{



		const size_t SIZE_execution_bit = (input_length - 3*filter_length +1);
		const size_t SIZE_input_bit = sizeof(gint32)*(input_length+1);
		const size_t SIZE_settings_bit = sizeof(gint32)*4;

		size_t output_bit_on_counts;
		size_t* SIZE_execution_pointer = &SIZE_execution_bit;

		gint32* filtersettings = (gint32*) malloc(SIZE_settings_bit);
		gint32* input_vector = (gint32*) malloc(SIZE_input_bit);
		gint32* positions = (gint32*) malloc(SIZE_input_bit);

		filtersettings[0] = filter_length;
		filtersettings[1] = threshhold;
		filtersettings[2] = input_length;
		filtersettings[3] = 0;



		//GPU-Init
		ocl = ocl_new(CL_DEVICE_TYPE_GPU,1);
		context = ocl_get_context(ocl);
		queue = ocl_get_cmd_queues (ocl)[0];
		clFinish(queue);

		program = ocl_create_program_from_file(ocl, "edel_kernel_secondder.cl", NULL, &errcode);
		OCL_CHECK_ERROR(errcode);

		filter1 = clCreateKernel(program, "second_filter", &errcode);
		OCL_CHECK_ERROR(errcode);

		//GPU-Buffer which can be done before the Computation
		settings = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, SIZE_settings_bit, filtersettings, &errcode);
		OCL_CHECK_ERROR(errcode);

		input = clCreateBuffer(context, CL_MEM_READ_ONLY, SIZE_input_bit, NULL, &errcode);
		OCL_CHECK_ERROR(errcode);


		if(debugmode != 0)
		{
			srand((unsigned) time( NULL ));
			counter = rand_rects(expected,1,input_length,3*filter_length,3*filter_length,3*filter_length,peak_length,base+peak, input_vector, noise, base, 0,positions);
			if(harddebug != 0)
			{
				for(i = 0; i < input_length;i++)
				{
					if(input_length < 10000)
					{
						printf("input_vector[%i] = %d\n",i,input_vector[i]);
					}
					else
					{
						printf("input_vector[%i] = %d\t",i,input_vector[i]);
					}
				}
			}

			printf("\n counts = %d\n", counter);
			printf("%lu Bits needed for Output-Vector \n", output_bit_on_counts);

		}

		output_bit_on_counts = sizeof(gint32) * safetyfactor * 2*((counter + 2));

		clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, SIZE_input_bit, input_vector, 0, NULL, NULL);

		gint32* energy_time = (gint32*)malloc(output_bit_on_counts);


		for(i = 0; i < safetyfactor * (2*counter+2); i++)
		{
			energy_time[i] = -9999;
		}


		output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, output_bit_on_counts, NULL , &errcode);
		OCL_CHECK_ERROR(errcode);


		OCL_CHECK_ERROR(clSetKernelArg(filter1, 0, sizeof(cl_mem), &input));
		OCL_CHECK_ERROR(clSetKernelArg(filter1, 1, sizeof(cl_mem), &output));
		OCL_CHECK_ERROR(clSetKernelArg(filter1, 2, sizeof(cl_mem), &settings));

		size_t local_item_size;
		size_t global_item_size = (size_t) (input_length - 3*filter_length +1);

		local_item_size = ocl_get_local_size(global_item_size, 2,1);

		             
                if(debugmode != 0)
                {
                        printf("local item size = %lu \n %lu", &local_item_size, local_item_size);
                        if(local_item_size != 0)
                        {
                              printf("This works because you divide %lu / %lu \n and this is %lu", global_item_size,local_item_size, global_item_size/local_item_size);
//.........这里部分代码省略.........
开发者ID:kevinkit,项目名称:EDELWEISS_ION,代码行数:101,代码来源:edel_host_second.c

示例10: exec_trig_kernel

int 
exec_trig_kernel(const char *program_source, 
                 int n, void *srcA, void *dst) 
{ 
  cl_context  context; 
  cl_command_queue cmd_queue; 
  cl_device_id  *devices; 
  cl_program  program; 
  cl_kernel  kernel; 
  cl_mem       memobjs[2]; 
  size_t       global_work_size[1]; 
  size_t       local_work_size[1]; 
  size_t       cb; 
  cl_int       err; 

  float c = 7.3f; // a scalar number to test non-pointer args
 
  // create the OpenCL context on a GPU device 
  context = poclu_create_any_context();
  if (context == (cl_context)0) 
    return -1; 
 
  // get the list of GPU devices associated with context 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
  devices = (cl_device_id *) malloc(cb); 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
 
  // create a command-queue 
  cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (cmd_queue == (cl_command_queue)0) 
    { 
      clReleaseContext(context); 
      free(devices); 
      return -1; 
    } 
  free(devices); 
 
  // allocate the buffer memory objects 
  memobjs[0] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcA, NULL); 
  if (memobjs[0] == (cl_mem)0) 
    { 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  memobjs[1] = clCreateBuffer(context, 
			      CL_MEM_READ_WRITE, 
			      sizeof(cl_float4) * n, NULL, NULL); 
  if (memobjs[1] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 1); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the program 
  program = clCreateProgramWithSource(context, 
				      1, (const char**)&program_source, NULL, NULL); 
  if (program == (cl_program)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // build the program 
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the kernel 
  kernel = clCreateKernel(program, "trig", NULL); 
  if (kernel == (cl_kernel)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set the args values 
  err = clSetKernelArg(kernel,  0,  
		       sizeof(cl_mem), (void *) &memobjs[0]); 
  err |= clSetKernelArg(kernel, 1,
			sizeof(cl_mem), (void *) &memobjs[1]); 
  err |= clSetKernelArg(kernel, 2,
			sizeof(float), (void *) &c); 
 
//.........这里部分代码省略.........
开发者ID:Drako,项目名称:pocl,代码行数:101,代码来源:trig_exec.c

示例11: main

int main(int argc, char *argv[])
{
  struct ocl_ds     *o_ds;
  cl_int err;
  cl_event evt;
  cl_mem            o_in;
  cl_int4           *o_out;
  struct ocl_kernel *o_k;
  int len = LEN;
  int i;
  
  size_t workGroupSize[2], localz[2];

  localz[1] = 16;
  localz[0] = 16;
  workGroupSize[0] = 1024*1024;
  workGroupSize[1] = 1;

  o_ds = create_ocl_ds(KERNELDIR KERNELS_FILE);
  if (o_ds == NULL){
    return 1;
  }

  o_in = create_ocl_mem(o_ds, sizeof(cl_int4)*len);
  if (o_in == NULL){
    goto free_mem_in;
  }
  
  o_out = malloc(sizeof(cl_int4) * len);
  if (o_out == NULL){
    goto free_mem_out;
  }

  //bzero(o_out, len*sizeof(cl_int4));

  o_k = create_ocl_kernel(o_ds, "ocl_layout");
  if (o_k == NULL){
    goto free_kernel;
  }

#if 0
  err  = clSetKernelArg(o_k->k_kernel, 0, sizeof(cl_mem), (void *) &o_in);
  err |= clSetKernelArg(o_k->k_kernel, 1, sizeof(int), (void *) &len);
  if (err != CL_SUCCESS){
#ifdef DEBUG
    fprintf(stderr, "clSetKernelArg return %s\n", oclErrorString(err));
#endif
    goto clean_up;
  }

  err = clEnqueueNDRangeKernel(o_ds->d_cq, o_k->k_kernel, 2, NULL, workGroupSize, localz, 0, NULL, &evt);
  //err = clEnqueueNDRangeKernel(o_ds->d_cq, o_k->k_kernel, 3, NULL, workGroupSize, NULL, 0, NULL, &evt);
  if (err != CL_SUCCESS){
#ifdef DEBUG
    fprintf(stderr, "clEnqueueNDRangeKernel: %s\n", oclErrorString(err));
#endif
    goto clean_up;
  }

  clReleaseEvent(evt);
  clFinish(o_ds->d_cq);
#endif

#if 0
  fprintf(stderr, "%s: pointers s:[%ld] p:<%p>\n", __func__, sizeof(cl_mem), &o_in);
  fprintf(stderr, "%s: pointers s:[%ld] p:<%p>\n", __func__, sizeof(int), &len);
#endif

#if 0
  //if (run_1d_ocl_kernel(o_ds, o_k, workGroupSize, ((void*)(&(o_in))), (sizeof(o_in)), ((void*)(&(len))), (sizeof(len)), NULL) < 0){
  if (run_1d_ocl_kernel(o_ds, o_k, workGroupSize, OCL_PARAM(o_in), OCL_PARAM(len), NULL) < 0){
#ifdef DEBUG
    fprintf(stderr, "%s: error in run kernel\n", __func__);
#endif
    goto clean_up;
  }
#endif

  //for (i=1024; i<len; i+=1024){
    
    //if (xfer_from_ocl_mem(o_ds, o_in, sizeof(cl_int4) * i, o_out) < 0){
    if (xfer_from_ocl_mem(o_ds, o_in, sizeof(cl_int4) * len, o_out) < 0){
#ifdef DEBUG
      fprintf(stderr, "%s: xfer from ocl error\n", __func__);
#endif
      goto clean_up;
    }

 // }
  
#if 0
#if 1
#ifdef DEBUG
  for (i=0; i<len; i++){
    fprintf(stderr, "%d %d %d %d\n", o_out[i].w, o_out[i].x, o_out[i].y, o_out[i].z);
  }
#endif
#else
    fwrite(o_out, len, sizeof(cl_int4), stdout);
#endif
//.........这里部分代码省略.........
开发者ID:adambarta,项目名称:spead,代码行数:101,代码来源:device.c

示例12: magmablas_zlacpy

/*
    Purpose
    =======
    ZLACPY copies a two-dimensional matrix dA to another matrix dB by
    launching the "zlacpy_kernel" OpenCL kernel on `queue`.

    Note
    ====
    - Only the full-matrix copy is implemented.  For uplo == MagmaUpper or
      uplo == MagmaLower a message is written to stderr and nothing is copied.
    - Errors are reported by printing; the function returns void, so callers
      get no status back.

    Arguments
    =========
    uplo       (input) Part of the matrix to copy.  MagmaUpper / MagmaLower
               are not implemented; any other value copies the whole matrix.

    m          (input) Number of rows of dA.  m >= 0.
    n          (input) Number of columns of dA.  n >= 0.
               A negative m or n is reported on stderr and nothing is
               launched; m == 0 or n == 0 is a silent quick return.

    dA         (input) Device buffer holding the m-by-n source matrix.
    dA_offset  (input) Offset into dA, handed to the kernel as a cl_int.
               NOTE(review): the unit (elements vs. bytes) is defined by the
               kernel source, presumably elements; confirm against it.
    lda        (input) Leading dimension of dA.  lda >= max(1,m).

    dB         (output) Device buffer receiving the copy.
    dB_offset  (input) Offset into dB, handed to the kernel as a cl_int.
    ldb        (input) Leading dimension of dB.  ldb >= max(1,m).

    queue      (input) Command queue the kernel is enqueued on.  The call
               does not wait for the kernel to finish.
    =====================================================================   */
extern "C" void 
magmablas_zlacpy( magma_uplo_t uplo, magma_int_t m, magma_int_t n,
                  magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t lda,
                  magmaDoubleComplex_ptr dB, size_t dB_offset, magma_int_t ldb,
                  magma_queue_t queue)
{
    // One work-item per matrix row, in work-groups of 64.
    const size_t block = 64;

    // A negative dimension used to flow into the work-size computation and
    // could wrap to an enormous size_t; reject it before touching OpenCL.
    if ( m < 0 || n < 0 ) {
        fprintf(stderr, "lacpy: invalid dimensions m=%d, n=%d\n", (int)m, (int)n);
        return;
    }

    if ( m == 0 || n == 0 )
        return;

    if ( uplo == MagmaUpper ) {
        fprintf(stderr, "lacpy upper is not implemented\n");
        return;
    }
    if ( uplo == MagmaLower ) {
        fprintf(stderr, "lacpy lower is not implemented\n");
        return;
    }

    cl_kernel ckKernel = rt->KernelPool["zlacpy_kernel"];
    if (!ckKernel) {
        printf ("Error: cannot locate kernel in line %d, file %s\n", __LINE__, __FILE__);
        return;
    }

    // The arguments are set with sizeof(cl_int), so the values must live in
    // real cl_int objects.  Passing the address of a magma_int_t is only
    // correct when magma_int_t happens to be a 32-bit int (or on a
    // little-endian host); copying first removes that dependency.
    cl_int arg_m    = (cl_int)m;
    cl_int arg_n    = (cl_int)n;
    cl_int arg_lda  = (cl_int)lda;
    cl_int arg_ldb  = (cl_int)ldb;
    cl_int offset_A = (cl_int)dA_offset;
    cl_int offset_B = (cl_int)dB_offset;

    // Kernel arguments in positional order.
    const struct { size_t size; const void *value; } args[] = {
        { sizeof(cl_int), &arg_m    },
        { sizeof(cl_int), &arg_n    },
        { sizeof(cl_mem), &dA       },
        { sizeof(cl_int), &offset_A },
        { sizeof(cl_int), &arg_lda  },
        { sizeof(cl_mem), &dB       },
        { sizeof(cl_int), &offset_B },
        { sizeof(cl_int), &arg_ldb  },
    };
    const unsigned int nargs = (unsigned int)(sizeof(args) / sizeof(args[0]));

    // Check every call on its own: OR-ing OpenCL error codes together yields
    // a value that no longer identifies the failure.
    cl_int ciErrNum;
    for (unsigned int idx = 0; idx < nargs; ++idx) {
        ciErrNum = clSetKernelArg( ckKernel, idx, args[idx].size, args[idx].value );
        if (ciErrNum != CL_SUCCESS) {
            printf("Error: clSetKernelArg at %d in file %s, %s\n", __LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
            return;
        }
    }

    // Round the row count up to a whole number of work-groups.
    size_t LocalWorkSize[1]  = { block };
    size_t GlobalWorkSize[1] = { (((size_t)m + block - 1) / block) * block };

    // launch kernel
    ciErrNum = clEnqueueNDRangeKernel(
        queue, ckKernel, 1, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: clEnqueueNDRangeKernel at %d in file %s \"%s\"\n",
            __LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
        return;
    }
}
开发者ID:mauro-belgiovine,项目名称:belgiovi-clmagma,代码行数:97,代码来源:zlacpy.cpp

示例13: dimension

/**
    Purpose
    -------
    SLACPY_Q copies all or part of a two-dimensional matrix dA to another
    matrix dB.
    
    This is the same as SLACPY, but adds queue argument.
    
    Arguments
    ---------
    
    @param[in]
    uplo    magma_uplo_t
            Specifies the part of the matrix dA to be copied to dB.
      -     = MagmaUpper:      Upper triangular part
      -     = MagmaLower:      Lower triangular part
            Otherwise:  All of the matrix dA
    
    @param[in]
    m       INTEGER
            The number of rows of the matrix dA.  M >= 0.
    
    @param[in]
    n       INTEGER
            The number of columns of the matrix dA.  N >= 0.
    
    @param[in]
    dA      REAL array, dimension (LDDA,N)
            The m by n matrix dA.
            If UPLO = MagmaUpper, only the upper triangle or trapezoid is accessed;
            if UPLO = MagmaLower, only the lower triangle or trapezoid is accessed.
    
    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
    
    @param[out]
    dB      REAL array, dimension (LDDB,N)
            The m by n matrix dB.
            On exit, dB = dA in the locations specified by UPLO.
    
    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB.  LDDB >= max(1,M).
    
    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    @ingroup magma_saux2
    ********************************************************************/
extern "C" void
magmablas_slacpy(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magmaFloat_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloat_ptr       dB, size_t dB_offset, magma_int_t lddb,
    magma_queue_t queue )
{
    cl_kernel kernel;
    cl_int err;
    int i;

    magma_int_t info = 0;
    if ( m < 0 )
        info = -2;
    else if ( n < 0 )
        info = -3;
    else if ( ldda < max(1,m))
        info = -5;
    else if ( lddb < max(1,m))
        info = -7;
    
    if ( info != 0 ) {
        magma_xerbla( __func__, -(info) );
        return;
    }
    
    if ( m == 0 || n == 0 )
        return;
    
    size_t threads[2] = { BLK_X, 1 };
    size_t grid[2] = { (m + BLK_X - 1)/BLK_X, (n + BLK_Y - 1)/BLK_Y };
    grid[0] *= threads[0];
    grid[1] *= threads[1];
    
    if ( uplo == MagmaLower ) {
        kernel = g_runtime.get_kernel( "slacpy_kernel_lower" );
        if ( kernel != NULL ) {
            err = 0;
            i   = 0;
            err |= clSetKernelArg( kernel, i++, sizeof(m        ), &m         );
            err |= clSetKernelArg( kernel, i++, sizeof(n        ), &n         );
            err |= clSetKernelArg( kernel, i++, sizeof(dA       ), &dA        );
            err |= clSetKernelArg( kernel, i++, sizeof(dA_offset), &dA_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(ldda     ), &ldda      );
            err |= clSetKernelArg( kernel, i++, sizeof(dB       ), &dB        );
            err |= clSetKernelArg( kernel, i++, sizeof(dB_offset), &dB_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(lddb     ), &lddb      );
            check_error( err );

//.........这里部分代码省略.........
开发者ID:kjbartel,项目名称:clmagma,代码行数:101,代码来源:slacpy.cpp

示例14: main


//.........这里部分代码省略.........
	command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret);
	if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); }

//OpenCL arrays
    cl_mem cl_u = NULL,cl_v = NULL;
   	cl_mem cl_uhat = NULL, cl_vhat = NULL;
    cl_mem cl_kx = NULL, cl_ky = NULL;

//FFT
	clfftPlanHandle planHandle;
    cl_mem tmpBuffer = NULL;
	fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny);

//allocate gpu memory/
	cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret);
	cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret);

	printf("allocated space\n");
//load the kernels
	loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies");
	loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); 
	loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); 
	loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); 
	loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); 

	size_t global_work_size[1] = {Nx*Ny};
	size_t global_work_size_X[1] = {Nx};
	size_t global_work_size_Y[1] = {Ny};
//frequencies
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//printCL(&cl_kx,&command_queue,Nx,1);
//printCL(&cl_ky,&command_queue,1,Ny);
//inintial data
    ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly);
    ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//make output
    writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
    umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	printf("Got initial data, starting timestepping\n");
	mtime_s(&tvs);

	for(n=0;n<=Tmax;n++){
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
开发者ID:MichaelQuell,项目名称:PSNM,代码行数:67,代码来源:main_gs.c

示例15: error_norm

//---------------------------------------------------------------------
// error_norm computes the norm of the difference between the computed
// solution and the exact solution.
//
// rms: output, 5 components.  It is cleared first, then receives the sum
//      of the per-work-group partial results produced by the "error_norm"
//      kernel, divided by (grid_points[d]-2) for d = 0,1,2 and finally
//      passed through sqrt().
//
// The kernel writes 5 doubles per work-group into a temporary device
// buffer; that buffer is read back with a blocking read and reduced on
// the host.  The device buffer, the kernel object and the host copy are
// all created and released inside this function.
//---------------------------------------------------------------------
void error_norm(double rms[5])
{
  int comp, dim;
  unsigned int a;
  size_t wg;
  cl_int ecode;

  int nx = grid_points[0];
  int ny = grid_points[1];
  int nz = grid_points[2];

  for (comp = 0; comp < 5; comp++) {
    rms[comp] = 0.0;
  }

  // work-group size: nz split over the compute units, at least 1
  size_t per_unit = nz / max_compute_units;
  size_t local_ws = per_unit;
  if (per_unit == 0) {
    local_ws = 1;
  }
  size_t global_ws = clu_RoundWorkSize((size_t)nz, local_ws);
  size_t wg_num = global_ws / local_ws;

  // one row of 5 partial sums per work-group
  size_t buf_size = sizeof(double) * 5 * wg_num;
  cl_mem m_rms = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size,
                                NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer()");

  cl_kernel k_error_norm = clCreateKernel(p_error, "error_norm", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");

  // kernel arguments in positional order; argument 3 is local scratch
  // space (size only, no host pointer)
  const struct { size_t size; const void *value; } args[] = {
    { sizeof(cl_mem),                &m_u   },
    { sizeof(cl_mem),                &m_ce  },
    { sizeof(cl_mem),                &m_rms },
    { sizeof(double)*5*local_ws,     NULL   },
    { sizeof(int),                   &nx    },
    { sizeof(int),                   &ny    },
    { sizeof(int),                   &nz    },
  };

  // every argument is set; the status codes are OR-ed and checked once
  ecode = clSetKernelArg(k_error_norm, 0, args[0].size, args[0].value);
  for (a = 1; a < sizeof(args) / sizeof(args[0]); a++) {
    ecode |= clSetKernelArg(k_error_norm, a, args[a].size, args[a].value);
  }
  clu_CheckError(ecode, "clSetKernelArg()");

  ecode = clEnqueueNDRangeKernel(cmd_queue, k_error_norm,
                                 1, NULL, &global_ws, &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  // host copy of the per-work-group partial sums
  double (*partial)[5] = malloc(buf_size);

  // blocking read (CL_TRUE): the data is valid when the call returns
  ecode = clEnqueueReadBuffer(cmd_queue, m_rms, CL_TRUE,
                              0, buf_size, partial,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clReadBuffer()");

  // reduction over work-groups, in work-group order
  for (wg = 0; wg < wg_num; wg++) {
    const double *row = partial[wg];
    for (comp = 0; comp < 5; comp++) {
      rms[comp] += row[comp];
    }
  }

  // normalise by the three grid extents (minus 2 each), then take the root
  for (comp = 0; comp < 5; comp++) {
    for (dim = 0; dim < 3; dim++) {
      rms[comp] /= (double)(grid_points[dim]-2);
    }
    rms[comp] = sqrt(rms[comp]);
  }

  free(partial);
  clReleaseMemObject(m_rms);
  clReleaseKernel(k_error_norm);
}
开发者ID:ashwinma,项目名称:multicl,代码行数:82,代码来源:error.c


注:本文中的clSetKernelArg函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。