当前位置: 首页>>代码示例>>C++>>正文


C++ clCreateBuffer函数代码示例

本文整理汇总了C++中clCreateBuffer函数的典型用法代码示例。如果您正苦于以下问题:C++ clCreateBuffer函数的具体用法?C++ clCreateBuffer怎么用?C++ clCreateBuffer使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了clCreateBuffer函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: help


//.........这里部分代码省略.........
		std::cerr << "Retrieving OpenCL platforms" << std::endl;
		error = clGetPlatformIDs(4, platforms, &platformsNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL platforms");
		}

		if(platformId >= platformsNumber) {
			throw std::runtime_error("No platform found with the provided id");
		}

		std::cerr << "Retrieving OpenCL GPU devices" << std::endl;
		error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL devices");
		}

		if(deviceId >= devicesNumber) {
			throw std::runtime_error("No device found with the provided id");
		}

		std::cerr << "Creating OpenCL context" << std::endl;
		context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL context");
		}

		std::cerr << "Creating OpenCL command queue" << std::endl;
		commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL command queue");
		}

		std::cerr << "Creating OpenCL GPU generation buffer" << std::endl;
		bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer");
		}

		std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl;
		bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer");
		}

		std::cerr << "Creating OpenCL program" << std::endl;
		std::string source = loadSource("kernel/nonce.cl");
		const char* sources[] = {source.c_str()};
		size_t sourcesLength[] = {source.length()};
		program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL program");
		}

		std::cerr << "Building OpenCL program" << std::endl;
		error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0);
		if(error != CL_SUCCESS) {
			size_t logSize;
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);

			char* log = new char[logSize];
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0);
			std::cerr << log << std::endl;
			delete[] log;

			throw OpenclError(error, "Unable to build the OpenCL program");
		}
开发者ID:coventry,项目名称:BurstGPUPlotter,代码行数:67,代码来源:CommandGenerate.cpp

示例2: bpnn_train_kernel


//.........这里部分代码省略.........
	
	float *input_weights_one_dim;
    float *input_weights_prev_one_dim;
	float * partial_sum;
	float sum;
	float num_blocks = in / BLOCK_SIZE;
	
	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
	
	// set global and local workitems
	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
	
	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
	// todo: fix mem allocation
	int m = 0;
	for (int k = 0; k <= in; k++) {	
		for (int j = 0; j <= hid; j++) {
		input_weights_one_dim[m] = net->input_weights[k][j];
		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
	    m++;
		}
	}
	
	cl_mem input_hidden_ocl;
	cl_mem input_ocl;
	cl_mem output_hidden_ocl;
	cl_mem hidden_partial_sum;
	cl_mem hidden_delta_ocl;
	cl_mem input_prev_weights_ocl;
  
	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
		
	printf("Performing GPU computation\n");
	
	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
 
	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
        if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
	if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
开发者ID:shvo,项目名称:Rodinia-FPGA,代码行数:67,代码来源:backprop_ocl_fpga.cpp

示例3: execute

void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) {
	cl_context context;
	cl_command_queue commandQueue;
	cl_program program;
	cl_kernel kernel;
	
	size_t dataBytes, kernelLength;
	cl_int errorCode;
	
	cl_mem gridBuffer;
	
	cl_device_id* devices;
	cl_device_id gpu;
	
	cl_uint numPlatforms;

	errorCode = clGetPlatformIDs(0, NULL, &numPlatforms);
	cl_platform_id platforms[numPlatforms];
	errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL);
	
	checkError(errorCode);
	
	cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0};

	context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode);
	checkError(errorCode);
	
	errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes);
	devices = malloc(dataBytes);
	errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL);
	
	gpu = devices[0];
	
	commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode);
	checkError(errorCode);
	
	gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode);
	checkError(errorCode);
	
	const char* programBuffer = readFile("kernel.cl");
	kernelLength = strlen(programBuffer);
	program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode);
	checkError(errorCode);
	
	errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (errorCode == CL_BUILD_PROGRAM_FAILURE) {
		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		
		// Allocate memory for the log
		char *log = (char *) malloc(log_size);
		
		// Get the log
		clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
		
		// Print the log
		free(log);
		printf("%s\n", log);
	}
	checkError(errorCode);
	
	kernel = clCreateKernel(program, "diffuse", &errorCode);
	checkError(errorCode);

	size_t localWorkSize[2] = {workGroupSize, workGroupSize}, globalWorkSize[2] = {width, width};

	errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer);
	errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL);
	errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width);
	errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize);
	errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations);
	checkError(errorCode);
	
	errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	checkError(errorCode);
	
	errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL);
	checkError(errorCode);



	free(devices);
	free((void *)programBuffer);
	clReleaseContext(context);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commandQueue);

	
}
开发者ID:scrblnrd3,项目名称:GPGPU-Diffusion,代码行数:91,代码来源:main.c

示例4: initGPU

int initGPU(int n)
{
	#pragma mark Device Information
	// Find the CPU CL device, as a fallback
	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
	assert(err == CL_SUCCESS);

	// Find the GPU CL device, this is what we really want
	// If there is no GPU device is CL capable, fall back to CPU
	err |= clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
	if (err != CL_SUCCESS) device = cpu;
	assert(device);

	// Get some information about the returned device
	cl_char vendor_name[1024] = {0};
	cl_char device_name[1024] = {0};
	err |= clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
	err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
	assert(err == CL_SUCCESS);
	printf("Connecting to %s %s...", vendor_name, device_name);

	#pragma mark Context and Command Queue
	// Now create a context to perform our calculation with the 
	// specified device 
	context = clCreateContext(0, 1, &device, NULL, NULL, &err);
	assert(err == CL_SUCCESS);

	// And also a command queue for the context
	cmd_queue = clCreateCommandQueue(context, device, 0, NULL);

	#pragma mark Program and Kernel Creation
	// Load the program source from disk
	// The kernel/program is the project directory and in Xcode the executable
	// is set to launch from that directory hence we use a relative path
	const char * filename = "kernel.cl";
	char *program_source = load_program_source(filename);
	program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err);
	assert(err == CL_SUCCESS);

	err |= clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
	assert(err == CL_SUCCESS);

	// Now create the kernel "objects" that we want to use in the example file 
	kernel[0] = clCreateKernel(program[0], "add", &err);
	assert(err == CL_SUCCESS);

	#pragma mark Memory Allocation
	// Allocate memory on the device to hold our data and store the results into
	buffer_size = sizeof(int) * n;

	mem_c_position = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	assert(err == CL_SUCCESS);

	mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err);
	assert(err == CL_SUCCESS);

	// Get all of the stuff written and allocated
	clFinish(cmd_queue);

	printf(" done\n");

	return err; // CL_SUCCESS
}
开发者ID:Mikulas,项目名称:PoleBalanceGPU,代码行数:66,代码来源:main.c

示例5: magma_cgeqrf2_2q_gpu


//.........这里部分代码省略.........
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return MAGMA_SUCCESS;

    nb = magma_get_cgeqrf_nb(m);

    lwork  = (m+n) * nb;
    lhwork = lwork - (m)*nb;


    if ( MAGMA_SUCCESS != magma_cmalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /*
       if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, lwork ) ) {
     *info = MAGMA_ERR_HOST_ALLOC;
     magma_free( dwork );
     return *info;
     }
     */

    cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaFloatComplex)*lwork, NULL, NULL);
    work = (magmaFloatComplex*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL);

    nbmin = 2;
    nx    = 2*nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queues[0], NULL);
            clFlush(queues[0]);  
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                        m-old_i, n-old_i-2*old_ib, old_ib,
                        dA(old_i, old_i         ), ldda, dwork,0,      lddwork,
                        dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]);

                magma_csetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork,
                        dA(old_i, old_i), ldda, queues[1], NULL); 
                clFlush(queues[1]);
            }

            magma_queue_sync(queues[0]);
            lapackf77_cgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);


            /* Form the triangular factor of the block reflector
开发者ID:EmergentOrder,项目名称:clmagma,代码行数:67,代码来源:cgeqrf2_2q_gpu.cpp

示例6: clGetPlatformIDs


//.........这里部分代码省略.........


    maxWorkItemSizes = (size_t*)malloc(maxDimensions*sizeof(size_t));
    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_ITEM_SIZES,
                             sizeof(size_t)*maxDimensions,
                             (void *)maxWorkItemSizes,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
        return SDK_FAILURE;




    {
        /* The block is to move the declaration of prop closer to its use */
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   0,
                                   "clCreateCommandQueue failed."))
            return SDK_FAILURE;
    }

    /* Input buffer */
    inputBuffer = clCreateBuffer(context, 
                                 CL_MEM_READ_ONLY,
                                 sizeof(cl_float4) * length,
                                 0, 
                                 &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateBuffer failed. (inputBuffer)"))
        return SDK_FAILURE;

    /* Write data to buffer */
    status = clEnqueueWriteBuffer(commandQueue,
                                  inputBuffer,
                                  1,
                                  0,
                                  sizeof(cl_float4) * length,
                                  input,
                                  0,
                                  0,
                                  0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clEnqueueWriteBuffer failed. (inputBuffer)"))
        return SDK_FAILURE;


    outputBuffer = clCreateBuffer(context, 
                                  CL_MEM_WRITE_ONLY,
                                  sizeof(cl_float4) * length,
                                  0, 
                                  &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
开发者ID:pbains,项目名称:m2s-bench-amdapp-2.5,代码行数:67,代码来源:MemoryOptimizations.cpp

示例7: main


//.........这里部分代码省略.........
        return 2;
    }

    INTG iTypicalWorkgroupNo = TheGPAK->TheMaxWorkGroupSizes[0];
    
    INTG iExpOutputSize = ioutsize(iGlobalSize, iTypicalWorkgroupNo);
    FLPT * fExpDotProdResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    FLPT * fExpReduceResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    fdotprodexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpDotProdResult);
    freduceexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpReduceResult);
 //   printvector("dot prod", iExpOutputSize, fExpDotProdResult);
 //   printvector("reduce", iExpOutputSize, fExpReduceResult);

    FLPT* inputDataF = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFIncrease(iGlobalSize, inputDataF);
    
    // For the dot product.
    
    FLPT* outputDataD = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataD);

    // For the reduction.
    
    FLPT* outputDataR = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataR);
    
	struct timespec start[iNoKernels];
	struct timespec end[iNoKernels];
    
// create buffers for the input and ouput

    int err; 
    cl_mem inputF, outputF, outputAll;
    inputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_READ_ONLY, iGlobalSize * sizeof(FLPT), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for F");
        return 3;
    }
    
    
    outputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 7");
        return 9;
    }
    outputAll = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 8");
        return 9;
    }

    clEnqueueWriteBuffer(TheGCAQ->TheQueue, inputF, CL_TRUE, 0, iGlobalSize * sizeof(FLPT), inputDataF, 0, NULL, NULL);

    int iRep;
    int iKernel;
    int i;
    int iLengthTotal = iGlobalSize;
    size_t iGlobalWorkThing = iGlobalSize;
    int iSomething = 1;
    for (iKernel = 0; iKernel < iNoKernels; iKernel++)
    {    
		for (i = 0; i < iLengthTotal; i++)
		{
开发者ID:peterkmurphy,项目名称:thesis-control,代码行数:67,代码来源:dotproducttest.c

示例8: clCreateBuffer

    // Create the data array in device memory for our calculation
    //
    cl_mem device_$arg_ref = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof($arg_ref[0]) * grid_size, NULL, NULL);
    if (!device_$arg_ref)
    {
        printf("Error: Failed to allocate device memory!\n");
        return err;
    }

    // Write our data set into the data array in device memory
    //
    err = clEnqueueWriteBuffer(commands, device_$arg_ref, CL_TRUE, 0, sizeof($arg_ref[0]) * grid_size, $arg_ref, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return err;
    }

    // Set the arguments to our compute kernel
    //
    err  = clSetKernelArg(kernel, $arg_index, sizeof(cl_mem), &device_$arg_ref);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        return err;
    }
开发者ID:lowks,项目名称:stencil_code,代码行数:26,代码来源:OclLoadGrid.tmpl.c

示例9: test_csrmv


//.........这里部分代码省略.........
        }

        if (typeid(T) == typeid(cl_double) )
        {
            status = clsparseDcsrmv(&gAlpha, &CSRE::csrDMatrix, &gX,
                                    &gBeta, &gY, CLSE::control);

            ASSERT_EQ(clsparseSuccess, status);

            double* vals = (double*)&CSRE::ublasDCsr.value_data()[0];
            int* rows = &CSRE::ublasDCsr.index1_data()[0];
            int* cols = &CSRE::ublasDCsr.index2_data()[0];
            for (int row = 0; row < CSRE::n_rows; row++)
            {
                // Summation done using a compensated summation to decrease
                // summation errors from rounding. This allows us to get
                // smaller errors without requiring quad precision support.
                // This method is like performing summation at quad precision and
                // casting down to double in the end.
                hY[row] *= hBeta;
                int row_end = rows[row+1];
                double temp_sum;
                temp_sum = hY[row];
                T sumk_err = 0.;
                for (int i = rows[row]; i < rows[row+1]; i++)
                {
                    // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
                    temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err);
                }
                hY[row] = temp_sum + sumk_err;
            }

            T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
                                                       CL_TRUE, CL_MAP_READ,
                                                       0, gY.num_values * sizeof(T),
                                                       0, nullptr, nullptr, &cl_status);
            ASSERT_EQ(CL_SUCCESS, cl_status);

            uint64_t max_ulps = 0;
            uint64_t min_ulps = ULLONG_MAX;
            uint64_t total_ulps = 0;
            for (int i = 0; i < hY.size(); i++)
            {
                long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
                intDiff = llabs(intDiff);
                total_ulps += intDiff;
                if (max_ulps < intDiff)
                    max_ulps = intDiff;
                if (min_ulps > intDiff)
                    min_ulps = intDiff;
                // Debug printouts.
                //std::cout << "Row " << i << " Double Ulps: " << intDiff << std::endl;
                //std::cout.precision(17);
                //std::cout << "\tDouble hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint64_t *)&hY[i] << "), " << std::dec;
                //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint64_t *)&host_result[i] << ")" << std::dec << std::endl;
            }
            if (extended_precision)
            {
#ifndef NDEBUG
                std::cout << "Double Min ulps: " << min_ulps << std::endl;
                std::cout << "Double Max ulps: " << max_ulps << std::endl;
                std::cout << "Double Total ulps: " << total_ulps << std::endl;
                std::cout << "Double Average ulps: " << (double)total_ulps/(double)hY.size() <<  " (Size: " << hY.size() << ")" << std::endl;
#endif

                for (int i = 0; i < hY.size(); i++)
                {
                    double compare_val = fabs(hY[i]*1e-14);
                    if (compare_val < 10*DBL_EPSILON)
                        compare_val = 10*DBL_EPSILON;
                    ASSERT_NEAR(hY[i], host_result[i], compare_val);
                }
            }
            else
            {
                for (int i = 0; i < hY.size(); i++)
                {
                    double compare_val = 0.;
                    if (boost::math::isnormal(hY[i]))
                        compare_val = fabs(hY[i]*0.1);
                    if (compare_val < 10*DBL_EPSILON)
                        compare_val = 10*DBL_EPSILON;
                    ASSERT_NEAR(hY[i], host_result[i], compare_val);
                }
            }

            cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
                                                  host_result, 0, nullptr, nullptr);
            ASSERT_EQ(CL_SUCCESS, cl_status);
        }
        // Reset output buffer for next test.
        ::clReleaseMemObject(gY.values);
        clsparseInitVector(&gY);
        gY.values = clCreateBuffer(CLSE::context,
                CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
                hY.size() * sizeof(T), hY.data().begin(),
                &cl_status);
        gY.num_values = hY.size();
        ASSERT_EQ(CL_SUCCESS, cl_status);
    }
开发者ID:kvaragan,项目名称:clSPARSE,代码行数:101,代码来源:test-blas2.cpp

示例10: vector_sum

inline void vector_sum(const int  arraySize, 
                       const double* inputA, 
                       const double* inputB, 
                             double* output)
{ 
    /* Allocate memory buffers */
    /*
    * Ask the OpenCL implementation to allocate buffers for the data.
    * We ask the OpenCL implemenation to allocate memory rather than 
    * allocating it on the CPU to avoid having to copy the data later.
    * The read/write flags relate to accesses to the memory from within 
    * the kernel.
    */

    bool createMemoryObjectSuccess = true;
    int numberOfMemoryObjects = 3;
    cl_mem memoryObjects[3] = {0, 0, 0};
    int errorNumber = 0;

    int bufferSize = arraySize*sizeof(double);

    memoryObjects[0] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, (void*)inputA, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 1.");
    
    memoryObjects[1] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, (void*)inputB, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 2.");
    
    memoryObjects[2] = clCreateBuffer(context, 
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, output, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 3.");

    /* Enqueue commands and kernels */
    /* Enqueue to the command queues the commands that control the sequence 
     * and synchronization of kernel execution, reading and writing of data,
     * and manipulation of memory objects
     */

    /* Execute a kernel function */
    /* Call clSetKernelArg() for each parameter in the kernel */
    bool setKernelArgumentsSuccess = true;
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, 
                                        sizeof(cl_mem), &memoryObjects[0]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, 
                                        sizeof(cl_mem), &memoryObjects[1]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, 
                                        sizeof(cl_mem), &memoryObjects[2]));
    if (not setKernelArgumentsSuccess) {
        cleanUpOpenCL();
        std::cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ 
                  << ":"<< __LINE__ << std::endl;
        exit(1);
    }

    /* Determine the work-group size and index space for the kernel */
    const size_t globalWorkSize[1] = {arraySize};
    const size_t localWorkSize[1] = { 1 };

    /* Enqueue the kernel for execution in the command queue */
    //for (int j = 0; j < ITER; j++) {
        if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, 
                NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) {
            
            cleanUpOpenCL();
            std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":" 
                      << __LINE__ <<std::endl;
            exit(1);
        }
    //}

    /* Get a pointer to the output data */
    output = (double*)clEnqueueMapBuffer(commandQueue, 
                    memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, 
                    arraySize, 0, NULL, NULL, &errorNumber);

    if (not checkSuccess(errorNumber)) {

        cleanUpOpenCL();
        std::cerr << "Failed to map buffer " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1); 
    }

    /* Wait for kernel execution */
    if (not checkSuccess(clFinish(commandQueue))) {

        cleanUpOpenCL();
        std::cerr << "Failed waiting for kernel execution to finish. "
                  << __FILE__ << ":"<< __LINE__ << std::endl;
        exit(1);
    }


    /* Unmap the memory objects as we finished using them in the CPU */
    if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) {

//.........这里部分代码省略.........
开发者ID:bagustris,项目名称:LowPowerSupercomputer,代码行数:101,代码来源:es2.cpp

示例11: main

/*
 * Terminates the program with a message on stderr when an OpenCL call
 * returned a non-zero status. OpenCL defines CL_SUCCESS as 0, so any other
 * value is an error code.
 */
static void check_status(int status, const char* what) {
  if (status != 0) {
    fprintf(stderr, "%s failed with OpenCL status %d\n", what, status);
    exit(EXIT_FAILURE);
  }
}

/*
 * Terminates the program with a message on stderr when a call that is
 * supposed to hand back an object returned NULL instead.
 */
static void check_created(const void* handle, const char* what) {
  if (handle == NULL) {
    fprintf(stderr, "%s failed\n", what);
    exit(EXIT_FAILURE);
  }
}

/*
 * Multiplies two arrays of NUM_ELEMENTS floats on an OpenCL GPU device.
 *
 * Steps: pick the first platform and its first GPU device, build the
 * "multiply_arrays" kernel from multiply_arrays.cl, copy the two input
 * arrays into device buffers, launch one work-item per element, read the
 * output buffer back with a blocking read, release every OpenCL object and
 * print one "a * b = result" line per element.
 *
 * Returns 0 on success. Any failing OpenCL step prints a diagnostic to
 * stderr and exits with EXIT_FAILURE instead of continuing with an
 * uninitialized or NULL handle.
 */
int main() {
// START:context
  cl_platform_id platform;
  check_status(clGetPlatformIDs(1, &platform, NULL), "clGetPlatformIDs");

  cl_device_id device;
  /* Reports an error status when the platform exposes no GPU device. */
  check_status(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL),
    "clGetDeviceIDs");

  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  check_created(context, "clCreateContext");
// END:context

// START:queue
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  check_created(queue, "clCreateCommandQueue");
// END:queue

// START:kernel
  char* source = read_source("multiply_arrays.cl");
  /* NOTE(review): assumes read_source signals failure with NULL - confirm. */
  check_created(source, "read_source(\"multiply_arrays.cl\")");
  cl_program program = clCreateProgramWithSource(context, 1,
    (const char**)&source, NULL, NULL);
  /* The source text is no longer used by this function after this call. */
  free(source);
  check_created(program, "clCreateProgramWithSource");
  check_status(clBuildProgram(program, 0, NULL, NULL, NULL, NULL),
    "clBuildProgram");
  cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL);
  check_created(kernel, "clCreateKernel");
// END:kernel

// START:buffers
  cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
  random_fill(a, NUM_ELEMENTS);
  random_fill(b, NUM_ELEMENTS);
  /* CL_MEM_COPY_HOST_PTR copies the host arrays into the new buffers. */
  cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
  check_created(inputA, "clCreateBuffer(inputA)");
  cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
  check_created(inputB, "clCreateBuffer(inputB)");
  cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
    sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
  check_created(output, "clCreateBuffer(output)");
// END:buffers

// START:execute
  check_status(clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA),
    "clSetKernelArg(0)");
  check_status(clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB),
    "clSetKernelArg(1)");
  check_status(clSetKernelArg(kernel, 2, sizeof(cl_mem), &output),
    "clSetKernelArg(2)");

  /* One-dimensional range with one work-item per array element. */
  size_t work_units = NUM_ELEMENTS;
  check_status(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
    NULL, 0, NULL, NULL), "clEnqueueNDRangeKernel");
// END:execute

// START:results
  cl_float results[NUM_ELEMENTS];
  /* CL_TRUE makes this a blocking read: results is filled on return. */
  check_status(clEnqueueReadBuffer(queue, output, CL_TRUE, 0,
    sizeof(cl_float) * NUM_ELEMENTS, results, 0, NULL, NULL),
    "clEnqueueReadBuffer");
// END:results

// START:cleanup
  clReleaseMemObject(inputA);
  clReleaseMemObject(inputB);
  clReleaseMemObject(output);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
// END:cleanup

  for (int i = 0; i < NUM_ELEMENTS; ++i) {
    printf("%f * %f = %f\n", a[i], b[i], results[i]);
  }

  return 0;
}
开发者ID:kaizenoh,项目名称:book-source,代码行数:67,代码来源:multiply_arrays.c

示例12: init_kernel_platform

/*
 * Initializes the file-level OpenCL state used by the "process_iq" kernel.
 *
 * Picks the first reported platform, prefers its GPU devices and falls back
 * to CPU devices when no GPU is reported, then fills the file-level handles
 * context, commandQueue, kernel and inputBuffer_i/q/o. The kernel source is
 * read from /data/mu1_kernel.cl and the three buffers are bound as kernel
 * arguments 0, 1 and 2.
 *
 * Returns 0 on success and -1 on any failure (including a failed program
 * build, which previously returned 0). The temporary device list is freed on
 * every path once it has been allocated.
 *
 * NOTE(review): handles already stored in the file-level variables are left
 * untouched on failure, and the local program object is never released here;
 * confirm the teardown code accounts for both.
 */
int32_t init_kernel_platform() {

	cl_uint plat_num = 0;
	cl_platform_id plat_id = NULL;
	cl_uint dev_num = 0;
	cl_device_id *devices = NULL;
	cl_program program = NULL;
	int32_t result = -1;
	char filename[] = "/data/mu1_kernel.cl";
	char file_context[10*1024]={0};
	const char *source = &file_context[0];
	size_t source_len = 0;

	ret = clGetPlatformIDs(0, NULL, &plat_num);
	if (ret < 0) {
		LOGD("MU1 Error: Getting plat_ids!\n");
		goto done;
	}

	/* With zero platforms plat_id stays NULL, exactly as before. */
	if(plat_num > 0)
	{
		cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id));
		if (plat_ids == NULL) {
			LOGD("MU1 Error: Out of memory for plat_ids!\n");
			goto done;
		}
		ret = clGetPlatformIDs(plat_num, plat_ids, NULL);
		if (ret >= 0) {
			plat_id = plat_ids[0];
		}
		free(plat_ids);
		if (ret < 0) {
			LOGD("MU1 Error: Getting plat_ids!\n");
			goto done;
		}
	}

	/* dev_num starts at 0, so a failed GPU query also selects the CPU path. */
	ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num);
	if (dev_num == 0) {
		LOGD("MU1: No GPU device available.\n");
		LOGD("MU1: Choose CPU as default device.\n");
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num);
		if (ret < 0 || dev_num == 0) {
			/* Without this check devices[0] below would be read from an empty list. */
			LOGD("MU1 Error: No OpenCL device available.\n");
			goto done;
		}
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		if (devices == NULL) {
			LOGD("MU1 Error: Out of memory for devices!\n");
			goto done;
		}
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL);
	} else {
		LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num);
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		if (devices == NULL) {
			LOGD("MU1 Error: Out of memory for devices!\n");
			goto done;
		}
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL);
	}
	if (ret < 0) {
		LOGD("MU1 Error: Getting device ids!\n");
		goto done;
	}

	/* Only the first device of the list is used from here on. */
	context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);
	if (context == NULL) {
		LOGD("MU1 Error: clCreateContext error\n");
		goto done;
	}

	commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
	if (commandQueue == NULL) {
		LOGD("MU1 Error: clCreateCommandQueue error\n");
		goto done;
	}

	/* NOTE(review): read_cl's return convention and its handling of files
	 * larger than the 10 KiB buffer are not visible here - confirm. */
	ret = read_cl(filename, &file_context[0]);

	/* The buffer is zero-initialized, so an empty string means nothing was read. */
	source_len = strlen(source);
	if (source_len == 0) {
		LOGD("MU1 Error: kernel source is empty\n");
		goto done;
	}

	program = clCreateProgramWithSource(context, 1, &source, &source_len, NULL);
	if (program == NULL) {
		LOGD("MU1 Error: clCreateProgramWithSource error\n");
		goto done;
	}

	ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
	if(ret < 0) {
		LOGD("MU1 Error: clBuildProgram error\n");
		goto done;
	}

	kernel = clCreateKernel(program, "process_iq", NULL);
	if (kernel == NULL) {
		LOGD("MU1 Error: clCreateKernel error\n");
		goto done;
	}

	/* Each buffer is created from, and initialized with, its host table. */
	inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			512*1024*4, (void *)(&table_i[0][0]), NULL);
	inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			512*1024*4, (void *)(&table_q[0][0]), NULL);
	inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
			512*1024*4, (void *)(&table_o[0][0]), NULL);
	if (inputBuffer_i == NULL || inputBuffer_q == NULL || inputBuffer_o == NULL) {
		LOGD("MU1 Error: clCreateBuffer error\n");
		goto done;
	}

	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i);
	if (ret < 0) {
		LOGD("MU1 Error: clSetKernelArg 0 error\n");
		goto done;
	}
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q);
	if (ret < 0) {
		LOGD("MU1 Error: clSetKernelArg 1 error\n");
		goto done;
	}
	ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o);
	if (ret < 0) {
		LOGD("MU1 Error: clSetKernelArg 2 error\n");
		goto done;
	}

	LOGD("MU1: init cl plat success");
	result = 0;

done:
	/* free(NULL) is a no-op, so this is safe before the list is allocated. */
	free(devices);
	return result;
}
开发者ID:chengyake,项目名称:karch,代码行数:74,代码来源:opencl_mu1.c

示例13: main

////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context       cxGPUContext;   //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command que
    cl_mem      d_Input, d_Output;   //OpenCL memory buffer objects

    cl_int ciErrNum;

    float *h_Input, *h_OutputCPU, *h_OutputGPU;

    const uint
        imageW = 2048,
        imageH = 2048,
        stride = 2048;

    const int dir = DCT_FORWARD;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclDCT8x8.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Input     = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
        srand(2009);
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++)
                h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get a GPU device
        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL DCT 8x8...\n");
        initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride *  sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
        //Just a single iteration or a warmup iteration
        DCT8x8(
            cqCommandQueue,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

#define GPU_PROFILING 1
#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++)
        DCT8x8(
            NULL,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
//.........这里部分代码省略.........
开发者ID:NatTuck,项目名称:cakemark,代码行数:101,代码来源:main.cpp

示例14: main


//.........这里部分代码省略.........
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err));

	cl_command_queue queue;
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err));

	cl_kernel kernel = 0;
  cl_mem memObjects[2] = {0,0};


  // Create OpenCL program - first attempt to load cached binary.
  //  If that is not available, then create the program from source
  //  and store the binary for future use.
  std::cout << "Attempting to create program from binary..." << std::endl;
  cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin");
  if (program == NULL)
  {
      std::cout << "Binary not loaded, create from source..." << std::endl;
      program = CreateProgram(context, devices[1], "kernel.cl");
      if (program == NULL)
      {
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }

      std::cout << "Save program binary for future run..." << std::endl;
      if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false)
      {
          std::cerr << "Failed to write program binary" << std::endl;
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }
  }
  else
  {
      std::cout << "Read program from binary." << std::endl;
  }

  printf("attempting to create input buffer\n");
  fflush(stdout);
	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));

  printf("attempting to create output buffer\n");
  fflush(stdout);
	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));

  memObjects[0] = input_buffer;
  memObjects[1] = output_buffer;

  double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;;

  printf("attempting to create kernel\n");
  fflush(stdout);
	kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err));
  printf("setting up kernel args cl_mem:%lx \n",input_buffer);
  fflush(stdout);
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

  printf("attempting to enqueue write buffer\n");
  fflush(stdout);
	for (int i=0; i<NUM_DATA; i++) {
    double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;;
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
  printf("attempting to enqueue kernel\n");
  fflush(stdout);
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
  printf("Enqueue'd kerenel\n");
  fflush(stdout);
    cl_ulong time_start, time_end;
  CL_CHECK(clWaitForEvents(1, &kernel_completion));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL));
  double elapsed = time_end - time_start;
  printf("time(ns):%lg\n",elapsed);
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		double data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL));
		//printf(" %lg", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(memObjects[0]));
	CL_CHECK(clReleaseMemObject(memObjects[1]));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
开发者ID:JamesLinus,项目名称:opencl-kernels,代码行数:101,代码来源:daxpy.c

示例15: setup_buffer


//.........这里部分代码省略.........
                }
                else if (lda < K)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
            else
            {
                buffer_.a_num_vectors_ = K;
                if (transA_option == 1)
                {
                    buffer_.trans_a_ = clblasTrans;
                }
                else if (transA_option == 2)
                {
                    buffer_.trans_a_ = clblasConjTrans;
                }
                if (lda == 0)
                {
                    buffer_.lda_ = N;
                }
                else if (lda < N)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
        }
        else
        {
            order_ = clblasColumnMajor;
            if (transA_option == 0)
            {
                buffer_.a_num_vectors_ = K;
                buffer_.trans_a_ = clblasNoTrans;
                if (lda == 0)
                {
                    buffer_.lda_ = N;
                }
                else if (lda < N)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
            else
            {
                buffer_.a_num_vectors_ = N;
                if (transA_option == 1)
                {
                    buffer_.trans_a_ = clblasTrans;
                }
                else if (transA_option == 2)
                {
                    buffer_.trans_a_ = clblasConjTrans;
                }

                if (lda == 0)
                {
                    buffer_.lda_ = K;
                }
                else if (lda < K)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
        }

        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];

        cl_int err;
        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
                                        NULL, &err);

        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                            buffer_.offC_) * sizeof(T),
                                        NULL, &err);
    }
开发者ID:ChenFengAndy,项目名称:clBLAS,代码行数:101,代码来源:clfunc_xsyrk.hpp


注:本文中的clCreateBuffer函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。