当前位置: 首页>>代码示例>>C++>>正文


C++ cudaMalloc函数代码示例

本文整理汇总了 C++ 中 cudaMalloc 函数的典型用法代码示例。如果您正苦于以下问题：C++ cudaMalloc 函数的具体用法？C++ cudaMalloc 怎么用？C++ cudaMalloc 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了 cudaMalloc 函数的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 C++ 代码示例。

示例1: cumem

 cumem(int newsz){
   sz = newsz;
   cudaMalloc(&data, sz);
   status = inuse;
   next = NULL;
 };
开发者ID:darlliu,项目名称:Yeast-Cuda-Gillespie-Simulator,代码行数:6,代码来源:cudahelper.hpp

示例2: ImGpu::ImGpu

/**
 * Builds a GPU-resident image from a raw pixel file.
 *
 * The image geometry is parsed from the beginning of `filename`, which must
 * start with "<width>x<height>x<bpp>x<dimension>_". Only 8 and 16 bits per
 * pixel are supported. A device buffer of width*height*dimension samples is
 * allocated, filled with 0xFF bytes, and then overwritten with as many bytes
 * as could actually be read from the file.
 *
 * On any failure (unparsable name, unsupported geometry, CUDA error) a
 * message is written to stderr, the device buffer is released and `dev_pxl`
 * is left null. A file that cannot be opened or is shorter than expected is
 * not fatal: the unread part of the device buffer keeps its 0xFF fill.
 *
 * @param filename  path of the raw pixel file; also encodes the geometry
 */
ImGpu::ImGpu(const char* filename)
{
	// Everything is declared up front so that `goto Error` never jumps over
	// an initialization.
	FILE *fp = 0;
	int t1 = 0, t2 = 0, t3 = 0, t4 = 0;
	cudaError_t cudaStatus;
	unsigned char *pxl = 0;
	size_t byteCount = 0;
	size_t bytesRead = 0;

	// Known value on every exit path; cudaFree(0) is a no-op.
	dev_pxl = 0;

	if (0 == filename || 4 != sscanf_s(filename, "%dx%dx%dx%d_", &t1, &t2, &t3, &t4)) {
		fprintf(stderr, "ImGpu: cannot parse image geometry from file name!");
		t1 = t2 = t3 = t4 = 0;
	}

	width = t1;
	height = t2;
	bpp = t3;
	dimension = t4;

	if ((8 != t3 && 16 != t3) || t1 <= 0 || t2 <= 0 || t4 <= 0) {
		fprintf(stderr, "ImGpu: unsupported image geometry!");
		goto Error;
	}

	// Size of the whole image in bytes: one byte per sample for 8 bpp,
	// two bytes per sample for 16 bpp.
	byteCount = (size_t)t1 * (size_t)t2 * (size_t)t4 * (size_t)(t3 / 8);

	// Host staging buffer first: if `new` throws, no device memory is held yet.
	pxl = new unsigned char[byteCount];

	/* Allocate memory for the pixels on the Gpu */
	cudaStatus = cudaMalloc((void**)&dev_pxl, byteCount);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		dev_pxl = 0;
		goto Error;
	}

	cudaStatus = cudaMemset(dev_pxl, 255, byteCount);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemset failed!");
		goto Error;
	}

	/*
	* Open the file to read the pixels
	*/
	if (0 != filename) {
		fopen_s(&fp, filename, "rb"); /* open for reading */
	}

	if (0 != fp){
		bytesRead = std::fread(pxl, sizeof(unsigned char), byteCount, fp);
		fclose(fp); /* close the file */
	}
	else {
		fprintf(stderr, "ImGpu: cannot open pixel file!");
	}

	// Copy only what was really read from host memory to the GPU buffer, so
	// uninitialized host bytes never reach the device.
	if (bytesRead > 0) {
		cudaStatus = cudaMemcpy(dev_pxl, pxl, bytesRead, cudaMemcpyHostToDevice);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy failed!");
			goto Error;
		}
	}

	delete[] pxl;
	return;
Error:
	delete[] pxl;
	cudaFree(dev_pxl);
	dev_pxl = 0;
}
开发者ID:mattvend,项目名称:Gpu,代码行数:61,代码来源:ImGpu.cpp

示例3: mexFunction

/* Main */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[]) {

    if (nrhs != 7) {
        mexErrMsgTxt("sgemm requires 7 input arguments");
    } else if (nlhs != 1) {
        mexErrMsgTxt("sgemm requires 1 output argument");
    }

    if ( !mxIsSingle(prhs[4]) ||
            !mxIsSingle(prhs[5]) ||
            !mxIsSingle(prhs[6]))   {
        mexErrMsgTxt("Input arrays must be single precision.");
    }

    int ta = (int) mxGetScalar(prhs[0]);
    int tb = (int) mxGetScalar(prhs[1]);
    float alpha = (float) mxGetScalar(prhs[2]);
    float beta = (float) mxGetScalar(prhs[3]);
    float *h_A = (float*) mxGetData(prhs[4]);
    float *h_B = (float*) mxGetData(prhs[5]);
    float *h_C = (float*) mxGetData(prhs[6]);

    int M = mxGetM(prhs[4]);   /* gets number of rows of A */
    int K = mxGetN(prhs[4]);   /* gets number of columns of A */
    int L = mxGetM(prhs[5]);   /* gets number of rows of B */
    int N = mxGetN(prhs[5]);   /* gets number of columns of B */

    cublasOperation_t transa, transb;
    int MM, KK, NN;
    if (ta == 0) {
        transa = CUBLAS_OP_N;
        MM=M;
        KK=K;
    } else {
        transa = CUBLAS_OP_T;
        MM=K;
        KK=M;
    }

    if (tb == 0) {
        transb = CUBLAS_OP_N;
        NN=N;
    } else {
        transb = CUBLAS_OP_T;
        NN=L;
    }

    /*	printf("transa=%c\n",transa);
    	printf("transb=%c\n",transb);
    	printf("alpha=%f\n",alpha);
    	printf("beta=%f\n",beta);	*/

    /* Left hand side matrix set up */
    mwSize dims0[2];
    dims0[0]=MM;
    dims0[1]=NN;
    plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);
    float *h_C_out = (float*) mxGetData(plhs[0]);

    cublasStatus_t status;
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! CUBLAS initialization error\n");
    }

    float* d_A = 0;
    float* d_B = 0;
    float* d_C = 0;

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void**)&d_A, M * K * sizeof(d_A[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate A)\n");
    }
    if (cudaMalloc((void**)&d_B, L * N * sizeof(d_B[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate B)\n");

    }
    if (cudaMalloc((void**)&d_C, MM * NN * sizeof(d_C[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate C)\n");
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write A)\n");

    }
    status = cublasSetVector(L * N, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write B)\n");
    }
    status = cublasSetVector(MM * NN, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write C)\n");
    }

    /* Performs operation using cublas */
//.........这里部分代码省略.........
开发者ID:research2010,项目名称:sparse_linear_model,代码行数:101,代码来源:cudaSample.cpp

示例4: main

int main( int argc, char **argv )
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	shrQAStart(argc, argv);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
           printf("No CUDA Capable Devices found, exiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", 
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11) 
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
		shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

		shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++) 
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

	//-----
	// 64 bin histogram
	//------
	{
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); 

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );
//.........这里部分代码省略.........
开发者ID:shawndb,项目名称:demoTRISH,代码行数:101,代码来源:main.cpp

示例5: allocate

	// Releases whatever this object currently holds via destroy(), then asks
	// the CUDA runtime for device storage sized for exactly one value_type and
	// stores the resulting pointer in _dptr. The cudaMalloc status is handed
	// to check_error; how check_error reacts to a failure is not visible here.
	void allocate() {
		this->destroy();
		check_error( cudaMalloc((void**)&_dptr, sizeof(value_type)) );
	}
开发者ID:MilesCranmer,项目名称:bifrost,代码行数:4,代码来源:value.hpp

示例6: sparse_matrix_t::alloc_device

void sparse_matrix_t::alloc_device() 
{
    cudaMalloc((void**)&devJc, (numCols+1) * sizeof(int));
    cudaMalloc((void**)&devIr, numNonZeroElems * sizeof(int));
    cudaMalloc((void**)&devRVals, numNonZeroElems * sizeof(float));
}
开发者ID:hksonngan,项目名称:Impatient-MRI,代码行数:6,代码来源:utils.cpp

示例7: runTestMax

bool
runTestMax( int argc, char** argv, ReduceType datatype) 
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
    
		shrLog("METHOD: MAX\n");
    shrLog("%d elements\n", size);
    shrLog("%d threads (max)\n", maxThreads);

    cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue);

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {

        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for(int i=0; i<size; i++) 
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;

        // allocate mem for the result on host side
        T* h_odata = (T*) malloc(numBlocks*sizeof(T));

        shrLog("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T* d_idata = NULL;
        T* d_odata = NULL;

        cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
        cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

        // copy data directly to device memory
        cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
        cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

        // warm-up
        maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
        
        int testIterations = 100;

        unsigned int timer = 0;
        cutilCheckError( cutCreateTimer( &timer));
        
        T gpu_result = 0;

        gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction, 
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

		double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
        shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = maxreduceCPU<T>(h_idata, size);

        double threshold = 1e-12;
        double diff = 0;
		
        if (datatype == REDUCE_INT)
        {
            shrLog("\nGPU result = %d\n", gpu_result);
            shrLog("CPU result = %d\n\n", cpu_result);
        }
        else
        {
            shrLog("\nGPU result = %f\n", gpu_result);
            shrLog("CPU result = %f\n\n", cpu_result);
//.........这里部分代码省略.........
开发者ID:szabodabo,项目名称:CUDA-MPI-Reductions,代码行数:101,代码来源:reduction.cpp

示例8: iB_FFTShift::FFTShift_2D_Float

void iB_FFTShift::FFTShift_2D_Float(int size_X, int size_Y, Sheet* xlSheet, int nLoop)
{
	LOG();

	INFO("2D FFT Shift Float - CPU " + ITS(size_X) + "x" + ITS(size_Y));

	/**********************************************************
	 * Float Case
	 **********************************************************/

	if (xlSheet)
	{
		for (int iLoop = 0; iLoop < nLoop; iLoop++)
		{
			// Headers
			xlSheet->writeStr(1, ((iLoop * 4) + 0), "I-CPU");
			xlSheet->writeStr(1, ((iLoop * 4) + 1), "O-CPU");
			xlSheet->writeStr(1, ((iLoop * 4) + 2), "I-GPU");
			xlSheet->writeStr(1, ((iLoop * 4) + 3), "O-GPU");

			// Allocation: 2D, Flat, Device
			arr_2D_float = MEM_ALLOC_2D_FLOAT(size_X, size_Y);
			arr_2D_flat_float = MEM_ALLOC_1D_FLOAT(size_X * size_Y);
			int devMem = size_X * size_Y * sizeof(float);
			cudaMalloc((void**)(&dev_arr_2D_flat_float), devMem);

			// Filling arrays: 2D, Flat
			Array::fillArray_2D_float(arr_2D_float, size_X, size_Y, 1);
			Array::fillArray_2D_flat_float(arr_2D_flat_float, size_X, size_Y, 1);

			// Printing input
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
					xlSheet->writeNum((ctr++) + 2, iLoop * 4, arr_2D_float[i][j]);

			// FFT shift operation - CPU
			arr_2D_float = FFT::FFT_Shift_2D_float(arr_2D_float, size_X, size_Y);

			// Printing CPU output
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
					xlSheet->writeNum((ctr++) + 2, ((iLoop * 4 ) + 1), arr_2D_float[i][j]);

			// Printing GPU input
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
				{
					xlSheet->writeNum(ctr + 2, ((iLoop * 4 ) + 2), arr_2D_flat_float[ctr]);
					ctr++;
				}

			// Uploading array
			cuUtils::upload_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);

			// CUDA Gridding
			dim3 cuBlock(512, 512, 1);
			dim3 cuGrid(size_X / cuBlock.x, size_Y/ cuBlock.y, 1);

			// FFT shift
			cuFFTShift_2D( cuBlock, cuGrid, dev_arr_2D_flat_float, dev_arr_2D_flat_float, size_X);

			// Downloading array
			cuUtils::download_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);

			// Printing output
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
				{
					xlSheet->writeNum((ctr) + 2, ((iLoop * 4 ) + 3), arr_2D_flat_float[ctr]);
					ctr++;
				}

			// Dellocating memory
			FREE_MEM_2D_FLOAT(arr_2D_float, size_X, size_Y);
		}

	}
	else
	{
		INFO("No valid xlSheet was created, EXITTING ...");
		EXIT(0);
	}
}
开发者ID:marwan-abdellah,项目名称:Dummy,代码行数:87,代码来源:iB_FFTShift.cpp

示例9: cudaMallocWrapper

/*
 * Allocator callback that forwards straight to cudaMalloc.
 *
 * Only devPtr and size take part in the allocation; ctx and stream are
 * accepted so the function fits the callback signature but are not used.
 * Returns whatever status cudaMalloc reports.
 */
static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
{
  cudaError_t allocStatus;

  (void)ctx;
  (void)stream;

  allocStatus = cudaMalloc(devPtr, size);
  return allocStatus;
}
开发者ID:ibcn-cloudlet,项目名称:cutorch,代码行数:4,代码来源:THCGeneral.c

示例10: gpujpeg_encoder_create

/** Documented at declaration */
struct gpujpeg_encoder*
gpujpeg_encoder_create(struct gpujpeg_parameters* param, struct gpujpeg_image_parameters* param_image)
{
    assert(param_image->comp_count == 1 || param_image->comp_count == 3);
    assert(param_image->comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);
    assert(param->quality >= 0 && param->quality <= 100);
    assert(param->restart_interval >= 0);
    assert(param->interleaved == 0 || param->interleaved == 1);

    struct gpujpeg_encoder* encoder = (struct gpujpeg_encoder*) malloc(sizeof(struct gpujpeg_encoder));
    if ( encoder == NULL )
        return NULL;

    // Get coder
    struct gpujpeg_coder* coder = &encoder->coder;

    // Set parameters
    memset(encoder, 0, sizeof(struct gpujpeg_encoder));
    coder->param_image = *param_image;
    coder->param = *param;

    int result = 1;

    // Create writer
    encoder->writer = gpujpeg_writer_create(encoder);
    if ( encoder->writer == NULL )
        result = 0;

    // Initialize coder
    if ( gpujpeg_coder_init(coder) != 0 )
        result = 0;

    // Init preprocessor
    if ( gpujpeg_preprocessor_encoder_init(&encoder->coder) != 0 ) {
        fprintf(stderr, "Failed to init preprocessor!");
        result = 0;
    }

    // Allocate quantization tables in device memory
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) )
            result = 0;
        if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table_forward, 64 * sizeof(float)) )
            result = 0;
    }
    gpujpeg_cuda_check_error("Encoder table allocation", return NULL);

    // Init quantization tables for encoder
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( gpujpeg_table_quantization_encoder_init(&encoder->table_quantization[comp_type], (enum gpujpeg_component_type)comp_type, coder->param.quality) != 0 )
            result = 0;
    }
    gpujpeg_cuda_check_error("Quantization init", return NULL);

    // Init huffman tables for encoder
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
            if ( gpujpeg_table_huffman_encoder_init(&encoder->table_huffman[comp_type][huff_type], (enum gpujpeg_component_type)comp_type, (enum gpujpeg_huffman_type)huff_type) != 0 )
                result = 0;
        }
    }
    gpujpeg_cuda_check_error("Encoder table init", return NULL);

    // Init huffman encoder
    if ( gpujpeg_huffman_gpu_encoder_init(encoder) != 0 )
        result = 0;

    if ( result == 0 ) {
        gpujpeg_encoder_destroy(encoder);
        return NULL;
    }

    // Timers
    GPUJPEG_CUSTOM_TIMER_CREATE(encoder->def);
    GPUJPEG_CUSTOM_TIMER_CREATE(encoder->in_gpu);

    return encoder;
}
开发者ID:VideoInsight,项目名称:TranscodeModules,代码行数:79,代码来源:gpujpeg_encoder.cpp

示例11: SOLVER(rk4, init)

/*
 * Creates the working storage of the rk4 solver.
 *
 * The function name is produced by the SOLVER(...) macro. `props` supplies
 * statesize and num_models, which size every buffer.
 *
 * With TARGET_GPU defined: derives a block/grid geometry from a fixed
 * shared-memory budget, stores it in props->gpu, allocates the k1..k4 and
 * temp buffers with cudaMalloc, and returns a pointer obtained from
 * cudaMalloc that holds a copy of the filled-in rk4_mem.
 *
 * Otherwise: allocates the rk4_mem and its five buffers with malloc and
 * returns the host pointer. The malloc results are not checked.
 */
rk4_mem *SOLVER(rk4, init, TARGET, SIMENGINE_STORAGE, solver_props *props) {
#if defined TARGET_GPU
  // NOTE(review): a bare GPU_ENTRY(...) statement; what the macro expands to
  // is not visible here - confirm this line is intentional.
  GPU_ENTRY(init, SIMENGINE_STORAGE);

  // Temporary CPU copies of GPU datastructures
  rk4_mem tmem;
  // GPU datastructures
  rk4_mem *dmem;

  // Computes GPU kernel geometry
  // total_shmem is fixed at 16384 bytes (1<<14) and warp_size at 32 (1<<5);
  // neither is queried from the device.
  size_t shmem_per_thread, total_shmem = 1<<14;
  int warp_size = 1<<5;
  uint threads_per_block;
  uint num_gpu_threads;
  uint num_gpu_blocks;

  // shared space for model states and solver overhead
  shmem_per_thread = sizeof(CDATAFORMAT) * props->statesize * 6; // 6 = magic for rk4
  // shared space for a vector of time
  shmem_per_thread += sizeof(CDATAFORMAT);
  // shared space for a vector of `running' flags
  shmem_per_thread += sizeof(int);

  
  // How many threads fit in the budget, rounded down to a multiple of warp_size.
  // NOTE(review): this rounds to 0 when fewer than warp_size threads fit, and
  // num_gpu_blocks below then divides by zero - confirm statesize is bounded.
  threads_per_block = total_shmem / shmem_per_thread;
  threads_per_block = warp_size * (threads_per_block / warp_size);

  // Threads per block: the smaller of threads_per_block and num_models;
  // block count: num_models divided by threads_per_block, rounded up.
  num_gpu_threads = threads_per_block < props->num_models ? threads_per_block : props->num_models;
  num_gpu_blocks = (props->num_models + threads_per_block - 1) / threads_per_block;

  props->gpu.blockx = num_gpu_threads;
  props->gpu.blocky = 1;
  props->gpu.blockz = 1;
  props->gpu.gridx = num_gpu_blocks;
  props->gpu.gridy = 1;
  props->gpu.gridz = 1;
  props->gpu.shmem_per_block = shmem_per_thread * num_gpu_threads;

  
  // Allocate GPU space for mem and pointer fields of mem (other than props)
  cutilSafeCall(cudaMalloc((void**)&dmem, sizeof(rk4_mem)));
  tmem.props = GPU_ENTRY(init_props, SIMENGINE_STORAGE, props);

  // Each buffer holds statesize * num_models values of CDATAFORMAT.
  cutilSafeCall(cudaMalloc((void**)&tmem.k1, props->statesize*props->num_models*sizeof(CDATAFORMAT)));

  cutilSafeCall(cudaMalloc((void**)&tmem.k2, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.k3, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.k4, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.temp, props->statesize*props->num_models*sizeof(CDATAFORMAT)));

  // Copy mem structure to GPU
  cutilSafeCall(cudaMemcpy(dmem, &tmem, sizeof(rk4_mem), cudaMemcpyHostToDevice));

  return dmem;
  
#else // Used for CPU and OPENMP targets

  rk4_mem *mem = (rk4_mem*)malloc(sizeof(rk4_mem));

  mem->props = props;
  // Same buffer sizes as the GPU path, taken from the host heap.
  mem->k1 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k2 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k3 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k4 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->temp = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));

  return mem;
#endif
}
开发者ID:joshuaecook,项目名称:simengine,代码行数:69,代码来源:rk4.c

示例12: WFIRFilterCuda::cudaFilter

float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
                const WLEMData::ScalarT* const previous, size_t channels, size_t samples, const WLEMData::ScalarT* const coeffs,
                size_t coeffSize )
{
    CuScalarT *dev_in = NULL;
    size_t pitchIn;

    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;

    CuScalarT *dev_out = NULL;
    size_t pitchOut;

    CuScalarT *dev_co = NULL;

    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ), samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
                                        coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof( CuScalarT );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples, dev_co, coeffSize,
                    pitchOut, pitchIn, pitchPrev );
    cudaError_t kernelError = cudaGetLastError();

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall(
                        cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut, samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    if( elapsedTime > -1.0 )
    {
        return elapsedTime;
//.........这里部分代码省略.........
开发者ID:labp,项目名称:na-online_ow-toolbox,代码行数:101,代码来源:WFIRFilterCuda.cpp

示例13: BaseData<Dtype>::BaseData

// Constructs a BaseData holding `length` elements of Dtype.
// Stores the element count in length_, then obtains two buffers of
// sizeof(Dtype)*length_ bytes each: cpu_data_ through cudaHostAlloc with
// cudaHostAllocDefault, and gpu_data_ through cudaMalloc. Both statuses are
// passed to checkCudaErrors; its failure behavior is not visible here.
// NOTE(review): a negative `length` is not rejected before the size is
// computed - confirm callers only pass non-negative values.
BaseData<Dtype>::BaseData(const int length)
{
	length_ = length;
	checkCudaErrors(cudaHostAlloc(&cpu_data_, sizeof(Dtype)*length_, cudaHostAllocDefault));
	checkCudaErrors(cudaMalloc(&gpu_data_, sizeof(Dtype)*length_));
}
开发者ID:Haybla,项目名称:Latte,代码行数:6,代码来源:BaseData.cpp

示例14: ckm

void ckm( struct svm_problem *prob, struct svm_problem *pecm, float *gamma  )
{
	cublasStatus_t status;

	double g_val = *gamma;

	long int nfa;
	
	int len_tv;
	int ntv;
	int i_v;
	int i_el;
	int i_r, i_c;
	int trvei;

	double *tv_sq;
	double *v_f_g;

	float *tr_ar;
	float *tva, *vtm, *DP;
	float *g_tva = 0, *g_vtm = 0, *g_DotProd = 0;

	cudaError_t cudaStat;   
	cublasHandle_t handle;
	
	status = cublasCreate(&handle);

	len_tv = prob-> x[0].dim;
	ntv   = prob-> l;

	nfa = len_tv * ntv; 

	tva = (float*) malloc ( len_tv * ntv* sizeof(float) );
	vtm = (float*) malloc ( len_tv * sizeof(float) );
	DP  = (float*) malloc ( ntv * sizeof(float) );

	tr_ar = (float*) malloc ( len_tv * ntv* sizeof(float) );

	tv_sq = (double*) malloc ( ntv * sizeof(double) );

	v_f_g  = (double*) malloc ( ntv * sizeof(double) );

	for ( i_r = 0; i_r < ntv ; i_r++ )
	{				 
		for ( i_c = 0; i_c < len_tv; i_c++ ) 
			tva[i_r * len_tv + i_c] = (float)prob-> x[i_r].values[i_c];
	}

	cudaStat = cudaMalloc((void**)&g_tva, len_tv * ntv * sizeof(float));
	
	if (cudaStat != cudaSuccess) {
		free( tva );
		free( vtm );
		free( DP  );

		free( v_f_g );
		free( tv_sq );

		cudaFree( g_tva );
		cublasDestroy( handle );	
	
		fprintf (stderr, "!!!! Device memory allocation error (A)\n");
		getchar();
		return;
    }

	cudaStat = cudaMalloc((void**)&g_vtm, len_tv * sizeof(float));

	cudaStat = cudaMalloc((void**)&g_DotProd, ntv * sizeof(float));

	for( i_r = 0; i_r < ntv; i_r++ )
		for( i_c = 0; i_c < len_tv; i_c++ )
			tr_ar[i_c * ntv + i_r] = tva[i_r * len_tv + i_c];

	// Copy cpu vector to gpu vector
	status = cublasSetVector( len_tv * ntv, sizeof(float), tr_ar, 1, g_tva, 1 );
    
	free( tr_ar );

	for( i_v = 0; i_v < ntv; i_v++ )
	{
		tv_sq[ i_v ] = 0;
		for( i_el = 0; i_el < len_tv; i_el++ )
			tv_sq[i_v] += pow( tva[i_v*len_tv + i_el], (float)2.0 );
	}



	for ( trvei = 0; trvei < ntv; trvei++ )
	{
		status = cublasSetVector( len_tv, sizeof(float), &tva[trvei * len_tv], 1, g_vtm, 1 );
		
		status = cublasSgemv( handle, CUBLAS_OP_N, ntv, len_tv, &alpha, g_tva, ntv , g_vtm, 1, &beta, g_DotProd, 1 );

		status = cublasGetVector( ntv, sizeof(float), g_DotProd, 1, DP, 1 );

		for ( i_c = 0; i_c < ntv; i_c++ )
			v_f_g[i_c] = exp( -g_val * (tv_sq[trvei] + tv_sq[i_c]-((double)2.0)* (double)DP[i_c] ));
		

//.........这里部分代码省略.........
开发者ID:Kufieta,项目名称:CUDA,代码行数:101,代码来源:kernel_matrix_calculation.c

示例15: runAutoTest

/**
 * Automated (non-interactive) test of the Sobel filter sample.
 *
 * Selects a CUDA device, loads the default image, runs sobelFilter in the
 * mode given by the "mode" command-line argument (0 = original image,
 * 1 = texture path, 2 = shared-memory path), copies the result back, saves
 * it as a PGM file and compares it against the reference named by the
 * "file" argument.
 *
 * A missing "file" argument is counted as a test error instead of handing a
 * NULL name to the file lookup, and a failed host allocation ends the test
 * with a failure exit code.
 *
 * This function does not return: it always ends in exit(), with
 * EXIT_SUCCESS when no error was counted and EXIT_FAILURE otherwise.
 *
 * @param argc  argument count from main
 * @param argv  argument vector from main; argv[0] is used to locate files
 */
void runAutoTest(int argc, char *argv[])
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);
    // The call is kept for what it does; the returned id was never used.
    findCudaDevice(argc, (const char **)argv);

    // Ensure that SM 2.0 or higher device is available before running
    checkDeviceMeetComputeSpec(argc, argv);

    loadDefaultImage(argv[0]);

    Pixel *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel)));

    char *ref_file = NULL;
    char  dump_file[256];

    int mode = 0;
    mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode");
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);

    switch (mode)
    {
        case 0:
            g_SobelDisplayMode = SOBELDISPLAY_IMAGE;
            sprintf(dump_file, "lena_orig.pgm");
            break;

        case 1:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX;
            sprintf(dump_file, "lena_tex.pgm");
            break;

        case 2:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED;
            sprintf(dump_file, "lena_shared.pgm");
            break;

        default:
            printf("Invalid Filter Mode File\n");
            exit(EXIT_FAILURE);
            break;
    }

    printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);
    sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp);
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel));

    if (h_result == NULL)
    {
        // Without a host buffer the readback below would write through NULL.
        printf("AutoTest: host memory allocation failed\n");
        checkCudaErrors(cudaFree(d_result));
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost));
    sdkSavePGM(dump_file, h_result, imWidth, imHeight);

    if (ref_file == NULL)
    {
        // ref_file stays NULL when no "file" argument was supplied; there is
        // nothing to compare against, so the run cannot be called a pass.
        printf("AutoTest: no reference file given (use the \"file\" argument)\n");
        g_TotalErrors++;
    }
    else if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false))
    {
        g_TotalErrors++;
    }

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    if (g_TotalErrors != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed!\n");
    exit(EXIT_SUCCESS);
}
开发者ID:Aahung,项目名称:CudaSample,代码行数:68,代码来源:FunctionPointers.cpp


注:本文中的cudaMalloc函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。