本文整理汇总了C++中cudaFree函数的典型用法代码示例。如果您正苦于以下问题:C++ cudaFree函数的具体用法?C++ cudaFree怎么用?C++ cudaFree使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了cudaFree函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: quantus_cuda_cleanup
/**
 * Releases the buffer referenced by `comm->matrix` with cudaFree().
 *
 * @param comm  communicator whose `matrix` member is freed; a null `comm`
 *              is ignored.
 *
 * The status returned by cudaFree() is not reported (the function returns
 * void), and `comm->matrix` is left unchanged, so the caller must not use or
 * free that pointer again afterwards.
 */
void quantus_cuda_cleanup(quantus_comm<T> *comm)
{
    // Nothing to release when no communicator was supplied; dereferencing a
    // null pointer here would be undefined behaviour.
    if (!comm) {
        return;
    }

    cudaFree((T *) comm->matrix);
}
示例2: CUDA_CHECK
// Destructor: hands data_ and then diff_ to cudaFree, routing each returned
// status through CUDA_CHECK. The body is empty when CPU_ONLY is defined.
GPUParams<Dtype>::~GPUParams() {
#ifndef CPU_ONLY
    // Parameter values buffer.
    CUDA_CHECK(cudaFree(data_));
    // Parameter diff buffer.
    CUDA_CHECK(cudaFree(diff_));
#endif  // CPU_ONLY
}
示例3: free
// Releases `data` with cudaFree and forwards the returned status to throw_().
// A null pointer is ignored, so no CUDA call is made for it.
static void free(void *data) {
    if (!data) {
        return;
    }
    throw_(cudaFree(data));
}
示例4: cudaFree
// Destructor: frees devicePtr with cudaFree when it is set.
// The cudaFree status is not inspected.
TxVectorOptimizationDataCU::~TxVectorOptimizationDataCU() {
    if (!devicePtr) {
        return;  // nothing was allocated
    }
    cudaFree(devicePtr);
}
示例5: cudaFree
// Frees d_resultPoints with cudaFree and then resets the member to NULL so
// the stale address is not kept around. The cudaFree status is not inspected.
void CloudConstructor::freeGPUPoints() {
    cudaFree(d_resultPoints);

    // Mark the buffer as released.
    d_resultPoints = NULL;
}
示例6: main
int
main()
{
int i;
struct timeval start, stop;
FILE *fd;
char *key;
cudaSetDevice(0);
/* Allocate memory */
if ((key = (char *)malloc(40 * sizeof(char))) == NULL) {
printf("Malloc failed!\n");
exit(EXIT_FAILURE);
}
cudaMallocHost((void **) &batchKeys,
((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
cudaMallocHost((void **) &nKeys, BATCH_SIZE * sizeof(size_t));
cudaMallocHost((void **) &batchIndex, (BATCH_SIZE + 1) * sizeof(int));
cudaMallocHost((void **) &hashedKeys, BATCH_SIZE * sizeof(uint32_t));
cudaMalloc((void **) &d_keys,
((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
cudaMalloc((void **) &d_len, BATCH_SIZE * sizeof(size_t));
cudaMalloc((void **) &d_index, (BATCH_SIZE + 1) * sizeof(int));
cudaMalloc((void **) &d_res, BATCH_SIZE * sizeof(uint32_t));
/* Create 'BATCH_SIZE' number of random keys
* and add them to batch table
*/
batchNo = 0;
batchIndex[0] = 0;
for(i = 0; i < BATCH_SIZE; i++) {
gen_random(key, 30);
add_to_batch(key, 30);
}
/* Start Time (execution + memory) */
#ifdef EXEC_MEM
gettimeofday(&start, NULL);
#endif // EXEC_MEM
/* MemCpy Host -> Device */
cudaMemcpy(d_keys, batchKeys, (batchIndex[BATCH_SIZE-1] +
strlen(&batchKeys[batchIndex[BATCH_SIZE - 1]])) * sizeof(char),
cudaMemcpyHostToDevice);
cudaMemcpy(d_len, nKeys, BATCH_SIZE * sizeof(size_t),
cudaMemcpyHostToDevice);
cudaMemcpy(d_index, batchIndex, BATCH_SIZE * sizeof(int),
cudaMemcpyHostToDevice);
/* Start Time (execution only)*/
#ifndef EXEC_MEM
gettimeofday(&start, NULL);
#endif // EXEC_MEM
/* Call the kernel */
CUDAhash(d_keys, d_index, d_len, d_res);
/* Start Time (execution only)*/
#ifndef EXEC_MEM
cudaDeviceSynchronize();
gettimeofday(&stop, NULL);
#endif // EXEC_MEM
/* MemCpy Device -> Host */
cudaMemcpy(hashedKeys, d_res, BATCH_SIZE * sizeof(uint32_t),
cudaMemcpyDeviceToHost);
/* Start Time (execution + memory) */
#ifdef EXEC_MEM
gettimeofday(&stop, NULL);
#endif // EXEC_MEM
#ifdef DEBUG
for(i = 0; i < BATCH_SIZE; i++) {
printf("%s\n", &batchKeys[batchIndex[i]]);
printf("%u\n", hashedKeys[i]);
}
#endif // DEBUG
/* Print Time */
fd = fopen("log.txt", "a+");
fprintf(fd, "%lu", ((stop.tv_sec * USECS) + stop.tv_usec ) -
((start.tv_sec * USECS) + start.tv_usec));
fprintf(fd, "\t%1.f\n", ((double)BATCH_SIZE /
((double)(((stop.tv_sec * USECS) + stop.tv_usec ) -
((start.tv_sec * USECS) + start.tv_usec)) / 1000000 )) / 1000);
fclose(fd);
#ifdef DEBUG
printf("Time: %lu \n", ((stop.tv_sec * USECS) + stop.tv_usec ) -
((start.tv_sec * USECS) + start.tv_usec));
#endif // DEBUG
/* Free memory */
cudaFree(batchKeys);
cudaFree(nKeys);
//.........这里部分代码省略.........
示例7: CUDA_SAFE_CALL
// Best-effort release of `arg_alloc_ptr` via cudaFree. The size argument is
// unused. Anything thrown from the CUDA_SAFE_CALL-wrapped call is caught and
// discarded, so this function never propagates an exception to the caller.
void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
    try {
        CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
    }
    catch( ... ) {
        // Deliberately ignored: a failed free is not reported.
    }
}
示例8: calculateOnGPU
//.........这里部分代码省略.........
//move constants variables to constant cuda memory
setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength,
queryLength, gapOpen, gapExtension, maxScore, partQuerySize,
U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT, U2::SmithWatermanAlgorithm::DIAG,
U2::SmithWatermanAlgorithm::STOP);
size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));
// start main loop
for (int i = 0; i < queryDevider; i++) {
calculateMatrix_wrap( dimBlock.x, dimGrid.x, g_seqLib,
g_queryProfile, g_HdataUp, g_HdataRec, g_HdataMax,
g_FdataUp, g_directionsUp, g_directionsRec,
g_directionsMax, i * partQuerySize, g_directionsMatrix, g_backtraceBegins);
cudaError hasErrors = cudaThreadSynchronize();
if (hasErrors != 0) {
u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors));
}
//revert arrays
g_HdataTmp = g_HdataRec;
g_HdataRec = g_HdataUp;
g_HdataUp = g_HdataTmp;
g_HdataTmp = g_directionsRec;
g_directionsRec = g_directionsUp;
g_directionsUp = g_HdataTmp;
}
//Copy vectors on host and find actual results
cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
}
QList<resType> pas;
resType res;
for (int j = 0; j < (sizeRow); j++) {
if (tempRow[j] >= maxScore) {
res.refSubseq.startPos = directionRow[j];
res.refSubseq.length = j - res.refSubseq.startPos + 1 - (j) / (partSeqSize + 1) * overlapLength - (j) / (partSeqSize + 1);
res.score = tempRow[j];
if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
qint32 pairAlignOffset = 0;
qint32 row = backtraceBegins[2 * j];
qint32 column = backtraceBegins[2 * j + 1];
while(U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
if(U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
row--;
column--;
} else if(U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
column--;
} else if(U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
row--;
}
if(0 >= row || 0 >= column) {
break;
}
}
res.patternSubseq.startPos = row;
res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
}
pas.append(res);
}
}
//deallocation memory
cudaFree(g_seqLib);
cudaFree(g_queryProfile);
cudaFree(g_HdataMax);
cudaFree(g_HdataUp);
cudaFree(g_HdataRec);
cudaFree(g_FdataUp);
cudaFree(g_directionsUp);
cudaFree(g_directionsMax);
cudaFree(g_directionsRec);
if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
cudaFree(g_directionsMatrix);
cudaFree(g_backtraceBegins);
}
delete[] tempRow;
delete[] directionRow;
delete[] zerroArr;
delete[] globalMatrix;
delete[] backtraceBegins;
return pas;
}
示例9:
// Destructor: frees _state with cudaFree (status routed through CUDA_CHECK)
// when it is non-NULL; otherwise does nothing.
~curandStateManager()
{
    if (_state == NULL) {
        return;
    }
    CUDA_CHECK(cudaFree(_state));
}
示例10: sci_gpuLU
//.........这里部分代码省略.........
default : throw "First option argument must be 0 or 1 or 2.";
}
switch((int)option[1])
{
case 0 : // Don't keep the data input on Device.
{
if(inputType_A == sci_matrix)
{
status = cublasFree(d_A);
if (status != CUBLAS_STATUS_SUCCESS) throw status;
d_A = NULL;
}
break;
}
case 1 : // Keep data of the fisrt argument on Device and return the Device pointer.
{
if(inputType_A == sci_matrix)
{
gpuMat_CUDA* dptr;
gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
dptr=new gpuMat_CUDA(tmp);
dptr->useCuda = true;
dptr->ptr->set_ptr((double*)d_A);
if(bComplex_A)
dptr->complex=TRUE;
else
dptr->complex=FALSE;
sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
if(sciErr.iErr) throw sciErr;
LhsVar(posOutput)=Rhs+posOutput;
}
else
throw "The first input argument is already a GPU variable.";
posOutput++;
break;
}
default : throw "Second option argument must be 0 or 1.";
}
// Shutdown
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) throw status;
}
#endif
#ifdef WITH_OPENCL
if (!useCuda())
{
throw "not implemented with OpenCL.";
}
#endif
if(Rhs == 1)
{
free(option);
option = NULL;
}
if(posOutput < Lhs+1)
throw "Too many output arguments.";
if(posOutput > Lhs+1)
throw "Too few output arguments.";
PutLhsVar();
return 0;
}
catch(const char* str)
{
Scierror(999,"%s\n",str);
}
catch(SciErr E)
{
printError(&E, 0);
}
#ifdef WITH_CUDA
catch(cudaError_t cudaE)
{
GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
}
catch(cublasStatus CublasE)
{
GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
}
if (useCuda())
{
if(inputType_A == 1 && d_A != NULL) cudaFree(d_A);
}
#endif
#ifdef WITH_OPENCL
if (!useCuda())
{
Scierror(999,"not implemented with OpenCL.\n");
}
#endif
if(Rhs == 1 && option != NULL) free(option);
return EXIT_FAILURE;
}
示例11: main
//.........这里部分代码省略.........
{
fprintf(stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
/* Performs operation using plain C code */
simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
h_C_ref = h_C;
/* Performs operation using cublas */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Allocate host memory for reading back the result from device memory */
h_C = (float *)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0)
{
fprintf(stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Read the result back */
status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
/* Check result against reference */
error_norm = 0;
ref_norm = 0;
for (i = 0; i < n2; ++i)
{
diff = h_C_ref[i] - h_C[i];
error_norm += diff * diff;
ref_norm += h_C_ref[i] * h_C_ref[i];
}
error_norm = (float)sqrt((double)error_norm);
ref_norm = (float)sqrt((double)ref_norm);
if (fabs(ref_norm) < 1e-7)
{
fprintf(stderr, "!!!! reference norm is 0\n");
return EXIT_FAILURE;
}
/* Memory clean up */
free(h_A);
free(h_B);
free(h_C);
free(h_C_ref);
if (cudaFree(d_A) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (A)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_B) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (B)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_C) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (C)\n");
return EXIT_FAILURE;
}
/* Shutdown */
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! shutdown error (A)\n");
return EXIT_FAILURE;
}
if (error_norm / ref_norm < 1e-6f)
{
printf("simpleCUBLAS test passed.\n");
exit(EXIT_SUCCESS);
}
else
{
printf("simpleCUBLAS test failed.\n");
exit(EXIT_FAILURE);
}
}
示例12: main
//.........这里部分代码省略.........
int sem_status = sem_wait(sem1);
if (sem_status == -1)
{
fprintf(stderr, "Cannot wait on semaphore #1 by process %d, errno = %d\n",
pid, errno);
return errno;
}
sem_status = sem_post(sem2);
if (sem_status == -1)
{
fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n",
pid, errno);
return errno;
}
}
// At this point two processes are synchronized.
config.step++;
// Reassign porcesses' input data segments to show some
// possible manipulation on shared memory.
// Here we perform cyclic shift of data pointers.
config.idevice++;
config.idevice %= ndevices + 1;
config.inout_cpu = inout + config.idevice * np;
}
// Release device buffers.
if (worker)
{
cuda_status = cudaFree(config.in_dev);
if (cuda_status != cudaSuccess)
{
fprintf(stderr, "Cannot release input buffer by process %d, status = %d\n",
pid, cuda_status);
return cuda_status;
}
cuda_status = cudaFree(config.out_dev);
if (cuda_status != cudaSuccess)
{
fprintf(stderr, "Cannot release output buffer by process %d, status = %d\n",
pid, cuda_status);
return cuda_status;
}
}
else
{
free(config.in_dev);
free(config.out_dev);
}
printf("Device %d deinitialized py process %d\n", config.idevice, pid);
// On master process perform results check:
// compare each GPU result to CPU result.
if (master)
{
float* control = inout + np * ndevices;
for (int idevice = 0; idevice < ndevices; idevice++)
{
// Find the maximum abs difference.
int maxi = 0, maxj = 0;
float maxdiff = fabs(control[0] - (inout + idevice * np)[0]);
示例13:
// Destructor: frees _devicePtr with cudaFree when it is set.
// The cudaFree status is not inspected.
OsdCudaTable::~OsdCudaTable() {
    if (!_devicePtr) {
        return;  // no buffer to release
    }
    cudaFree(_devicePtr);
}
示例14: main
// Benchmark loop driven over stdin/stdout.
//
// Each iteration allocates host and device buffers for n_indices *
// n_dimensions elements, fills the host index array, runs
// sobol_nikola_unsimplified on the device copy, and copies the output back.
// The first iteration prints "OK"; later iterations print "RESULT" followed
// by at most the first 10 output values. The next n_indices is then read from
// stdin; the line "EXIT" prints "OK" and leaves the loop.
//
// Returns -1 when host or device allocation fails.
int main(int argc, char *argv[])
{
    // needed to work correctly with piped benchmarkrunner
    setlinebuf(stdout);
    setlinebuf(stdin);

    int n_indices = 1;
    int n_dimensions = 1;
    char inBuf[200]; // ridiculously large input buffer.
    bool isFirst = true;

    do {
        // Allocate memory for the arrays
        int *h_indices = 0;
        double *h_outputGPU = 0;
        try
        {
            h_indices = new int [n_indices * n_dimensions];
            h_outputGPU = new double [n_indices * n_dimensions];
        }
        catch (const std::exception &e)  // by reference: avoids slicing, keeps what()
        {
            std::cerr << "Caught exception: " << e.what() << std::endl;
            std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
            // The first allocation may have succeeded before the second threw.
            delete[] h_indices;
            return -1;
        }

        // Start as null so cudaFree below never sees an indeterminate pointer.
        int *d_indices = 0;
        double *d_output = 0;
        try
        {
            cudaError_t cudaResult;
            cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));
            if (cudaResult != cudaSuccess)
            {
                throw std::runtime_error(cudaGetErrorString(cudaResult));
            }
        }
        catch (const std::runtime_error &e)
        {
            std::cerr << "Caught exception: " << e.what() << std::endl;
            std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
            // Release the host arrays before bailing out.
            delete[] h_indices;
            delete[] h_outputGPU;
            return -1;
        }

        // Initialize the indices (done on the host)
        for(int i = 0; i < n_indices; i++) {
            h_indices[i] = i;
        }

        // Copy the indices to the device
        cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
        cudaDeviceSynchronize();

        // Execute the QRNG on the device
        // NOTE(review): d_output is presumably allocated by the callee — confirm.
        int n_vec;
        sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);
        cudaDeviceSynchronize();
        cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);

        // Cleanup and terminate
        // Arrays created with new[] must be released with delete[].
        delete[] h_indices;
        cudaFree(d_indices);
        cudaFree(d_output);

        if(!isFirst) {
            printf("RESULT ");
            for(int i = 0; i < std::min(n_indices,10); i++)
                printf("%f ", h_outputGPU[i]);
            printf("\n");
        }
        else {
            printf("OK\n");
            isFirst = false;
        }
        delete[] h_outputGPU;

        // NOTE(review): the fgets result is not checked; on EOF inBuf keeps
        // its previous contents.
        fgets(inBuf, 200, stdin);
        // n_indices is a signed int, so the matching conversion is %d.
        if (sscanf(inBuf, "%d", &n_indices) == 0)
        {
            // if input is not a number, it has to be "EXIT"
            if (strncmp("EXIT",inBuf,4)==0)
            {
                printf("OK\n");
                break;
            }
            else
            {
                printf("ERROR. Bad input: %s\n", inBuf);
//.........这里部分代码省略.........
示例15: gpuErrchk
// Destructor: frees d_V with cudaFree and passes the returned status to
// gpuErrchk.
PhysicsProcessor::~PhysicsProcessor(void)
{
    // Release the buffer held in d_V.
    gpuErrchk(cudaFree(d_V));
}