本文整理匯總了C++中CUDA_CHECK函數的典型用法代碼示例。如果您正苦於以下問題:C++ CUDA_CHECK函數的具體用法?C++ CUDA_CHECK怎麽用?C++ CUDA_CHECK使用的例子?那麽, 這裏精選的函數代碼示例或許可以為您提供幫助。
在下文中一共展示了CUDA_CHECK函數的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的C++代碼示例。
示例1: StopInternalThread
void InternalThread::StartInternalThread() {
// TODO switch to failing once Caffe prefetch thread is persistent.
// Threads should not be started and stopped repeatedly.
// CHECK(!is_started());
StopInternalThread();
#ifndef CPU_ONLY
CUDA_CHECK(cudaGetDevice(&device_));
#endif
mode_ = Caffe::mode();
rand_seed_ = caffe_rng_rand();
solver_count_ = Caffe::solver_count();
root_solver_ = Caffe::root_solver();
try {
thread_.reset(new boost::thread(&InternalThread::entry, this));
} catch (std::exception& e) {
CHECK(false) << e.what();
}
}
示例2: transform
void transform(Param<T> out, CParam<T> in, CParam<float> tf,
const bool inverse)
{
const dim_type nimages = in.dims[2];
// Multiplied in src/backend/transform.cpp
const dim_type ntransforms = out.dims[2] / in.dims[2];
// Copy transform to constant memory.
CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
cudaMemcpyDeviceToDevice));
dim3 threads(TX, TY, 1);
dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));
if (nimages > 1) { blocks.x *= nimages; }
if (ntransforms > 1) { blocks.y *= ntransforms; }
if(inverse) {
transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);
} else {
示例3: LOG
// Returns the elapsed time in milliseconds.
//
// Returns 0 (after logging a warning) if the timer has never been run.
// A timer that is still running is stopped first. In GPU mode the value comes
// from cudaEventElapsedTime on the start/stop events; otherwise it is the
// difference of the CPU timestamps. The result is also cached in
// elapsed_milliseconds_.
float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }

  const bool gpu_mode = (Caffe::mode() == Caffe::GPU);
  if (!gpu_mode) {
    // CPU path: difference of the recorded CPU timestamps.
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
    return elapsed_milliseconds_;
  }

#ifndef CPU_ONLY
  CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                  stop_gpu_));
#else
  NO_GPU;
#endif
  return elapsed_milliseconds_;
}
示例4: CUDA_CHECK
// One-time setup of the cuDNN convolution layer.
//
// After running the base ConvolutionLayer setup this:
//   * allocates group_ * CUDNN_STREAMS_PER_GROUP CUDA streams and cuDNN
//     handles and binds each handle to its stream,
//   * computes the per-group weight and bias offsets,
//   * creates the filter descriptor, one bottom/top tensor descriptor and one
//     convolution descriptor per bottom blob, and (when bias_term_ is set)
//     the bias tensor descriptor.
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);

  // Initialize CUDA streams and cuDNN.
  const int num_streams = this->group_ * CUDNN_STREAMS_PER_GROUP;
  stream_ = new cudaStream_t[num_streams];
  handle_ = new cudnnHandle_t[num_streams];
  for (int s = 0; s < num_streams; ++s) {
    CUDA_CHECK(cudaStreamCreate(&stream_[s]));
    CUDNN_CHECK(cudnnCreate(&handle_[s]));
    CUDNN_CHECK(cudnnSetStream(handle_[s], stream_[s]));
  }

  // Set the indexing parameters.
  const int outputs_per_group = this->num_output_ / this->group_;
  const int channels_per_group = this->channels_ / this->group_;
  weight_offset_ = outputs_per_group * channels_per_group
      * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = outputs_per_group;

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      outputs_per_group, channels_per_group,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int blob = 0; blob < bottom.size(); ++blob) {
    cudnnTensor4dDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);

    cudnnTensor4dDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);

    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}
示例5: morph
// Runs a morphological operation on `in` using `mask` as the structuring
// element and returns a new array with the dimensions of `in`.
//
// Only square masks of at most 19x19 are accepted; anything else is reported
// through CUDA_NOT_SUPPORTED. The mask is copied into the kernel::cFilter
// symbol on the active stream before the kernel wrapper is invoked.
// `isDilation` selects the dilation (true) or the other (false) kernel
// instantiation.
Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1]) {
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");
    }
    if (mdims[0] > 19) {
        CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");
    }

    Array<T> out = createEmptyArray<T>(in.dims());

    // Device-to-device upload of the mask into the filter symbol.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T), 0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getActiveStream()));

    if (isDilation) {
        kernel::morph<T, true>(out, in, mdims[0]);
    } else {
        kernel::morph<T, false>(out, in, mdims[0]);
    }
    return out;
}
示例6: caffe_copy
// Copies N elements of type Dtype from X to Y.
//
// Does nothing when X and Y are the same pointer. In GPU mode the copy goes
// through cudaMemcpy with cudaMemcpyDefault; otherwise caffe_cpu_copy is used.
// In a CPU_ONLY build, reaching the GPU path triggers NO_GPU.
void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X == Y) {
    return;
  }

  // If there are more than one openmp thread (we are in active region)
  // then checking Caffe::mode can create additional GPU Context,
  // so the mode is only queried outside of a parallel region.
  const bool copy_on_gpu =
#ifdef _OPENMP
      (omp_in_parallel() == 0) &&
#endif
      (Caffe::mode() == Caffe::GPU);

  if (!copy_on_gpu) {
    caffe_cpu_copy<Dtype>(N, X, Y);
    return;
  }

#ifndef CPU_ONLY
  // NOLINT_NEXT_LINE(caffe/alt_fn)
  CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
  NO_GPU;
#endif
}
示例7: switch
// Brings the data onto the CPU.
//
//   UNINITIALIZED: allocate a zero-filled host buffer; head becomes
//                  HEAD_AT_CPU and the buffer is marked as owned.
//   HEAD_AT_GPU:   allocate the host buffer if it does not exist yet, copy
//                  the device data into it; head becomes SYNCED.
//   HEAD_AT_CPU / SYNCED: nothing to do.
inline void SyncedMemory::to_cpu() {
  if (head_ == UNINITIALIZED) {
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
  } else if (head_ == HEAD_AT_GPU) {
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
  }
  // HEAD_AT_CPU and SYNCED: the host copy is already current.
}
示例8: normalizeGPULaunch
/*
// Launch GPU kernel of normalize
//
// The launch uses a 2D thread block of at most
// sqrt(max_threads_num) x sqrt(max_threads_num) threads, a grid that covers
// sizeX x sizeY with ceil-division, and NUM_SECTOR * 2 blocks along z.
//
// API
// int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
          CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
          CUstream stream);
// INPUT
// alfa
// dev_map_in
// dev_norm
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX = dev_map_in->sizeX;
    int sizeY = dev_map_in->sizeY;

    // An empty map has nothing to normalize. Returning here also avoids the
    // integer division by zero below (the thread count would be 0) and a
    // zero-sized grid.
    // NOTE(review): an empty map is reported as success - confirm that this
    // is the desired status for callers.
    if (sizeX <= 0 || sizeY <= 0)
    {
        return LATENT_SVM_OK;
    }

    // Kernel parameters are passed by address, in kernel-signature order.
    void *normalize_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map,
      (void *) &dev_map_out->map, (void *) &sizeX, (void *) &sizeY,
      (void *) &alfa, };

    // Largest square block side; computed once and truncated exactly as the
    // implicit double -> int conversion did before.
    const int max_side = static_cast<int>(std::sqrt(max_threads_num));

    const int thread_num_x = (sizeX < max_side) ? sizeX : max_side;
    const int thread_num_y = (sizeY < max_side) ? sizeY : max_side;
    const int thread_num_z = 1;

    // Ceil-division so that partial tiles at the border are covered.
    const int block_num_x = (sizeX + thread_num_x - 1) / thread_num_x;
    const int block_num_y = (sizeY + thread_num_y - 1) / thread_num_y;
    const int block_num_z = NUM_SECTOR * 2;

    const int sharedMemBytes = 0;

    CUresult res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}
示例9: PCAFeatureMapsAddNullableBorderGPULaunch
/*
// Launch GPU kernel of PCA feature maps
//
// The launch uses a 2D thread block of at most
// sqrt(max_threads_num) x sqrt(max_threads_num) threads and a grid that
// covers sizeX x sizeY with ceil-division (a single block along z).
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
          CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
          CUstream stream);
// INPUT
// dev_map_in
// bx
// by
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX = dev_map_in->sizeX;
    int sizeY = dev_map_in->sizeY;
    int p = dev_map_in->numFeatures;

    // An empty map has nothing to process. Returning here also avoids the
    // integer division by zero below (the thread count would be 0) and a
    // zero-sized grid.
    // NOTE(review): an empty map is reported as success - confirm that this
    // is the desired status for callers.
    if (sizeX <= 0 || sizeY <= 0)
    {
        return LATENT_SVM_OK;
    }

    // Kernel parameters are passed by address, in kernel-signature order.
    void *pca_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_map_out->map, (void *) &sizeX,
      (void *) &sizeY, (void *) &p, (void *) &bx, (void *) &by };

    // Largest square block side; computed once and truncated exactly as the
    // implicit double -> int conversion did before.
    const int max_side = static_cast<int>(std::sqrt(max_threads_num));

    const int thread_num_x = (sizeX < max_side) ? sizeX : max_side;
    const int thread_num_y = (sizeY < max_side) ? sizeY : max_side;
    const int thread_num_z = 1;

    // Ceil-division so that partial tiles at the border are covered.
    const int block_num_x = (sizeX + thread_num_x - 1) / thread_num_x;
    const int block_num_y = (sizeY + thread_num_y - 1) / thread_num_y;
    const int block_num_z = 1;

    const int sharedMemBytes = 0;

    CUresult res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0],
            block_num_x, block_num_y, block_num_z, thread_num_x, thread_num_y,
            thread_num_z, sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}
示例10: switch
// Makes sure a host-side buffer exists for this SyncedMemory.
//
// UNINITIALIZED: allocates the host buffer, aborts via CHECK if the
// allocation returned a null pointer, zero-fills it and moves head_ to
// HEAD_AT_CPU.
// HEAD_AT_CPU / SYNCED: nothing to do.
//
// NOTE(review): the HEAD_AT_GPU case is compiled out by the `#if 0` below,
// so when head_ == HEAD_AT_GPU the switch matches no label and this function
// returns without copying device data to the host. Confirm that this is
// intentional (it looks like a CPU-only build workaround).
inline void SyncedMemory::to_cpu() {
switch (head_) {
case UNINITIALIZED:
CaffeMallocHost(&cpu_ptr_, size_);
// Fail loudly, reporting the requested size, if host allocation failed.
CHECK(cpu_ptr_ != 0) << "size " << size_;
memset(cpu_ptr_, 0, size_);
head_ = HEAD_AT_CPU;
break;
// Disabled device -> host synchronization path (kept for reference).
#if 0
case HEAD_AT_GPU:
if (cpu_ptr_ == NULL) {
CaffeMallocHost(&cpu_ptr_, size_);
}
CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
head_ = SYNCED;
break;
#endif
case HEAD_AT_CPU:
case SYNCED:
break;
}
}
示例11: pinnedAlloc
// Returns a pinned (page-locked) host buffer able to hold `elements` objects
// of type T, or NULL when `elements` is 0.
//
// The request is rounded up to a whole number of megabytes. A free cached
// buffer of exactly that size is reused when available; otherwise a new one
// is obtained with cudaMallocHost and recorded in pinned_maps. If that first
// allocation fails, the pinned cache is garbage-collected and the allocation
// is retried once; a failure of the retry is reported through CUDA_CHECK.
T* pinnedAlloc(const size_t &elements)
{
    managerInit();
    T* ptr = NULL;

    // Allocate the higher megabyte. Overhead of creating pinned memory is
    // more so we want more reusable memory.
    size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

    if (elements > 0) {

        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
            pinnedGarbageCollect();
        }

        // Reuse a cached buffer of exactly the rounded size if one is free.
        for (mem_iter iter = pinned_maps.begin();
             iter != pinned_maps.end(); ++iter) {

            const mem_info &info = iter->second;
            if (info.is_free && info.bytes == alloc_bytes) {
                iter->second.is_free = false;
                pinned_used_bytes += alloc_bytes;
                return (T *)iter->first;
            }
        }

        // Perform garbage collection if memory can not be allocated
        if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
            // The failed attempt stays recorded as the runtime's "last error"
            // even if the retry succeeds. Clear it so that a later
            // cudaGetLastError()/cudaPeekAtLastError() check (e.g. after a
            // kernel launch) does not report this stale allocation failure.
            (void)cudaGetLastError();

            pinnedGarbageCollect();
            CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
        }

        mem_info info = {false, false, alloc_bytes};
        pinned_maps[ptr] = info;
        pinned_used_bytes += alloc_bytes;
    }
    return (T*)ptr;
}
示例12: LOG
// Returns the elapsed time in microseconds.
//
// Returns 0 (after logging a warning) if the timer has never been run.
// A timer that is still running is stopped first. With USE_CUDA and GPU mode
// the value is the CUDA event time in milliseconds scaled by 1000; otherwise
// it is the difference of the CPU timestamps. The result is cached in
// elapsed_microseconds_.
float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }

#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
    // Cuda only measure milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
    return elapsed_microseconds_;
  }
#endif

  // CPU path (also the only path when USE_CUDA is not defined).
  elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
  return elapsed_microseconds_;
}
示例13: calculateNormGPULaunch
/*
// Launch GPU kernel of calculate norm
//
// The launch uses a 2D thread block of at most
// sqrt(max_threads_num) x sqrt(max_threads_num) threads and a grid that
// covers sizeX x sizeY with ceil-division (a single block along z).
//
// API
//int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
          CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in
// stream
// OUTPUT
// dev_norm
// RESULT
// Error status
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX = dev_map_in->sizeX;
    int sizeY = dev_map_in->sizeY;
    int xp = dev_map_in->numFeatures;

    // An empty map has nothing to process. Returning here also avoids the
    // integer division by zero below (the thread count would be 0) and a
    // zero-sized grid.
    // NOTE(review): an empty map is reported as success - confirm that this
    // is the desired status for callers.
    if (sizeX <= 0 || sizeY <= 0)
    {
        return LATENT_SVM_OK;
    }

    // Kernel parameters are passed by address, in kernel-signature order.
    void *calc_norm_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map, (void *) &sizeX,
      (void *) &sizeY, (void *) &xp, };

    // Largest square block side; computed once and truncated exactly as the
    // implicit double -> int conversion did before.
    const int max_side = static_cast<int>(std::sqrt(max_threads_num));

    const int thread_num_x = (sizeX < max_side) ? sizeX : max_side;
    const int thread_num_y = (sizeY < max_side) ? sizeY : max_side;
    const int thread_num_z = 1;

    // Ceil-division so that partial tiles at the border are covered.
    const int block_num_x = (sizeX + thread_num_x - 1) / thread_num_x;
    const int block_num_y = (sizeY + thread_num_y - 1) / thread_num_y;
    const int block_num_z = 1;

    const int sharedMemBytes = 0;

    CUresult res = cuLaunchKernel(calculateNorm_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    // Label fixed from the misspelled "calcuateNorm" so that diagnostics
    // match the kernel name.
    CUDA_CHECK(res, "cuLaunchKernel(calculateNorm)");

    return LATENT_SVM_OK;
}
示例14: CUDA_CHECK
// Worker-thread body: binds the thread to `device` (GPU builds only), then
// serves jobs from task_queue_ until IsRunning() becomes false, and finally
// drains whatever is still queued.
//
// A job stays at the front of the queue while it is being dispatched and is
// popped only afterwards, followed by a notification on cond_finish_.
//
// Every access to task_queue_ is made while holding queue_mutex_; reading
// front()/size()/empty() without the lock would race with other threads that
// modify the queue.
void MPIComm::ThreadFunc(int device){
#ifndef CPU_ONLY
  //LOG(ERROR)<<"device_id is "<<device;
  CUDA_CHECK(cudaSetDevice(device));
#endif
  started_.store(true);
  MPIJob job;
  while (true){
    bool still_running = false;
    bool have_job = false;
    size_t pending = 0;
    {
      mutex::scoped_lock lock(queue_mutex_);
      while( task_queue_.empty() && IsRunning()){
        DLOG(INFO)<<"no job running, waiting on cond";
        cond_work_.wait(lock);
      }
      still_running = IsRunning();
      have_job = still_running && !task_queue_.empty();
      if (have_job){
        // Copy the job while the queue is protected; it is popped only
        // after it has been dispatched.
        job = task_queue_.front();
        pending = task_queue_.size();
      }
    }
    DLOG(INFO)<<"Cond fulfilled, dispatching job";
    if (!still_running){
      break;
    }
    if (!have_job){
      // Woken up with nothing to do; go back to waiting instead of calling
      // front() on an empty queue.
      continue;
    }
    DLOG(INFO)<<pending;
    DispatchJob(job);
    {
      mutex::scoped_lock pop_lock(queue_mutex_);
      task_queue_.pop();
    }
    cond_finish_.notify_one();
    DLOG(INFO)<<"job finished, poped taskqueue";
  }
  // finish remaining jobs
  while (true){
    // The emptiness test is done under the lock as well; the lock is held
    // across DispatchJob, as it was before.
    boost::lock_guard<mutex> lock(queue_mutex_);
    if (task_queue_.empty()){
      break;
    }
    job = task_queue_.front();
    task_queue_.pop();
    DispatchJob(job);
  }
}
示例15: morph
// Runs a morphological operation on `in` using `mask` as the structuring
// element and returns a new array with the dimensions of `in`.
//
// Only square masks of at most 19x19 are accepted; anything else raises
// AF_ERROR with AF_ERR_SIZE. The mask is copied into the kernel::cFilter
// symbol on the stream of the active device before the kernel wrapper is
// invoked. `isDilation` selects the dilation (true) or the other (false)
// kernel instantiation.
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1]) {
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    }
    if (mdims[0] > 19) {
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);
    }

    Array<T> out = createEmptyArray<T>(in.dims());

    // Device-to-device upload of the mask into the filter symbol.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice,
        cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation) {
        kernel::morph<T, true >(out, in, mdims[0]);
    } else {
        kernel::morph<T, false>(out, in, mdims[0]);
    }
    return out;
}