當前位置: 首頁>>代碼示例>>C++>>正文


C++ CUDA_CHECK函數代碼示例

本文整理匯總了C++中CUDA_CHECK函數的典型用法代碼示例。如果您正苦於以下問題:C++ CUDA_CHECK函數的具體用法?C++ CUDA_CHECK怎麽用?C++ CUDA_CHECK使用的例子?那麽, 這裏精選的函數代碼示例或許可以為您提供幫助。


在下文中一共展示了CUDA_CHECK函數的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的C++代碼示例。

示例1: StopInternalThread

void InternalThread::StartInternalThread() {
  // TODO switch to failing once Caffe prefetch thread is persistent.
  // Threads should not be started and stopped repeatedly.
  // CHECK(!is_started());
  StopInternalThread();

#ifndef CPU_ONLY
  CUDA_CHECK(cudaGetDevice(&device_));
#endif
  mode_ = Caffe::mode();
  rand_seed_ = caffe_rng_rand();
  solver_count_ = Caffe::solver_count();
  root_solver_ = Caffe::root_solver();

  try {
    thread_.reset(new boost::thread(&InternalThread::entry, this));
  } catch (std::exception& e) {
    CHECK(false) << e.what();
  }
}
開發者ID:XinLiuNvidia,項目名稱:caffe,代碼行數:20,代碼來源:internal_thread.cpp

示例2: transform

        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            const dim_type nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const dim_type ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                          cudaMemcpyDeviceToDevice));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            if (nimages > 1)     { blocks.x *= nimages;   }
            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);
            } else {
開發者ID:EasonYi,項目名稱:arrayfire,代碼行數:20,代碼來源:transform.hpp

示例3: LOG

float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
#else
      NO_GPU;
#endif
  } else {
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  }
  return elapsed_milliseconds_;
}
開發者ID:azrael417,項目名稱:caffe,代碼行數:20,代碼來源:benchmark.cpp

示例4: CUDA_CHECK

void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensor4dDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensor4dDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}
開發者ID:13331151,項目名稱:deeplab-public,代碼行數:41,代碼來源:cudnn_conv_layer.cpp

示例5: morph

Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");

    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
開發者ID:9prady9,項目名稱:arrayfire,代碼行數:21,代碼來源:morph_impl.hpp

示例6: caffe_copy

void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X != Y) {
    // If there are more than one openmp thread (we are in active region)
    // then checking Caffe::mode can create additional GPU Context
    //
    if (
#ifdef _OPENMP
        (omp_in_parallel() == 0) &&
#endif
        (Caffe::mode() == Caffe::GPU)) {
#ifndef CPU_ONLY
      // NOLINT_NEXT_LINE(caffe/alt_fn)
      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
      NO_GPU;
#endif
    } else {
      caffe_cpu_copy<Dtype>(N, X, Y);
    }
  }
}
開發者ID:crobertob,項目名稱:caffe,代碼行數:21,代碼來源:math_functions.cpp

示例7: switch

// 把數據放到cpu上
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
開發者ID:clarencezhang,項目名稱:caffe-windows-multilabels,代碼行數:22,代碼來源:syncedmem.cpp

示例8: normalizeGPULaunch

/*
// Launch GPU kernel of normalize
//
// API
// int normalizeGPULaunch(const int alfa, CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
           CUstream stream);
// INPUT
// alfa
// dev_map_in
// dev_norm
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX, sizeY;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;

    void *normalize_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map,
            (void *) &dev_map_out->map, (void *) &sizeX, (void *) &sizeY,
            (void *) &alfa, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = NUM_SECTOR * 2;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}
開發者ID:ZenzouFuruta,項目名稱:Autoware,代碼行數:57,代碼來源:featurepyramid_gpu.cpp

示例9: PCAFeatureMapsAddNullableBorderGPULaunch

/*
// Launch GPU kernel of PCA feature maps
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
           CUstream stream);
// INPUT
// dev_map_in
// bx
// by
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX, sizeY, p;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    p = dev_map_in->numFeatures;

    void *pca_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_map_out->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &p, (void *) &bx, (void *) &by };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}
開發者ID:ZenzouFuruta,項目名稱:Autoware,代碼行數:57,代碼來源:featurepyramid_gpu.cpp

示例10: switch

inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
#if 0
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
#endif
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
開發者ID:Devy001,項目名稱:Caffe-mini,代碼行數:22,代碼來源:syncedmem.cpp

示例11: pinnedAlloc

    T* pinnedAlloc(const size_t &elements)
    {
        managerInit();
        T* ptr = NULL;
        // Allocate the higher megabyte. Overhead of creating pinned memory is
        // more so we want more resuable memory.
        size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
                pinnedGarbageCollect();
            }

            for(mem_iter iter = pinned_maps.begin();
                iter != pinned_maps.end(); ++iter) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    pinned_used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
                pinnedGarbageCollect();
                CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, false, alloc_bytes};
            pinned_maps[ptr] = info;
            pinned_used_bytes += alloc_bytes;
        }
        return (T*)ptr;
    }
開發者ID:hxiaox,項目名稱:arrayfire,代碼行數:39,代碼來源:memory.cpp

示例12: LOG

float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING)<< "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
            stop_gpu_));
    // Cuda only measure milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  } else {
#endif
    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
#ifdef USE_CUDA
  }
#endif
  return elapsed_microseconds_;
}
開發者ID:rickyHong,項目名稱:CaffeForOpenCL,代碼行數:22,代碼來源:benchmark.cpp

示例13: calculateNormGPULaunch

/*
// Launch GPU kernel of calculate norm
//
// API
//int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
          CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in
// stream
// OUTPUT
// dev_norm
// RESULT
// Error status
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX, sizeY, xp;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    xp = dev_map_in->numFeatures;

    void *calc_norm_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &xp, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(calculateNorm_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(calcuateNorm)");

    return LATENT_SVM_OK;
}
開發者ID:ZenzouFuruta,項目名稱:Autoware,代碼行數:53,代碼來源:featurepyramid_gpu.cpp

示例14: CUDA_CHECK

void MPIComm::ThreadFunc(int device){
#ifndef CPU_ONLY
  //LOG(ERROR)<<"device_id is "<<device;
  CUDA_CHECK(cudaSetDevice(device));
#endif
  started_.store(true);
  MPIJob job;
  while (true){
    mutex::scoped_lock lock(queue_mutex_);
    while( task_queue_.empty() && IsRunning()){
      DLOG(INFO)<<"no job running, waiting on cond";
      cond_work_.wait(lock);
    }
    lock.unlock();

    DLOG(INFO)<<"Cond fulfilled, dispatching job";
    if (IsRunning()){
      job = task_queue_.front();
      DLOG(INFO)<<task_queue_.size();
      DispatchJob(job);
      mutex::scoped_lock pop_lock(queue_mutex_);
      task_queue_.pop();
      pop_lock.unlock();
      cond_finish_.notify_one();
      DLOG(INFO)<<"job finished, poped taskqueue";
    }else{
      break;
    }

  }

  // finish remaining jobs
  while (!task_queue_.empty()){
    boost::lock_guard<mutex> lock(queue_mutex_);
    job = task_queue_.front();
    task_queue_.pop();
    DispatchJob(job);
  }
}
開發者ID:xiangqiaolxq,項目名稱:caffe-parallel,代碼行數:39,代碼來源:mpijob.cpp

示例15: morph

Array<T>  morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T),
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
開發者ID:hxiaox,項目名稱:arrayfire,代碼行數:23,代碼來源:morph_impl.hpp


注:本文中的CUDA_CHECK函數示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。