本文整理汇总了C++中cudaStreamDestroy函数的典型用法代码示例。如果您正苦于以下问题:C++ cudaStreamDestroy函数的具体用法?C++ cudaStreamDestroy怎么用?C++ cudaStreamDestroy使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了cudaStreamDestroy函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: CopySegment
void CopySegment(int a, int b)
{
    // Benchmarks simultaneous peer-to-peer copies between devices `a` and
    // `b`: FLAGS_size bytes per direction per repetition, each direction
    // enqueued on its own non-blocking stream so the transfers can overlap.
    void *bufa = nullptr, *bufa2 = nullptr;   // buffers on device a
    void *bufb = nullptr, *bufb2 = nullptr;   // buffers on device b
    cudaStream_t stream_a, stream_b;

    // Allocate the buffers; each stream is created while its device is
    // current, so it is bound to that device.
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaMalloc(&bufa, FLAGS_size));
    CUDA_CHECK(cudaMalloc(&bufa2, FLAGS_size));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream_a, cudaStreamNonBlocking));
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaMalloc(&bufb, FLAGS_size));
    CUDA_CHECK(cudaMalloc(&bufb2, FLAGS_size));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream_b, cudaStreamNonBlocking));

    // Quiesce both devices so timing starts from an idle state.
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaDeviceSynchronize());

    // Enqueue all repetitions (a->b on stream_b, b->a on stream_a), then
    // drain both devices and stop the clock.
    auto start = std::chrono::high_resolution_clock::now();
    for(uint64_t i = 0; i < FLAGS_repetitions; ++i)
    {
        CUDA_CHECK(cudaMemcpyPeerAsync(bufb, b, bufa, a,
                                       FLAGS_size, stream_b));
        CUDA_CHECK(cudaMemcpyPeerAsync(bufa2, a, bufb2, b,
                                       FLAGS_size, stream_a));
    }
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaDeviceSynchronize());
    auto stop = std::chrono::high_resolution_clock::now();

    // Average wall time per repetition, in milliseconds.
    double mstime = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0 / FLAGS_repetitions;
    // MiB/s = [bytes / (1024^2)] / [ms / 1000]
    double MBps = (FLAGS_size / 1024.0 / 1024.0) / (mstime / 1000.0);
    printf("%.2lf MB/s (%lf ms)\n", MBps, mstime);

    // Release each device's resources with that device current.
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaFree(bufa));
    CUDA_CHECK(cudaFree(bufa2));
    CUDA_CHECK(cudaStreamDestroy(stream_a));
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaFree(bufb));
    CUDA_CHECK(cudaFree(bufb2));
    CUDA_CHECK(cudaStreamDestroy(stream_b));
}
示例2: CAFFE1_CUDA_CHECK
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
// Prefetch-thread loop: pops free Batch objects, fills them (optionally
// together with an "untransformed" companion batch), pushes their data to
// the GPU, and hands them to the full queues.  Runs until must_stop()
// returns true or the thread is interrupted.
#ifndef CPU_ONLY
cudaStream_t stream;
// stream2 is created and destroyed only when untransformed_top_ is set.
// NOTE(review): assumes untransformed_top_ and Caffe::mode() stay constant
// for this thread's lifetime; otherwise stream2 could be destroyed
// uninitialized -- confirm.
cudaStream_t stream2;
if (Caffe::mode() == Caffe::GPU) {
CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
if (untransformed_top_)
CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
}
#endif
try {
while (!must_stop()) {
Batch<Dtype>* batch = prefetch_free_.pop();
Batch<Dtype>* batch_untransformed = NULL;
if (untransformed_top_)
{
// Load the transformed and untransformed batches together.
batch_untransformed = prefetch_free_untransformed_.pop();
load_batch_and_untransformed_batch(batch,batch_untransformed);
}
else
load_batch(batch);
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
// Push each batch's data to the GPU and wait for the copy to finish
// before publishing the batch to the consumer queues below.
batch->data_.data().get()->async_gpu_push(stream);
CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream));
if (untransformed_top_)
{
batch_untransformed->data_.data().get()->async_gpu_push(stream2);
CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream2));
}
}
#endif
prefetch_full_.push(batch);
if (untransformed_top_)
prefetch_full_untransformed_.push(batch_untransformed);
}
} catch (boost::thread_interrupted&) {
// Interrupted exception is expected on shutdown
}
#ifndef CPU_ONLY
// Destroy the streams under the same conditions they were created.
if (Caffe::mode() == Caffe::GPU) {
CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream));
if (untransformed_top_)
CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream2));
}
#endif
}
示例3: cudnnDestroyTensorDescriptor
CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
// Releases every cuDNN descriptor, cuDNN handle and CUDA stream owned by
// the layer, plus the shared workspace and the per-group host arrays.
// Check that handles have been setup before destroying.
if (!handles_setup_) { return; }
// Per-bottom/top tensor descriptors and convolution descriptors.
for (int_tp i = 0; i < bottom_descs_.size(); i++) {
cudnnDestroyTensorDescriptor(bottom_descs_[i]);
cudnnDestroyTensorDescriptor(top_descs_[i]);
cudnnDestroyConvolutionDescriptor(conv_descs_[i]);
}
if (this->bias_term_) {
cudnnDestroyTensorDescriptor(bias_desc_);
}
cudnnDestroyFilterDescriptor(filter_desc_);
// One CUDA stream and one cuDNN handle per (group x stream) slot.
for (int_tp g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
cudaStreamDestroy(stream_[g]);
cudnnDestroy(handle_[g]);
}
cudaFree(workspaceData);
// Host-side bookkeeping arrays.
delete [] stream_;
delete [] handle_;
delete [] fwd_algo_;
delete [] bwd_filter_algo_;
delete [] bwd_data_algo_;
delete [] workspace_fwd_sizes_;
delete [] workspace_bwd_data_sizes_;
delete [] workspace_bwd_filter_sizes_;
}
示例4: TEST_P
TEST_P(MemcpyAsync, D2DTransfers) {
    // Copies 2^param bytes device-to-device on a user-created stream,
    // checking every runtime call.  The buffer contents are irrelevant;
    // only the success of the transfer is under test.
    const size_t log2_bytes = GetParam();
    const size_t bytes = 1 << log2_bytes;

    void *src, *dst;
    cudaError_t status = cudaMalloc(&src, bytes);
    ASSERT_EQ(cudaSuccess, status);
    status = cudaMalloc(&dst, bytes);
    ASSERT_EQ(cudaSuccess, status);

    cudaStream_t stream;
    status = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, status);

    status = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream);
    ASSERT_EQ(cudaSuccess, status);
    status = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, status);

    status = cudaFree(src);
    ASSERT_EQ(cudaSuccess, status);
    status = cudaFree(dst);
    ASSERT_EQ(cudaSuccess, status);
    status = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, status);
}
示例5: CUDA_CHECK
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
// Prefetch-thread main loop: repeatedly takes a free Batch, fills it from
// the data source, pushes it to the GPU when in GPU mode, and hands it to
// the consumer queue.  Runs until must_stop() or thread interruption.
#ifndef CPU_ONLY
cudaStream_t stream;// create a CUDA stream, non-blocking type
if (Caffe::mode() == Caffe::GPU) {
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
}
#endif
try {
while (!must_stop()) { // loop, loading batches of data
Batch<Dtype>* batch = prefetch_free_.pop();// take a free batch
load_batch(batch);// load a batch of data into it
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
batch->data_.data().get()->async_gpu_push(stream);
if (this->output_labels_) {
batch->label_.data().get()->async_gpu_push(stream);
}
CUDA_CHECK(cudaStreamSynchronize(stream));// wait for the GPU copies
}
#endif
prefetch_full_.push(batch);// enqueue into the loaded-batch queue
}
} catch (boost::thread_interrupted&) {// catch interruption, exit the loop
// Interrupted exception is expected on shutdown
}
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
CUDA_CHECK(cudaStreamDestroy(stream));// destroy the CUDA stream
}
#endif
}
示例6: TEST
/**
 * CUDA4 introduced the cudaMemcpyDefault direction, letting the runtime
 * infer the transfer direction from the pointers themselves.  Exercise
 * all four pointer combinations (H2H, D2H, H2D, D2D) through one stream.
 */
TEST(MemcpyAsync, CheckDefaultDirection) {
    cudaError_t status;
    cudaStream_t stream;
    status = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, status);

    int host_dst = 0;   // host-side buffer (used as both src and dst)
    int host_src = 0;   // second host-side buffer
    int * dev;          // device-side buffer
    status = cudaMalloc((void**) &dev, sizeof(*dev));
    ASSERT_EQ(cudaSuccess, status);

    // host <- host
    status = cudaMemcpyAsync(&host_dst, &host_src, sizeof(host_dst), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, status);
    // host <- device
    status = cudaMemcpyAsync(&host_dst, dev, sizeof(host_dst), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, status);
    // device <- host
    status = cudaMemcpyAsync(dev, &host_dst, sizeof(host_dst), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, status);
    // device <- device
    status = cudaMemcpyAsync(dev, dev, sizeof(host_dst), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, status);

    status = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, status);

    status = cudaFree(dev);
    ASSERT_EQ(cudaSuccess, status);
    status = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, status);
}
示例7: TEST
TEST(StreamQuery, InvalidStream) {
    // Queries a stream after it has been destroyed.  Drivers >= 5.0 return
    // an error code; older drivers segfault, which the death test captures.
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";
    cudaError_t ret;
    cudaStream_t stream;
    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);
    ASSERT_EQ(cudaSuccess, ret);
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);
    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
    if (driver >= 5000) {
        ret = cudaStreamQuery(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        EXPECT_EXIT({
            cudaStreamQuery(stream); },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
    // Fix: the closing brace of the TEST body was missing in the original.
}
示例8: THCudaShutdown
void THCudaShutdown(THCState* state)
{
// Tears down the global CUDA state held by THCState: RNG state, BLAS
// handles, cached device properties, and every user-created stream on
// every device.  The caller's current device is saved up front and
// restored at the end so the shutdown is transparent to the caller.
THCRandom_shutdown(state);
THCudaBlas_shutdown(state);
free(state->blasState);
free(state->rngState);
free(state->deviceProperties);
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
for (int dev = 0; dev < state->numDevices; ++dev) {
THCudaCheck(cudaSetDevice(dev));
/* Free Torch-defined streams (0 is the default stream) */
for (int stream = 1; stream <= state->numUserStreams; ++stream) {
THCudaCheck(cudaStreamDestroy(state->streamsPerDevice[dev][stream]));
}
free(state->streamsPerDevice[dev]);
}
free(state->streamsPerDevice);
THCudaCheck(cudaSetDevice(prevDev));
}
示例9: TEST
TEST(EventRecord, RecordAfterDestroy) {
// Verifies the runtime's behavior when recording an event that has already
// been destroyed.  CUDA 5.0+ returns an error code; older runtimes
// segfault, which the death test captures.
::testing::FLAGS_gtest_death_test_style = "threadsafe";
cudaError_t ret;
cudaEvent_t event;
cudaStream_t stream;
ret = cudaEventCreate(&event);
ASSERT_EQ(cudaSuccess, ret);
ret = cudaEventDestroy(event);
EXPECT_EQ(cudaSuccess, ret);
ret = cudaStreamCreate(&stream);
ASSERT_EQ(cudaSuccess, ret);
#if CUDART_VERSION >= 5000
// NOTE(review): this branch records on the default stream (no `stream`
// argument) while the legacy branch below passes `stream` explicitly --
// confirm the asymmetry is intentional.
ret = cudaEventRecord(event);
EXPECT_EQ(cudaErrorUnknown, ret);
#else
EXPECT_EXIT(
cudaEventRecord(event, stream),
::testing::KilledBySignal(SIGSEGV), "");
#endif
ret = cudaStreamDestroy(stream);
EXPECT_EQ(cudaSuccess, ret);
}
示例10: cudaEventDestroy
// Destroys the future's CUDA event and, when this future owns its stream,
// the stream as well.  valid() guards against acting on an empty future.
// CUDA errors are swallowed (a destructor must not throw/propagate) and are
// only reported via printf when printf is available in this compilation mode.
__host__ __device__
~future()
{
if(valid())
{
#if __BULK_HAS_CUDART__
// swallow errors
cudaError_t e = cudaEventDestroy(m_event);
#if __BULK_HAS_PRINTF__
if(e)
{
printf("CUDA error after cudaEventDestroy in future dtor: %s", cudaGetErrorString(e));
} // end if
#endif // __BULK_HAS_PRINTF__
// Only destroy the stream when this future owns it (it may be borrowed).
if(m_owns_stream)
{
e = cudaStreamDestroy(m_stream);
#if __BULK_HAS_PRINTF__
if(e)
{
printf("CUDA error after cudaStreamDestroy in future dtor: %s", cudaGetErrorString(e));
} // end if
#endif // __BULK_HAS_PRINTF__
} // end if
#endif
} // end if
} // end ~future()
示例11: CUDA_CHECK
void BasePrefetchingLabelmapDataLayer<Dtype>::InternalThreadEntry() {
// Prefetch-thread loop for labelmap data: takes a free LabelmapBatch,
// fills it, pushes its data to the GPU on a dedicated stream when in GPU
// mode, then queues it for consumers.  Runs until must_stop() or
// thread interruption.
#ifndef CPU_ONLY
cudaStream_t stream;
if (Caffe::mode() == Caffe::GPU) {
// Non-blocking stream so the async push does not synchronize with the
// legacy default stream.
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
}
#endif
try {
while (!must_stop()) {
LabelmapBatch<Dtype>* batch = prefetch_free_.pop();
load_batch(batch);
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
batch->data_.data().get()->async_gpu_push(stream);
// Wait for the async copy before exposing the batch to consumers.
CUDA_CHECK(cudaStreamSynchronize(stream));
}
#endif
prefetch_full_.push(batch);
}
} catch (boost::thread_interrupted&) {
// Interrupted exception is expected on shutdown
}
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
CUDA_CHECK(cudaStreamDestroy(stream));
}
#endif
}
示例12: cudaGetDevice
CudaStream::~CudaStream() {
  // Destroys the owned stream unless the CUDA runtime is already being
  // unloaded (process teardown), in which case calling into the runtime
  // is skipped.
  int current_device;  // Only used to probe the runtime's status.
  // Fix: the original read `cudaGetDevice(¤t_device)` -- HTML-entity
  // mojibake of `&current_device` -- which does not compile.
  cudaError_t status = cudaGetDevice(&current_device);
  // Preventing dead lock while Caffe shutting down.
  if (status != cudaErrorCudartUnloading) {
    CUDA_CHECK(cudaStreamDestroy(stream_));
  }
}
示例13: ActivateDevice
GpuDevice::Impl::~Impl() {
// Releases the per-device cuDNN and cuBLAS handles and their associated
// CUDA streams.  ActivateDevice() makes this device current first so the
// destroy calls target the right context.
ActivateDevice();
for (size_t i = 0; i < kParallelism; ++i) {
CUDNN_CALL(cudnnDestroy(cudnn_handle[i]));
CUBLAS_CALL(cublasDestroy(cublas_handle[i]));
CUDA_CALL(cudaStreamDestroy(stream[i]));
}
}
示例14: CUDA_CHECK
NCCL<Dtype>::~NCCL() {
// The reduction stream exists only when layer-wise reduce is enabled, so
// it is destroyed under the same condition it was created.
if (solver_->param().layer_wise_reduce()) {
CUDA_CHECK(cudaStreamDestroy(stream_));
}
// Destroy the NCCL communicator if it was ever initialized.
if (comm_) {
ncclCommDestroy(comm_);
}
}
示例15: cudaStreamDestroy
void BilateralFilterLayer<Dtype>::cudastream_free() {
// Destroys the layer's CUDA stream and releases the holder array, leaving
// stream_ NULL so repeated calls are safe.
#ifndef CPU_ONLY
if(stream_ != NULL) {
// NOTE(review): only *stream_ (the first element) is passed to
// cudaStreamDestroy even though stream_ is released with delete[] --
// confirm the array holds exactly one stream, otherwise the rest leak.
cudaStreamDestroy(*stream_);
delete [] stream_;
stream_ = NULL;
}
#endif
}