

C++ cudaStreamSynchronize Function Code Examples

This article collects typical usage examples of the cudaStreamSynchronize function in C++. If you have been wondering how exactly cudaStreamSynchronize is used, how to call it, or what real-world examples look like, the hand-picked code samples below should help.


Fifteen code examples of the cudaStreamSynchronize function are shown below, sorted by popularity by default.
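
Before working through the examples, here is a minimal, self-contained sketch of the pattern most of them follow: enqueue asynchronous work (copies and a kernel launch) on a stream, then call cudaStreamSynchronize to block the host thread until everything queued on that stream has completed. The kernel name scale, the buffer size, and the launch configuration are illustrative assumptions rather than code taken from any of the projects below.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

__global__ void scale(float *x, float a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}

int main(void)
{
    const int n = 1 << 20;
    float *h = (float *)malloc(n * sizeof(float));
    for (int i = 0; i < n; ++i) h[i] = 1.0f;

    float *d;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMalloc((void **)&d, n * sizeof(float));

    // Enqueue asynchronous work on the stream: copy in, launch the kernel, copy out.
    cudaMemcpyAsync(d, h, n * sizeof(float), cudaMemcpyHostToDevice, stream);
    scale<<<(n + 255) / 256, 256, 0, stream>>>(d, 2.0f, n);
    cudaMemcpyAsync(h, d, n * sizeof(float), cudaMemcpyDeviceToHost, stream);

    // Block the host until all work queued on this stream has finished.
    cudaError_t err = cudaStreamSynchronize(stream);
    printf("sync: %s, h[0] = %f\n", cudaGetErrorString(err), h[0]);

    cudaFree(d);
    cudaStreamDestroy(stream);
    free(h);
    return 0;
}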

Example 1: collect_final_result_dsyrk_syr2k

void collect_final_result_dsyrk_syr2k(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, double** C_dev, int block_dim, int stream_num, int x,int y, int z, int nrowc, int ncolc, int ldc, double *C, enum CBLAS_UPLO Uplo) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        //assume 1-switcher
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
Developer: 529038378, Project: BLASX, Lines of code: 31, Source file: blasx_mem_control.c

Example 2: collect_final_result_zgemm

void collect_final_result_zgemm(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, cuDoubleComplex** C_dev, int block_dim, int stream_num, int x, int y, int z, int nrowc, int ncolc, int ldc, cuDoubleComplex *C) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
}
Developer: 529038378, Project: BLASX, Lines of code: 30, Source file: blasx_mem_control.c

Example 3: MyStreamSynchronize

cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
{
	cudaError_t result = cudaSuccess;
	if (abort_flag)
		return result;
	if (situation >= 0)
	{
		static std::map<int, tsumarray> tsum;

		double a = 0.95, b = 0.05;
		if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence

		double tsync = 0.0;
		double tsleep = 0.95 * tsum[situation].value[thr_id];
		if (cudaStreamQuery(stream) == cudaErrorNotReady)
		{
			usleep((useconds_t)(1e6*tsleep));
			struct timeval tv_start, tv_end;
			gettimeofday(&tv_start, NULL);
			result = cudaStreamSynchronize(stream);
			gettimeofday(&tv_end, NULL);
			tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
		}
		if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
	}
	else
		result = cudaStreamSynchronize(stream);
	return result;
}
Developer: jcvernaleo, Project: ccminer, Lines of code: 29, Source file: cuda.cpp

Example 4: collect_final_result_dtrsm_mode_1

void collect_final_result_dtrsm_mode_1(int *tasks_rs, int *tasks_rs_size, int switcher, int switcher_rs, cudaStream_t *stream, double** buffer_dev, int block_dim, int stream_num, int x, int y, int z, int nrowb, int ncolb, int ldb, double *B, int* switch_tracker) {
    int temp = 0;
    for (temp = tasks_rs_size[switcher_rs]; temp < tasks_rs_size[1-switcher_rs] ; temp++) {
        //            printf("retrieving B[%d, %d] @stream=%d switcher:%d\n", z, tasks_rs[temp+STREAMNUM*(1-switcher_rs)], temp, switcher);
        int row = z;
        int col = tasks_rs[temp+stream_num*(1-switcher_rs)];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher_rs]; temp++) {
        //assume 1-switcher
        //printf("retrieving B[%d, %d] @stream=%d\n", z, tasks_rs[temp+STREAMNUM*switcher_rs], temp);
        int row = z;
        int col = tasks_rs[temp+stream_num*switcher_rs];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
Developer: 529038378, Project: BLASX, Lines of code: 30, Source file: blasx_mem_control.c

Example 5: get_gpu

JNIEXPORT jdouble JNICALL Java_org_apache_spark_mllib_classification_LogisticRegressionNative_predictPoint
    (JNIEnv *env, jobject obj, jdoubleArray data, jdoubleArray weights, jdouble intercept) {

	// the kernel is written to take multiple data sets and produce a set of results, but we're going
	// to run it as multiple parallel kernels, each producing a single result instead
	double *d_dataBuffer, *d_weightsBuffer, *d_score;
	int dataCount, dataLen, whichGPU;
	jdouble h_score, *h_dataBuffer, *h_weightsBuffer;
	cudaStream_t stream;

	// select a GPU for *this* specific dataset
	whichGPU = get_gpu();
	checkCudaErrors(cudaSetDevice(whichGPU));
	checkCudaErrors(cudaStreamCreate(&stream));

	// get a pointer to the raw input data, pinning them in memory
	dataCount = env->GetArrayLength(data);
	dataLen = dataCount*sizeof(double);
	assert(dataCount == env->GetArrayLength(weights));
	h_dataBuffer = (jdouble*) env->GetPrimitiveArrayCritical(data, 0);
	h_weightsBuffer = (jdouble*) env->GetPrimitiveArrayCritical(weights, 0);

	// copy input data to the GPU memory
	// TODO: It may be better to access host memory directly, skipping the copy.  Investigate.
	checkCudaErrors(mallocBest((void**)&d_dataBuffer, dataLen));
	checkCudaErrors(mallocBest((void**)&d_weightsBuffer, dataLen));
	checkCudaErrors(cudaMemcpyAsync(d_dataBuffer, h_dataBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	checkCudaErrors(cudaMemcpyAsync(d_weightsBuffer, h_weightsBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	// synchronize before unpinning, and also because there is a device-device transfer in predictKernelDevice
	checkCudaErrors(cudaStreamSynchronize(stream));
	// un-pin the host arrays, as we're done with them
	env->ReleasePrimitiveArrayCritical(data, h_dataBuffer, 0);
	env->ReleasePrimitiveArrayCritical(weights, h_weightsBuffer, 0);

	// allocate storage for the result
	checkCudaErrors(mallocBest((void**)&d_score, sizeof(double)));

	// run the kernel, to produce a result
	predictKernelDevice(d_dataBuffer, d_weightsBuffer, intercept, d_score, 1, dataCount, stream);

	checkCudaErrors(cudaStreamSynchronize(stream));

	// copy result back to host
	checkCudaErrors(cudaMemcpyAsync(&h_score, d_score, sizeof(double), cudaMemcpyDeviceToHost, stream));

	checkCudaErrors(cudaStreamSynchronize(stream));

	// Free the GPU buffers
	checkCudaErrors(freeBest(d_dataBuffer));
	checkCudaErrors(freeBest(d_weightsBuffer));
	checkCudaErrors(freeBest(d_score));

	checkCudaErrors(cudaStreamDestroy(stream));

	return h_score;
}
Developer: IBMSparkGPU, Project: CUDA-MLlib, Lines of code: 56, Source file: LogisticRegressionNative.cpp

Example 6: dw_common_codelet_update_u11

static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) 
{
	float *sub11;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 

	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);

	unsigned long z;

	switch (s)
	{
		case 0:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				pivot = sub11[z+z*ld];
				STARPU_ASSERT(pivot != 0.0f);
		
				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
		
				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
						&sub11[z+(z+1)*ld], ld,
						&sub11[(z+1)+z*ld], 1,
						&sub11[(z+1) + (z+1)*ld],ld);
			}
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
				cudaStreamSynchronize(starpu_cuda_get_local_stream());

				STARPU_ASSERT(pivot != 0.0f);
				
				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
				
				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
								&sub11[z+(z+1)*ld], ld,
								&sub11[(z+1)+z*ld], 1,
								&sub11[(z+1) + (z+1)*ld],ld);
			}

			cudaStreamSynchronize(starpu_cuda_get_local_stream());

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Developer: excess-project, Project: starpu-ex-1.2.0rc5, Lines of code: 55, Source file: dw_factolu_kernels.c

Example 7: CAFFE1_CUDA_CHECK

void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  cudaStream_t stream2;
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      Batch<Dtype>* batch = prefetch_free_.pop();
      Batch<Dtype>* batch_untransformed = NULL;
      if (untransformed_top_)
        {
          batch_untransformed = prefetch_free_untransformed_.pop();
          load_batch_and_untransformed_batch(batch,batch_untransformed);
        }
      else
        load_batch(batch);

#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream));
        if (untransformed_top_)
          {
            batch_untransformed->data_.data().get()->async_gpu_push(stream2);
            CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream2));
          }
      }
#endif
      prefetch_full_.push(batch);
      if (untransformed_top_)
        prefetch_full_untransformed_.push(batch_untransformed);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream2));
  }
#endif
}
Developer: beniz, Project: caffe, Lines of code: 49, Source file: base_data_layer.cpp

Example 8: strmm_cuda

static void strmm_cuda(void *descr[], void *args) {
  float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
  float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
  float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

  unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
  unsigned h = STARPU_MATRIX_GET_NX(descr[1]);

  unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
  unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
  unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
  
  struct strmm_arg * arg = (struct strmm_arg *)args;

  cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
  cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
  cublasOperation_t trans = CUBLAS_OP_T;

  const float factor = 1.0f;
  
  cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

  cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc);
  cudaStreamSynchronize(starpu_cuda_get_local_stream());

  free(arg);
}
Developer: hsyl20, Project: HaskellPU, Lines of code: 28, Source file: FloatMatrixStrmm_c.c

Example 9: TransferAllHalos

/**
 * @brief This performs the exchanging of all necessary halos between 2 neighboring MPI processes
 *
 * @param[in]		cartComm	The Cartesian MPI communicator
 * @param[in]		domSize		The 2D size of the local domain
 * @param[in]		topIndex	The 2D index of the calling MPI process in the topology
 * @param[in]		neighbors	The list of ranks which are direct neighbors to the caller
 * @param[in]		copyStream	The stream used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[in, out]	devBlocks	The 2 device blocks that are updated during the Jacobi run
 * @param[in, out]	devSideEdges	The 2 side edges (parallel to the Y direction) that hold the packed halo values before sending them
 * @param[in, out]	devHaloLines	The 2 halo lines (parallel to the Y direction) that hold the packed halo values after receiving them
 * @param[in, out] 	hostSendLines	The 2 host send buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @param[in, out]	hostRecvLines	The 2 host receive buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @return				The time spent during the MPI transfers
 */
double TransferAllHalos(MPI_Comm cartComm, const int2 * domSize, const int2 * topIndex, const int * neighbors, cudaStream_t copyStream,
	real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2], real * hostSendLines[2], real * hostRecvLines[2])
{
	real * devSendLines[2] = {devBlocks[0] + domSize->x + 3, devBlocks[0] + domSize->y * (domSize->x + 2) + 1};
	real * devRecvLines[2] = {devBlocks[0] + 1, devBlocks[0] + (domSize->y + 1) * (domSize->x + 2) + 1};
	int yNeighbors[2] = {neighbors[DIR_TOP], neighbors[DIR_BOTTOM]};
	int xNeighbors[2] = {neighbors[DIR_LEFT], neighbors[DIR_RIGHT]};
	int2 order = make_int2(topIndex->x % 2, topIndex->y % 2);
	double transferTime;

	// Populate the block's side edges
	CopyDevSideEdgesFromBlock(devBlocks[0], devSideEdges, domSize, neighbors, copyStream);

	// Exchange data with the top and bottom neighbors
	transferTime = MPI_Wtime();
	ExchangeHalos(cartComm, devSendLines[  order.y  ], hostSendLines[0], hostRecvLines[0], devRecvLines[  order.y  ], yNeighbors[  order.y  ], domSize->x);
	ExchangeHalos(cartComm, devSendLines[1 - order.y], hostSendLines[0], hostRecvLines[0], devRecvLines[1 - order.y], yNeighbors[1 - order.y], domSize->x);
	SafeCudaCall(cudaStreamSynchronize(copyStream));
	
	// Exchange data with the left and right neighbors
	ExchangeHalos(cartComm, devSideEdges[  order.x  ], hostSendLines[1], hostRecvLines[1], devHaloLines[  order.x  ], xNeighbors[  order.x  ], domSize->y);
	ExchangeHalos(cartComm, devSideEdges[1 - order.x], hostSendLines[1], hostRecvLines[1], devHaloLines[1 - order.x], xNeighbors[1 - order.x], domSize->y); 
	transferTime = MPI_Wtime() - transferTime;

	// Copy the received halos to the device block
	CopyDevHalosToBlock(devBlocks[0], devHaloLines[0], devHaloLines[1], domSize, neighbors);

	return transferTime;
}
Developer: Haider-BA, Project: Matlab2CPP, Lines of code: 44, Source file: Host.c

Example 10: forward_process_synchronized

void forward_process_synchronized(struct ixmapfwd_thread *thread,
	unsigned int port_index, struct ixmap_packet *packet,
	unsigned int num_packets, struct ixmap_packet_cuda *result, uint8_t *read_buf)
{
	int fd, i;

	cudaStreamSynchronize(thread->stream);

	for(i = 0; i < num_packets; i++){
		if(result[i].outif >= 0){
			ixmap_tx_assign(thread->plane, result[i].outif,
				thread->buf, &packet[i]);
		}else if(result[i].outif == -1){
			goto packet_drop;
		}else{
			goto packet_inject;
		}

		continue;
packet_inject:
		memcpy(read_buf, packet[i].slot_buf, packet[i].slot_size);
		fd = thread->tun_plane->ports[port_index].fd;
		write(fd, read_buf, packet[i].slot_size);
packet_drop:
		ixmap_slot_release(thread->buf, packet[i].slot_index);
	}
	return;
}
Developer: edenden, Project: ixmap-cuda, Lines of code: 28, Source file: forward.c

Example 11: CUDA_CHECK

void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;  // created below as a non-blocking CUDA stream
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {  // keep loading batches until asked to stop
      Batch<Dtype>* batch = prefetch_free_.pop();  // grab a free batch from the pool
      load_batch(batch);  // fill the batch with data
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        if (this->output_labels_) {
          batch->label_.data().get()->async_gpu_push(stream);
        }
        CUDA_CHECK(cudaStreamSynchronize(stream));  // wait for the async GPU pushes to complete
      }
#endif
      prefetch_full_.push(batch);  // enqueue the loaded batch
    }
  } catch (boost::thread_interrupted&) {  // interruption exits the while loop
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));  // destroy the CUDA stream
  }
#endif
}
Developer: huanyii, Project: caffe_read, Lines of code: 32, Source file: base_data_layer.cpp

Example 12: LAMA_ASSERT_DEBUG

void CUDABLAS1::scal( IndexType n, const double alpha, double* x_d, const IndexType incx, SyncToken* syncToken )
{
    LAMA_CHECK_CUDA_ACCESS

    cudaStream_t stream = NULL;

    if ( syncToken )
    {
        CUDAStreamSyncToken* cudaStreamSyncToken = dynamic_cast<CUDAStreamSyncToken*>( syncToken );
        LAMA_ASSERT_DEBUG( cudaStreamSyncToken, "no cuda stream sync token provided" )
        stream = cudaStreamSyncToken->getCUDAStream();
    }

    cublasSetKernelStream( stream );
    LAMA_CHECK_CUBLAS_ERROR

    cublasDscal( n, alpha, x_d, incx );

    // No error check here possible as kernel is started asynchronously

    if ( !syncToken )
    {
        cudaStreamSynchronize( 0 );
        LAMA_CHECK_CUDA_ERROR
    }
}
Developer: fast-project, Project: lama, Lines of code: 25, Source file: CUDABLAS1.cpp

Example 13: advance_generations

/* 
 * Advance the simulation by <n> generations by mapping the OpenGL pixel buffer
 * objects for writing from CUDA, executing the kernel <n> times, and unmapping
 * the pixel buffer object.
 */
void advance_generations(unsigned long n)
{
	uint8_t* device_bufs[2];
	size_t size;

	DEBUG2("Mapping CUDA resources and retrieving device buffer pointers\n");
	cudaGraphicsMapResources(2, cuda_graphics_resources, (cudaStream_t)0);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[0], &size, 
								cuda_graphics_resources[0]);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[1], &size, 
								cuda_graphics_resources[1]);

	check_cuda_error();

	while (n--) {

		DEBUG2("Launching kernel (grid.width = %u, grid.height = %u)\n",
				grid.width, grid.height);

		launch_kernel(device_bufs[grid.which_buf], device_bufs[!grid.which_buf], 
									grid.width, grid.height);

		grid.which_buf ^= 1;
	}

	DEBUG2("Unmapping CUDA resources\n");

	cudaGraphicsUnmapResources(2, cuda_graphics_resources, (cudaStream_t)0);
	cudaStreamSynchronize(0);
}
Developer: anojavan, Project: csinparallel, Lines of code: 37, Source file: cuda.c

Example 14: CHECK

void NCCL<Dtype>::run(int layer) {
  CHECK(solver_->param().layer_wise_reduce());
  vector<shared_ptr<Blob<Dtype> > >& blobs =
    solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
  for (int i = 1; i < blobs.size(); ++i) {
    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
             blobs[i + 0]->gpu_diff());
  }
#endif
  if (blobs.size() > 0) {
    // Make sure default stream is done computing gradients. Could be
    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
    // blocking the default stream, but it's actually slower.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));

    // Reduce asynchronously
    int size = 0;
    for (int i = 0; i < blobs.size(); ++i) {
      size += blobs[i]->count();
    }
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
                             blobs[0]->mutable_gpu_diff(),
                             size,
                             nccl::dataType<Dtype>::type,
                             ncclSum, comm_, stream_));
    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
                   blobs[0]->mutable_gpu_diff(), stream_);
  }
}
Developer: 20337112, Project: caffe, Lines of code: 34, Source file: parallel.cpp
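
The comment in Example 14 notes that the blocking cudaStreamSynchronize(cudaStreamDefault) could be replaced by cudaEventRecord plus cudaStreamWaitEvent, which orders the streams on the device without stalling the host thread. The following is a minimal sketch of that event-based alternative, assuming a caller-provided reduce_stream; the function and variable names are illustrative and not taken from the Caffe source.

#include <cuda_runtime.h>

// Make work queued later on reduce_stream wait for everything already queued
// on the default stream, without blocking the host thread.
static void wait_for_default_stream(cudaStream_t reduce_stream)
{
    cudaEvent_t grads_ready;
    cudaEventCreateWithFlags(&grads_ready, cudaEventDisableTiming);

    // Mark the point in the default stream after which the gradients are complete.
    cudaEventRecord(grads_ready, cudaStreamDefault);

    // Work enqueued on reduce_stream after this call starts only once the event fires.
    cudaStreamWaitEvent(reduce_stream, grads_ready, 0);

    cudaEventDestroy(grads_ready);
}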

Example 15: while

void KMeansController<Dtype>::PostProcess() {
    /** Use the cluster center calculated to get cluster labels**/

    int* label_data = this->int_outputs_[0]->mutable_cpu_data();

    // first restart the data provider.
    this->data_providers_[0]->ForceRestart();

    // assign cluster labels
    bool assignment_finished = false;
    while (!assignment_finished) {
        size_t batch_num = 0;
        size_t index = this->data_providers_[0]->current_index();

        // getting current index must precede GetData()
        // otherwise the current index in the data provider would be advanced
        this->mats_[0] = this->data_providers_[0]->GetData(batch_num);
        if ( this->data_providers_[0]->current_index()  == 0) {
            assignment_finished = true;
        }
        //execute only the maximization functions for all samples
        this->function_input_vecs_[0][0] = this->mats_[0].get();
        this->funcs_[0]->Execute(this->function_input_vecs_[0], this->function_output_vecs_[0], GKMeans::stream(0));
        CUDA_CHECK(cudaStreamSynchronize(GKMeans::stream(0)));

        // copy out cluster labels
        int* out_data = (int*)this->mats_[2]->cpu_data();
        for (size_t i = 0; i < batch_num; i++) {
            label_data[i + index] = out_data[i];
        }
    }

    // copy out center data
    memcpy(this->numeric_outputs_[0]->mutable_cpu_data(), this->mats_[1]->cpu_data(), this->mats_[1]->count() * sizeof(Dtype));
}
Developer: CUHK-MMLAB, Project: gkmeans, Lines of code: 35, Source file: controllers.cpp


Note: The cudaStreamSynchronize examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution and use should follow each project's license. Do not reproduce without permission.