当前位置: 首页>>代码示例>>C++>>正文


C++ cudaThreadSynchronize函数代码示例

本文整理汇总了 CUDA C++ 中 cudaThreadSynchronize 函数的典型用法代码示例。如果您正苦于以下问题:cudaThreadSynchronize 函数的具体用法是什么?cudaThreadSynchronize 怎么用?有哪些 cudaThreadSynchronize 的使用示例?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。注意:cudaThreadSynchronize 在 CUDA 运行时 API 中已被标记为弃用,新代码应改用功能相同的 cudaDeviceSynchronize。


下文共展示了 cudaThreadSynchronize 函数的 15 个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞,您的评价将有助于系统推荐出更棒的 C++ 代码示例。

示例1: runCalcU

/**
 * Runs nSteps simulation steps on the device and returns the particle state.
 *
 * Each step launches, in order, runCalcU, fillGloabalChangable, runOneStep and
 * runApplyDeltas, waiting for the device after every stage, and then advances
 * chPar->time by chPar->dTimeCurrent. When the field type is
 * ConstParams::ROTATING, the external field components hExtX / hExtY are
 * recomputed from the current time before the changeable parameters are
 * pushed with fillGloabalChangable.
 *
 * @param nSteps number of steps to perform; values <= 0 skip the loop and
 *               only copy the current device state back.
 * @return partPos after x, y, z, theta and phy have been copied back from
 *         devMatrixes (cPar->nPart floats each).
 */
Fflcun2::ParticlesPosition Fflcun2::run(int nSteps) {
	for (int step = 0; step < nSteps; ++step) {
		runCalcU(blocks, threads, devMatrixes);
		cudaThreadSynchronize();

		if (cPar->ft == ConstParams::ROTATING) {
			// Both components share the same argument 2*pi*rff*time.
			const double phase = 2 * M_PI * cPar->rff * chPar->time;
			chPar->hExtX = cPar->hExtXInit * sin(phase);
			chPar->hExtY = cPar->hExtYInit * cos(phase);
		}
		fillGloabalChangable(chPar);
		cudaThreadSynchronize();

		runOneStep(blocks, threads, devMatrixes);
		cudaThreadSynchronize();

		runApplyDeltas(blocks, threads, devMatrixes);
		cudaThreadSynchronize();

		chPar->time += chPar->dTimeCurrent;
	}

	// Every per-particle array holds nPart floats.
	const size_t arrayBytes = sizeof(float) * cPar->nPart;

	cudaMemcpy(partPos.x,     devMatrixes.x,     arrayBytes, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.y,     devMatrixes.y,     arrayBytes, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.z,     devMatrixes.z,     arrayBytes, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.theta, devMatrixes.theta, arrayBytes, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.phy,   devMatrixes.phy,   arrayBytes, cudaMemcpyDeviceToHost);

	return partPos;
}
开发者ID:psci2195,项目名称:fflcu,代码行数:34,代码来源:fflcun2.cpp

示例2: benchmark

/**
 * Times repeated runs of gaussianFilterRGBA on the current image.
 *
 * Allocates a temporary device result buffer of width * height pixels, runs
 * the filter once as a warm-up, then measures `iterations` back-to-back runs
 * with the CUTIL timer `timer` and prints the elapsed time and throughput.
 *
 * @param iterations number of timed filter invocations.
 */
void
benchmark(int iterations) 
{
    // allocate memory for result; compute the size in size_t so that large
    // images do not wrap around in 32-bit arithmetic
    unsigned int *d_result;
    const size_t size = (size_t) width * (size_t) height * sizeof(unsigned int);
    cutilSafeCall( cudaMalloc( (void**) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    // make sure the warm-up has finished before the timer starts
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStartTimer( timer));

    // execute the kernel
    for(int i=0; i<iterations; i++) {
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    }

    // wait for all queued work so the timer covers the actual execution
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStopTimer( timer));

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    // read the stopped timer once and reuse the value for both lines
    const float elapsedMs = cutGetTimerValue( timer);
    printf("Processing time: %f (ms)\n", elapsedMs);

    // The pixel count is accumulated in double: width*height*iterations
    // overflows 32-bit integers for large images or many iterations.
    const double totalPixels = (double) width * (double) height * (double) iterations;
    printf("%.2f Mpixels/sec\n", (totalPixels / (elapsedMs / 1000.0)) / 1e6);

    cutilSafeCall(cudaFree(d_result));
}
开发者ID:AnkurAnandapu,项目名称:ocelot-fork,代码行数:30,代码来源:recursiveGaussian-host.cpp

示例3: device_X

/**
 * Trains the radial basis function network.
 *
 * The work is done in three timed phases whose clock() tick counts are stored
 * in times[0..2]:
 *   0. KMeans on the input produces dCenters (network_size requested);
 *   1. AdjustWidths(number_neighbours) adjusts the widths using the mean
 *      distance to the neighbours;
 *   2. the activation matrix is built, its pseudoinverse is taken and
 *      multiplied with the one-hot target matrix to give dWeights.
 *
 * @param Input  training samples (host matrix); a DeviceMatrix is built from it.
 * @param Target host matrix whose column 0 holds the class label of each row;
 *               labels are used as 1-based column indices into a
 *               Target.Rows() x NumClasses matrix.
 */
void RadialBasisFunction::Train(HostMatrix<float> &Input, HostMatrix<float> &Target){

	DeviceMatrix<float> dInput(Input);

	// Phase 0: centers from k-means.
	clock_t phaseStart = clock();
	KMeans kmeans;
	kmeans.SetSeed(seed);
	dCenters = kmeans.Execute(dInput,network_size);

	cudaThreadSynchronize();
	times[0] = (clock() - phaseStart);

	// Phase 1: adjust widths using the mean distance to the neighbours.
	phaseStart = clock();
	AdjustWidths(number_neighbours);

	cudaThreadSynchronize();
	times[1] = (clock() - phaseStart);

	// Build the one-hot target matrix: row i gets a 1 in column (label - 1).
	HostMatrix<float> oneHot(Target.Rows(),NumClasses);
	memset(oneHot.Pointer(),0,sizeof(float)*oneHot.Elements());

	for(int row = 0; row < Target.Rows(); row++){
		const int classIndex = (int)Target(row,0) - 1;
		oneHot(row,classIndex) = 1;
	}

	DeviceMatrix<float> dOneHot(oneHot);

	// Phase 2: weights = pseudoinverse(activation matrix) * one-hot targets.
	phaseStart = clock();

	DeviceMatrix<float> dActivation(dInput.Rows(),dCenters.Rows(),ColumnMajor);

	KernelActivationMatrix(dActivation.Pointer(),
	                       dInput.Pointer(),
	                       dCenters.Pointer(),
	                       dInput.Columns(),
	                       dCenters.Columns(),
	                       dActivation.Columns(),
	                       dActivation.Rows(),
	                       scaling_factor,
	                       device_c_width.Pointer());

	DeviceMatrix<float> dPseudoInv = UTILS::pseudoinverse(dActivation);

	dWeights = DeviceMatrix<float>(dPseudoInv.Rows(),dOneHot.Columns());

	dPseudoInv.Multiply(dPseudoInv,dOneHot,dWeights);

	// Wait for the device before stopping the clock for this phase.
	cudaThreadSynchronize();
	times[2] = (clock() - phaseStart);

}
开发者ID:kallaballa,项目名称:Neurocid,代码行数:60,代码来源:RadialBasisFunction.cpp

示例4: cudaGraphicsMapResources

//-----------------------------------------------------------------------------
// Fills the CUDA-mapped pixel buffer object with the current image and all
// active overlays. If output is non-NULL the finished buffer content is also
// copied into *output.
//
// The PBO is mapped for the duration of the call and unmapped before return.
// When mapping the resource or querying its device pointer fails, nothing is
// written and the function returns early instead of launching copies into an
// invalid pointer.
void QGLImageGpuWidget::fillPbo(iu::ImageGpu_8u_C4* output)
{
  // map GL <-> CUDA resource
  uchar4 *d_dst = NULL;
  size_t mapped_bytes = 0; // size of the mapped buffer as reported by CUDA (unused otherwise)
  if (cudaGraphicsMapResources(1, &cuda_pbo_resource_, 0) != cudaSuccess)
    return;

  if (cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &mapped_bytes, cuda_pbo_resource_) != cudaSuccess
      || d_dst == NULL)
  {
    // the resource was mapped successfully above, so it must be released again
    cudaGraphicsUnmapResources(1, &cuda_pbo_resource_, 0);
    return;
  }

  // get image data
  iuprivate::cuCopyImageToPbo(image_, num_channels_, bit_depth_, d_dst, min_, max_);
  cudaThreadSynchronize();

  // get overlays
  for (iuprivate::OverlayList::iterator it = overlay_list_.begin(); it != overlay_list_.end(); ++it)
  {
    if ((*it)->isActive())
      cuCopyOverlayToPbo((*it), d_dst, image_->size());
  }
  cudaThreadSynchronize();

  if (output != NULL)
  {
    // copy final pbo to output; temp wraps d_dst with a pitch of one
    // tightly packed uchar4 row
    iu::ImageGpu_8u_C4 temp(d_dst, image_->width(), image_->height(),
                            image_->width()*sizeof(uchar4), true);
    iu::copy(&temp, output);
  }

  // unmap GL <-> CUDA resource
  cudaGraphicsUnmapResources(1, &cuda_pbo_resource_, 0);
}
开发者ID:ankurhanda,项目名称:imageutilities,代码行数:31,代码来源:qglimagegpuwidget.cpp

示例5: cudaThreadSynchronize

// Destructor: releases the CUDA/OpenGL interop resource and then the GL buffer.
// Order of operations: wait for the device, call unmap(), unregister
// _cudaResource from CUDA, wait for the device again, and only then delete
// the OpenGL buffer object _vbo.
// Return codes of the CUDA calls are not inspected here.
OsdCudaGLVertexBuffer::~OsdCudaGLVertexBuffer() {

    cudaThreadSynchronize();
    unmap();
    cudaGraphicsUnregisterResource(_cudaResource);
    cudaThreadSynchronize();
    glDeleteBuffers(1, &_vbo);
}
开发者ID:chrislowe5,项目名称:OpenSubdiv-dev,代码行数:8,代码来源:cudaGLVertexBuffer.cpp

示例6: main

int main(int argc, char** argv)
{

	float fTotalTime = 0;

// 	int TARGET_WIDTH=atoi(argv[2]);
// 	int TARGET_HEIGHT=atoi(argv[3]);
// 	bool visualize_results=atoi(argv[4]);
// 	unsigned int kernel_size=atoi(argv[2]);
 	int gpuNr=atoi(argv[2]);
 	checkCudaErrors(cudaSetDevice(gpuNr));

	
	IplImage* gray_image = cvLoadImage(argv[1],CV_LOAD_IMAGE_GRAYSCALE);
	unsigned char * d_input_image;
	unsigned char * d_output_image;
	int widthImage=gray_image->width;
	int heightImage=gray_image->height;
	
	IplImage *output_image = cvCreateImage(cvSize(widthImage,heightImage), IPL_DEPTH_8U, 1);
	for( int i=0;i<heightImage;i++)
	  for( int j=0;j<widthImage;j++)
	    output_image->imageData[i*widthImage+j]=255;
	
	  	  unsigned int * d_histogram;
	int total_threads=256;	  
	  cudaMalloc(&d_histogram,sizeof(unsigned int)*256*total_threads);
	checkCudaErrors(cudaMalloc(&d_input_image,widthImage*heightImage*sizeof(unsigned char)));  
	checkCudaErrors(cudaMalloc(&d_output_image,widthImage*heightImage*sizeof(unsigned char)));
	checkCudaErrors(cudaMemcpy(d_input_image,gray_image->imageData,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyHostToDevice));
	unsigned int windows_array[4]={15,17,25,31};
	int total_implementations=4;
	double elapsed_time;
	for (int i=1;i<=total_implementations;i++)
	{
	  for( int j=0;j<4;j++)
	  {
	timer my_timer;  
	MedianFilterUcharCUDA(d_input_image,d_output_image,d_histogram,widthImage,heightImage,windows_array[j],16,16,i);
	cudaThreadSynchronize();
	elapsed_time=my_timer.elapsed();
	printf("elapsed_time for implementation %d for window size %d was %f \n",i,windows_array[j],elapsed_time);
	  }
	}
	timer array_timer;
	arrayFireRows(d_input_image,d_output_image,widthImage,heightImage,3,16,16);
		cudaThreadSynchronize();
	elapsed_time=array_timer.elapsed();
	printf("elapsed_time for array fire was %f \n",elapsed_time);
	checkCudaErrors(cudaMemcpy(output_image->imageData,d_output_image,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyDeviceToHost));
// 	_medianfilter((unsigned char *)gray_image->imageData, (unsigned char *)output_image->imageData, widthImage, heightImage);
	
	cvSaveImage("output.jpg",output_image);
	



}
开发者ID:aglenis,项目名称:gpu_medfilter,代码行数:58,代码来源:main_large_windows.cpp

示例7: checkCudaErrors

/**
 * Sets every byte of a parameter buffer to byteVal.
 *
 * The buffer spans pmem.totalSize floats starting at pmem.base. When
 * pmem.device is set the fill is done with cudaMemset, bracketed by device
 * synchronizations; otherwise a plain memset is used.
 *
 * @param pmem    descriptor of the buffer to fill (base, totalSize, device).
 * @param byteVal value written to each byte (memset semantics, not a float value).
 */
void
CudaInterface::fillParamMem(ParamMem_t& pmem, int byteVal) {
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));

    const size_t byteCount = pmem.totalSize * sizeof(float);
    std::cout << "  setting " << byteCount << " bytes to " <<  pmem.base << "\n";

    // Host-resident buffer: fill directly and leave.
    if (!pmem.device) {
        memset(pmem.base, byteVal, byteCount);
        return;
    }

    // Device-resident buffer.
    checkCudaErrors(cudaThreadSynchronize());
    checkCudaErrors(cudaMemset(pmem.base, byteVal, byteCount));
    checkCudaErrors(cudaThreadSynchronize());
}
开发者ID:ethancaballero,项目名称:Multiplicative-Recursive-Neural-Net,代码行数:12,代码来源:host-device_interface.cpp

示例8: time_ongpu

/*
 * Times 10 calls of gemm_ongpu on random matrices and prints the elapsed
 * time and the achieved GFLOPS.
 *
 * TA, TB  non-zero to treat A / B as transposed (selects the leading dimensions)
 * m, k, n matrix sizes: A holds m*k elements, B k*n, C m*n
 *
 * The device is synchronized after every gemm call, so each iteration is
 * complete before the next one starts and before the clock is read.
 */
void time_ongpu(int TA, int TB, int m, int k, int n)
{
    int iter = 10;
    int run;
    float *host_a = random_matrix(m,k);
    float *host_b = random_matrix(k,n);

    /* leading dimensions depend on whether the operand is transposed */
    int lda = TA ? m : k;
    int ldb = TB ? k : n;

    float *host_c = random_matrix(m,n);

    float *dev_a = cuda_make_array(host_a, m*k);
    float *dev_b = cuda_make_array(host_b, k*n);
    float *dev_c = cuda_make_array(host_c, m*n);

    clock_t start = clock(), end;
    for(run = 0; run < iter; ++run){
        gemm_ongpu(TA,TB,m,n,k,1,dev_a,lda,dev_b,ldb,1,dev_c,n);
        cudaThreadSynchronize();
    }

    /* operation count used for the GFLOPS figure: m*n*(2k+2) per call */
    double flop = ((double)m)*n*(2.*k + 2.)*iter;
    double gflop = flop/pow(10., 9);
    end = clock();
    double seconds = sec(end-start);

    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);

    cuda_free(dev_a);
    cuda_free(dev_b);
    cuda_free(dev_c);
    free(host_a);
    free(host_b);
    free(host_c);
}
开发者ID:hyperchris,项目名称:Yolo,代码行数:33,代码来源:gemm.c

示例9: cudaGetDeviceProperties

void Fflcun2::setConfig(std::string fname) {	//WARNING - test it
	this->freeMemory();
	cPar = new ConstParams;
	chPar = new ChangableParams;
	configFileName = fname;
	this->setSettings();

	cudaDeviceProp deviceProp;
	cudaGetDeviceProperties(&deviceProp, 0);
	//TODO - throw exception if no device found

	blocks = ceil((float)cPar->nPart / SHARED_ARRAY);
	blocks += blocks % deviceProp.multiProcessorCount;
	threads = cPar->nPart / blocks;
	cPar->nPart = threads * blocks;
	std::cout << "After correction number of particles = " << cPar->nPart << std::endl;

	srand(time(NULL));

	this->allocMemory();
	this->initDevMatrixes();

	fillGloabalChangable(chPar);
	fillGloabalConstant(cPar);

	runSetupKernel(blocks, threads, devMatrixes);
	cudaThreadSynchronize();
}
开发者ID:psci2195,项目名称:fflcu,代码行数:28,代码来源:fflcun2.cpp

示例10: cufftDestroy

void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils)
{
  if (!gpuMemAllocated)
    return;

  cufftDestroy(fft_plan);
  // Destroy the cuFFT plan.
  if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
    printf("error at thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError()));
  freeLookupTable();
  
  freeTotalDeviceMemory(data_indices_d,data_sorted_d,crds_d,gdata_d,sectors_d,sector_centers_d,NULL);//NULL as stop
  
  if (n_coils > 1 && deapo_d != NULL)
    cudaFree(deapo_d);
  
  if (this->applySensData())
    cudaFree(sens_d);
  
  if (this->applyDensComp())
    cudaFree(density_comp_d);

  showMemoryInfo();
  gpuMemAllocated = false;
}
开发者ID:davidssmith,项目名称:TRON,代码行数:25,代码来源:gpuNUFFT_operator.cpp

示例11: mpla_generic_dgemv

/**
 * Distributed matrix-vector product b = A * x with a caller-supplied
 * local compute core.
 *
 * Steps: x is redistributed into a temporary vector, mpla_dgemv_core computes
 * the local contribution into b, and the contributions are summed with
 * MPI_Allreduce over a sub-communicator obtained from MPI_Cart_sub with
 * remain_dims = {0, 1}. The summed values are copied back into b->data.
 *
 * @param b               result vector; b->data is overwritten with the reduced sum.
 * @param A               generic matrix handed through to the core.
 * @param x               input vector.
 * @param mpla_dgemv_core callback performing the local product (b, A, x_redist, instance).
 * @param instance        library instance providing the communicator instance->comm.
 */
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x, void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*), struct mpla_instance* instance)
{
	// temporary vector that receives the redistributed copy of x
	struct mpla_vector x_redist;
	mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count);

	// row-block parallel distribution -> column-block parallel distribution
	mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance);

	// local matrix-vector product supplied by the caller
	mpla_dgemv_core(b, A, &x_redist, instance);

	// sub-communicator for each process row: keep only the second grid dimension
	int remain_dims[2] = {0, 1};
	MPI_Comm row_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &row_comm);

	// sum the block row results into a scratch device buffer, then copy back
	const size_t reduced_bytes = sizeof(double)*b->cur_proc_row_count;
	double* reduced;
	cudaMalloc((void**)&reduced, reduced_bytes);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allreduce(b->data, reduced, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm);
	cudaMemcpy(b->data, reduced, reduced_bytes, cudaMemcpyDeviceToDevice);

	// cleanup
	cudaFree(reduced);
	mpla_free_vector(&x_redist, instance);

	MPI_Comm_free(&row_comm);
}
开发者ID:zaspel,项目名称:MPLA,代码行数:33,代码来源:mpla.cpp

示例12: cudaLaunch

/*
 * Interposed cudaLaunch: logs a timestamp, forwards the call to the next
 * cudaLaunch symbol found with dlsym(RTLD_NEXT, ...), waits for the device,
 * and logs a second timestamp.
 *
 * entry  kernel entry handle, passed through unchanged.
 *
 * Returns the status of the forwarded call, or
 * cudaErrorSharedObjectSymbolNotFound when the real symbol cannot be resolved.
 */
cudaError_t cudaLaunch(const void *entry)
{
	/* resolved once and cached across calls */
	static cudaError_t (*nv_cudaLaunch)(const char *) = NULL;
	cudaError_t status;
	struct timeval now;

	if(nv_cudaLaunch == NULL)
		nv_cudaLaunch = dlsym(RTLD_NEXT, "cudaLaunch");

	if(nv_cudaLaunch == NULL) {
		fprintf(stderr, "failed to find symbol cudaLaunch: %s\n", dlerror());
		return cudaErrorSharedObjectSymbolNotFound;
	}

	gettimeofday(&now, NULL);
	printf("[gvm] %lf intercepting cudaLaunch\n", now.tv_sec + now.tv_usec / 1000000.0);

	status = nv_cudaLaunch(entry);

	/* block until the device is idle so the second timestamp follows execution */
	cudaThreadSynchronize();

	gettimeofday(&now, NULL);
	printf("[gvm] %lf intercepted cudaLaunch\n", now.tv_sec + now.tv_usec / 1000000.0);

	return status;
}
开发者ID:arnaudperin,项目名称:gpudb,代码行数:27,代码来源:intercept.c

示例13: synchronize

 /**
  * Optionally block until the device has finished its outstanding work.
  *
  * Compiles to a no-op when ENABLE_CUDA is not defined.
  *
  * @param sync True to synchronize, false to return immediately.
  */
 inline void synchronize(const bool sync = true) {
   #ifdef ENABLE_CUDA
   if (!sync) {
     return;
   }
   CUDA_CHECKED_CALL(cudaThreadSynchronize());
   #endif
 }
开发者ID:JohannesBuchner,项目名称:LibBi,代码行数:12,代码来源:cuda.hpp

示例14: mvReductArraysToHost

/*
 * Copies reduct_bytes bytes from the device buffer OP_reduct_d to the host
 * buffer OP_reduct_h and then waits for the device.
 *
 * reduct_bytes  number of bytes to transfer.
 */
void
mvReductArraysToHost ( int reduct_bytes )
{
  const size_t nbytes = ( size_t ) reduct_bytes;

  cutilSafeCall ( cudaMemcpy ( OP_reduct_h, OP_reduct_d, nbytes, cudaMemcpyDeviceToHost ) );
  cutilSafeCall ( cudaThreadSynchronize (  ) );
}
开发者ID:doru1004,项目名称:OP2-Common,代码行数:7,代码来源:op_cuda_rt_support.c

示例15: mvConstArraysToDevice

/*
 * Copies consts_bytes bytes from the host buffer OP_consts_h to the device
 * buffer OP_consts_d and then waits for the device.
 *
 * consts_bytes  number of bytes to transfer.
 */
void
mvConstArraysToDevice ( int consts_bytes )
{
  const size_t nbytes = ( size_t ) consts_bytes;

  cutilSafeCall ( cudaMemcpy ( OP_consts_d, OP_consts_h, nbytes, cudaMemcpyHostToDevice ) );
  cutilSafeCall ( cudaThreadSynchronize (  ) );
}
开发者ID:doru1004,项目名称:OP2-Common,代码行数:7,代码来源:op_cuda_rt_support.c


注:本文中的cudaThreadSynchronize函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。