

C++ MPI_Reduce Function Code Examples

This article collects and summarizes typical usage examples of the MPI_Reduce function in C++. If you are struggling with questions such as how exactly to call MPI_Reduce, what its arguments mean, or what real-world uses of MPI_Reduce look like, the hand-picked code examples below should help.


A total of 15 MPI_Reduce code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C++ code examples.
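Before the full examples, here is a minimal, self-contained sketch of the call pattern they all share: every rank passes a local value as the send buffer, and the result of the reduction operation is delivered only to the root rank (rank 0 below). The program and the quantity being summed are illustrative and not taken from any of the projects listed below.

#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, local, total = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* every rank contributes one value; here simply its own rank number */
    local = rank;

    /* sum the contributions of all ranks; only rank 0 receives the result */
    MPI_Reduce(&local, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0)
        printf("Sum of all ranks = %d\n", total);

    MPI_Finalize();
    return 0;
}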

Example 1: main

/*
 * This test attempts collective communication after a process in
 * the communicator has failed.
 */
int main(int argc, char **argv)
{
    int rank, size, i, rc, errclass, toterrs, errs = 0;
    char rbuf[100000];
    char *sendbuf;
    int deadprocs[1] = {1};
    MPI_Group world, newgroup;
    MPI_Comm newcomm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    if (size < 3) {
        fprintf( stderr, "Must run with at least 3 processes\n" );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    MPI_Comm_group(MPI_COMM_WORLD, &world);
    MPI_Group_excl(world, 1, deadprocs, &newgroup);
    MPI_Comm_create_group(MPI_COMM_WORLD, newgroup, 0, &newcomm);

    if (rank == 1) {
        exit(EXIT_FAILURE);
    }

    /* try a small send first */
    sendbuf = (char *)malloc(10*size*sizeof(char));

    if (rank == 0) {
      for (i=0;i<size;i++) {
          strcpy(sendbuf + i*10, "No Errors");
      }
    }

    rc = MPI_Scatter(sendbuf, 10, MPI_CHAR, rbuf, 10, MPI_CHAR, 0, MPI_COMM_WORLD);

#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
    MPI_Error_class(rc, &errclass);
    if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
        fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
        errs++;
    }
#endif

    /* reset the buffers and try a larger scatter */
    free(sendbuf);
    memset(rbuf, 0, sizeof(rbuf));
    sendbuf = (char *)malloc(100000*size*sizeof(char));

    if (rank == 0) {
      for (i=0;i<size;i++) {
          strcpy(sendbuf + i*100000, "No Errors");
      }
    }

    rc = MPI_Scatter(sendbuf, 100000, MPI_CHAR, rbuf, 100000, MPI_CHAR, 0, MPI_COMM_WORLD);

#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
    MPI_Error_class(rc, &errclass);
    if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
        fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
        errs++;
    }
#endif

    rc = MPI_Reduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, 0, newcomm);
    if(rc)
        fprintf(stderr, "Failed to get errors from other processes\n");

    if (rank == 0) {
        if (toterrs) {
            printf( " Found %d errors\n", toterrs );
        }
        else {
            printf( " No Errors\n" );
        }
        fflush(stdout);
    }

    free(sendbuf);

    MPI_Comm_free(&newcomm);
    MPI_Group_free(&newgroup);
    MPI_Group_free(&world);

    MPI_Finalize();

    return 0;
}
Author: adevress, Project: MPICH-BlueGene, Lines: 95, Source: scatter.c

Example 2: main

int main(int argc, char **argv)
{
	int rank;
	int world_size;

	/*
	 *	Initialization
	 */
	int thread_support;
	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
		fprintf(stderr,"MPI_Init_thread failed\n");
		exit(1);
	}
	if (thread_support == MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
	if (thread_support < MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI does not have thread support!\n");

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &world_size);

	starpu_srand48((long int)time(NULL));

	parse_args(rank, argc, argv);

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* We disable sequential consistency in this example */
	starpu_data_set_default_sequential_consistency_flag(0);

	starpu_mpi_init(NULL, NULL, 0);

	STARPU_ASSERT(p*q == world_size);

	starpu_cublas_init();

	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/*
	 * 	Problem Init
	 */

	init_matrix(rank);

	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
                        (int)(allocated_memory/(1024*1024)),
			(int)(allocated_memory_extra/(1024*1024)),
                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));

	display_grid(rank, nblocks);

	TYPE *a_r = NULL;
//	STARPU_PLU(display_data_content)(a_r, size);

	TYPE *x, *y;

	if (check)
	{
		x = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(x);

		y = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			unsigned ind;
			for (ind = 0; ind < size; ind++)
				x[ind] = (TYPE)starpu_drand48();
		}

		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);

		if (rank == 0)
			STARPU_PLU(display_data_content)(a_r, size);

//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
	}

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

	/*
	 * 	Report performance
	 */

	int reduce_ret;
	double min_timing = timing;
	double max_timing = timing;
	double sum_timing = timing;

	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
//......... part of the code omitted here .........
Author: joao-lima, Project: starpu-1.2.0rc2, Lines: 101, Source: plu_example.c

Example 3: main

int main(int argc, char** argv)
{
       	int rank;
	int size;
	MPI_Init(&argc, &argv);
	
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
        
	if(argc != 5) {
          fprintf(stderr, "argc %d\n", argc);
          fprintf(stderr, 
              "Usage: %s <left> <right> <numSyncFiles> <tolerance>\n"
              "where <left> and <right> are different"
              "N-procs_case directories\n", argv[0]);
          MPI_Finalize();
          return 1;
        }
	char* lpath = argv[1];
	char* rpath = argv[2];
        int nSyncFiles = atoi(argv[3]);
        double tolerance = atof(argv[4]);

	int ndof;
	int nshg;
	int solsize;
	double* solution;

	std::set<int>* l_timesteps = find_timesteps(lpath, nSyncFiles);
	std::set<int>* r_timesteps = find_timesteps(rpath, nSyncFiles);
	std::set<int>* timesteps_to_check = new std::set<int>;
	std::set_intersection(l_timesteps->begin(), l_timesteps->end(),
			r_timesteps->begin(), r_timesteps->end(),
			std::inserter(*timesteps_to_check, timesteps_to_check->begin()));
        delete l_timesteps;
        delete r_timesteps;
	if(rank == 0)
		printf("Found %d common timesteps\n",
			       	timesteps_to_check->size());
#ifdef DBGONLY
	read_solution(&solution, &solsize, &nshg, &ndof, 
            size, rank, 0, nSyncFiles, "./");
	printf("nshg: %d, ndof: %d\n", nshg, ndof);
	assert(solsize == ndof*nshg);
#endif
	double maxerror = 0.0;
	double error;
	double gblmaxerror;
	for(std::set<int>::iterator i = timesteps_to_check->begin();
			i!=timesteps_to_check->end();i++)
	{
		error = compare_solution(lpath, rpath, *i, size, nSyncFiles);
		if(error>maxerror) maxerror = error;
	}
        delete timesteps_to_check;
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Reduce(&maxerror, &gblmaxerror, 1, MPI_DOUBLE, MPI_MAX, 0,
		MPI_COMM_WORLD);
	if(rank == 0) printf("Maximum difference across all timesteps: %e\n", 
			gblmaxerror);
	MPI_Finalize();
        return (gblmaxerror > tolerance);
}
Author: cwsmith, Project: phasta, Lines: 63, Source: checkphasta.cpp

Example 4: test_P3DFFT

void test_P3DFFT(int *n, std::ofstream& results, int decomp, int * dims){

  int nx,ny,nz,procid,nprocs,ndim;
  int istart[3],isize[3],iend[3];
  int fstart[3],fsize[3],fend[3];
  int p3dfft_mem_conf,nrep;
  long int Nlocal,Nglob;
  double factor;
  double l_timers[12]={0},g_timers[12]={0};
  double total_time=0*MPI_Wtime(), setup_time=0;
  // rtime_local is timings on each process and _global is the max reduced to root
  // 0 is the forward FFT time, 1 is the Hadamard multiplication, 2 is the IFFT time, 3 is the sum of 0-2, and 4 is the setup time
  // The communication time is measured by l_timers locally on each process and then reduced to g_timers to the root.
  // the sum of first four elements give the comm time
  unsigned char op_f[4]="fft", op_b[4]="tff";
  int memsize[3];

  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD,&procid);



  nx=n[0]; ny=n[1]; nz=n[2]; ndim=1; nrep=NREP;

  if(decomp==1){
    dims[0] = 1; dims[1] = nprocs;
  }
  if(procid == 0)
    printf("Using processor grid %d x %d\n",dims[0],dims[1]);

  /* Initialize P3DFFT */
  MPI_Barrier(MPI_COMM_WORLD);
  setup_time -= MPI_Wtime(); //Compute Setup Time.
  Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize);
  setup_time +=  MPI_Wtime(); //Compute Setup Time.
  PCOUT<<"done with setup"<<std::endl;

  Cp3dfft_get_dims(istart,iend,isize,1);
  Cp3dfft_get_dims(fstart,fend,fsize,2);
  /* Allocate and initialize */

  double *A; // Input matrix A
  A=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2));
  //B=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2));

  /* Warmup */
  Cp3dfft_ftran_r2c(A,A,op_f);
  Cp3dfft_ftran_r2c(A,A,op_f);
  MPI_Barrier(MPI_COMM_WORLD);
  Cset_timers();

  for (int rep=0; rep<nrep; rep++){
    initialize_p3dfft(A,n);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Forward transform */
    total_time -=  MPI_Wtime();
    Cp3dfft_ftran_r2c(A,A,op_f);
    total_time +=  MPI_Wtime();

    MPI_Barrier(MPI_COMM_WORLD);
  }

  Cget_timers(l_timers);
  Cp3dfft_btran_c2r(A,A,op_b);

  /* Compute Error */
  //PCOUT<<"Done With FFTs computing error"<<std::endl;
  compute_error_p3dfft(A,n);

  /* Gather timing statistics */
  double g_total_time, g_comm_time, g_setup_time;

  MPI_Reduce(&total_time,&g_total_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
  MPI_Reduce(&setup_time,&g_setup_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
  MPI_Reduce(l_timers,g_timers,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);


  g_total_time=g_total_time/nrep;
  g_comm_time=(g_timers[0]+g_timers[1]+g_timers[2]+g_timers[3])/((double) nrep);
  //g_total_time=g_total_time/((double)nrep);
  ptrdiff_t size=n[0];size*=n[1]; size*=n[2];
  double gflops=2.5*size*( log2(n[2]) + log2(n[0])+ log2(n[1]) )/(g_total_time)/1e9;

  if(procid == 0){
    std::cout.precision(4);
    std::cout<<"P3DFFT Size="<<n[0]<<" "<<n[1]<<" "<<n[2]<<std::endl;;
    std::cout<<"0= "<<g_timers[0]<<" 1= "<<g_timers[1]<<" 2= "<<g_timers[2]<<" 3= "<<g_timers[3]<<" 4= "<<g_timers[4]<<std::endl;
    std::cout<<"5= "<<g_timers[5]<<" 6= "<<g_timers[6]<<" 7= "<<g_timers[7]<<" 8= "<<g_timers[8]<<" 9= "<<g_timers[9]<<std::endl;
    std::cout<<"10= "<<g_timers[10]<<" 11= "<<g_timers[11]<<std::endl;
    std::cout<<"\033[1;31m";
    std::cout<<"\t"<<"np"<<"\t"<<"Grid"<<"\t"<<"Total"<<'\t'<<"Comm Time"<<"\t"<<"Setup Time"<<"\t"<<"\t"<<"Reps"<<'\t'<<"GFlops"<<std::endl;
    std::cout<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
    std::cout<<"\033[0m\n"<<std::endl;

    results<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
  }
  /* Free work space */
  fftw_free(A);
//......... part of the code omitted here .........
Author: spiralgen, Project: accfft-scratch, Lines: 101, Source: p3dfft.cpp

Example 5: olsr_main

int olsr_main(int argc, char *argv[])
{
    int i;
    char log[32];

    tw_opt_add(olsr_opts);
    tw_init(&argc, &argv);

#if DEBUG
    sprintf( log, "olsr-log.%ld", g_tw_mynode );

    olsr_event_log = fopen( log, "w+");
    if( olsr_event_log == nullptr )
        tw_error( TW_LOC, "Failed to Open OLSR Event Log file \n");
#endif

    g_tw_mapping = CUSTOM;
    g_tw_custom_initial_mapping = &olsr_custom_mapping;
    g_tw_custom_lp_global_to_local_map = &olsr_mapping_to_lp;

    // nlp_per_pe = OLSR_MAX_NEIGHBORS;// / tw_nnodes();
    //g_tw_lookahead = SA_INTERVAL;

    SA_range_start = nlp_per_pe;

    // Increase nlp_per_pe by nlp_per_pe / OMN
    nlp_per_pe += nlp_per_pe / OLSR_MAX_NEIGHBORS;

    g_tw_events_per_pe =  OLSR_MAX_NEIGHBORS / 2 * nlp_per_pe  + 65536;
    tw_define_lps(nlp_per_pe, sizeof(olsr_msg_data));

    for(i = 0; i < OLSR_END_EVENT; i++)
        g_olsr_event_stats[i] = 0;

    for (i = 0; i < SA_range_start; i++) {
        tw_lp_settype(i, &olsr_lps[0]);
    }

    for (i = SA_range_start; i < nlp_per_pe; i++) {
        tw_lp_settype(i, &olsr_lps[1]);
    }

#if DEBUG
    printf("g_tw_nlp is %llu\n", g_tw_nlp);
#endif

    tw_run();

    if( g_tw_synchronization_protocol != 1 )
    {
        MPI_Reduce( g_olsr_event_stats, g_olsr_root_event_stats, OLSR_END_EVENT, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
    }
    else {
        for (i = 0; i < OLSR_END_EVENT; i++) {
            g_olsr_root_event_stats[i] = g_olsr_event_stats[i];
        }
    }

    if (tw_ismaster()) {
        for( i = 0; i < OLSR_END_EVENT; i++ )
            printf("OLSR Type %s Event Count = %llu \n", event_names[i], g_olsr_root_event_stats[i]);
        printf("Complete.\n");
    }

    tw_end();

    return 0;
}
Author: laprej, Project: ROSS-Models, Lines: 68, Source: olsr-main.cpp

Example 6: main


//......... part of the code omitted here .........
    //printf("count = %lld\n", count);

    if (count != nitem) {
      fprintf( stderr, "%d: Wrong count (%lld) on write\n", rank, count );
      fflush(stderr);
      /* exit */
      MPI_Finalize();
      exit(1);
    }

    MPI_File_close(&fh);
    MPI_Barrier(MPI_COMM_WORLD);
    if(rank == 0) printf("File is written\n\n");
  }

  double *tmp = (double *)malloc( nitem * sizeof(double) );
  memset (tmp, 0, nitem*sizeof(double));

  if(use_normalsto == 1) {
      MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh );
      /* Read nothing (check status) */
      memset( &status, 0xff, sizeof(MPI_Status) );
      
      offset = rank * nitem * type_size;

      /* start I/O */
      stime = MPI_Wtime();
      MPI_File_read_at(fh, offset, tmp, nitem, MPI_DOUBLE, &status);
      etime = MPI_Wtime();
      /* end I/O */
      iotime = etime - stime;
      
      if(_debug==1) printf("%d: iotime = %10.4f\n", rank, iotime);
      MPI_Reduce(&iotime, &max_iotime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
      
      sum = 0.0; /* reset sum */
      
      /* start computation */
      stime = MPI_Wtime();
      
      for(i=0; i<nitem; i++) {
          sum += tmp[i];
      }
      
      MPI_Reduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
      etime = MPI_Wtime();
      /* end computation */

      comptime = etime - stime;

      if(_debug==1) printf("%d: comptime = %10.4f\n", rank, comptime);
      
      MPI_Reduce(&comptime, &max_comptime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

      if(rank == 0) {
          elapsed_time = max_comptime + max_iotime;
          printf("<<Result (SUM) with normal read>>\n"
                 "SUM              = %10.4f \n"
                 "Computation time = %10.4f sec\n"
                 "I/O time         = %10.4f sec\n"
                 "total time       = %10.4f sec\n\n", 
                 global_sum, max_comptime, max_iotime, elapsed_time);
      }
      
      MPI_File_close(&fh);
  }
Author: kandycs, Project: PVFS-AS, Lines: 67, Source: simple.c

Example 7: trainOneEpochDenseCPU

void trainOneEpochDenseCPU(int itask, float *data, float *numerator,
                           float *denominator, float *codebook,
                           unsigned int nSomX, unsigned int nSomY,
                           unsigned int nDimensions, unsigned int nVectors,
                           unsigned int nVectorsPerRank, float radius,
                           float scale, string mapType, string gridType,
                           bool compact_support, bool gaussian, int *globalBmus) {
    unsigned int p1[2] = {0, 0};
    unsigned int *bmus = new unsigned int[nVectorsPerRank * 2];
#ifdef _OPENMP
    #pragma omp parallel default(shared) private(p1)
#endif
    {
#ifdef _OPENMP
        #pragma omp for
#endif
#ifdef _WIN32
        for (int n = 0; n < nVectorsPerRank; n++) {
#else
        for (unsigned int n = 0; n < nVectorsPerRank; n++) {
#endif
            if (itask * nVectorsPerRank + n < nVectors) {
                /// get the best matching unit
                get_bmu_coord(codebook, data, nSomY, nSomX,
                              nDimensions, p1, n);
                bmus[2 * n] = p1[0];
                bmus[2 * n + 1] = p1[1];
            }
        }
    }

    float *localNumerator = new float[nSomY * nSomX * nDimensions];
    float *localDenominator = new float[nSomY * nSomX];
#ifdef _OPENMP
    #pragma omp parallel default(shared)
#endif
    {
#ifdef _OPENMP
        #pragma omp for
#endif
#ifdef _WIN32
        for (int som_y = 0; som_y < nSomY; som_y++) {
#else
        for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
#endif
            for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
                localDenominator[som_y * nSomX + som_x] = 0.0;
                for (unsigned int d = 0; d < nDimensions; d++)
                    localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] = 0.0;
            }
        }
        /// Accumulate denoms and numers
#ifdef _OPENMP
        #pragma omp for
#endif
#ifdef _WIN32
        for (int som_y = 0; som_y < nSomY; som_y++) {
#else
        for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
#endif
            for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
                for (unsigned int n = 0; n < nVectorsPerRank; n++) {
                    if (itask * nVectorsPerRank + n < nVectors) {
                        float dist = 0.0f;
                        if (gridType == "rectangular") {
                            if (mapType == "planar") {
                                dist = euclideanDistanceOnPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]);
                            }
                            else if (mapType == "toroid") {
                                dist = euclideanDistanceOnToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY);
                            }
                        }
                        else {
                            if (mapType == "planar") {
                                dist = euclideanDistanceOnHexagonalPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]);
                            }
                            else if (mapType == "toroid") {
                                dist = euclideanDistanceOnHexagonalToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY);
                            }
                        }
                        float neighbor_fuct = getWeight(dist, radius, scale, compact_support, gaussian);

                        for (unsigned int d = 0; d < nDimensions; d++) {
                            localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] +=
                                1.0f * neighbor_fuct
                                * (*(data + n * nDimensions + d));
                        }
                        localDenominator[som_y * nSomX + som_x] += neighbor_fuct;
                    }
                }
            }
        }
    }
#ifdef HAVE_MPI
    MPI_Reduce(localNumerator, numerator,
               nSomY * nSomX * nDimensions, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(localDenominator, denominator,
               nSomY * nSomX, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Gather(bmus, nVectorsPerRank * 2, MPI_INT, globalBmus, nVectorsPerRank * 2, MPI_INT, 0, MPI_COMM_WORLD);

//......... part of the code omitted here .........
Author: xgerrmann, Project: somoclu, Lines: 101, Source: denseCpuKernels.cpp

Example 8: main


//......... part of the code omitted here .........
	MPI_Send(&data, 1, MPI_INT, 0, 0, comm1);

	IF_VERBOSE(("0: sending 2 to process 2.\n"));
	data = 2;
	MPI_Send(&data, 1, MPI_INT, 0, 0, comm2);

	IF_VERBOSE(("0: sending 3 to process 3.\n"));
	data = 3;
	MPI_Send(&data, 1, MPI_INT, 0, 0, comm3);

	IF_VERBOSE(("0: disconnecting.\n"));
	MPI_Comm_disconnect(&comm1);
	MPI_Comm_disconnect(&comm2);
	MPI_Comm_disconnect(&comm3);
    }
    else if (rank == 1)
    {
	IF_VERBOSE(("1: receiving port.\n"));
	MPI_Recv(port1, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);

	IF_VERBOSE(("1: received port1: <%s>\n", port1));
	IF_VERBOSE(("1: connecting.\n"));
	MPI_Comm_connect(port1, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm1);

	MPI_Recv(&data, 1, MPI_INT, 0, 0, comm1, &status);
	if (data != 1)
	{
	    printf("Received %d from root when expecting 1\n", data);
	    fflush(stdout);
	    num_errors++;
	}

	IF_VERBOSE(("1: disconnecting.\n"));
	MPI_Comm_disconnect(&comm1);
    }
    else if (rank == 2)
    {
	IF_VERBOSE(("2: receiving port.\n"));
	MPI_Recv(port2, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);

	IF_VERBOSE(("2: received port2: <%s>\n", port2));
	/* make sure process 1 has time to do the connect before this process 
	   attempts to connect */
	MTestSleep(2);
	IF_VERBOSE(("2: connecting.\n"));
	MPI_Comm_connect(port2, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm2);

	MPI_Recv(&data, 1, MPI_INT, 0, 0, comm2, &status);
	if (data != 2)
	{
	    printf("Received %d from root when expecting 2\n", data);
	    fflush(stdout);
	    num_errors++;
	}

	IF_VERBOSE(("2: disconnecting.\n"));
	MPI_Comm_disconnect(&comm2);
    }
    else if (rank == 3)
    {
	IF_VERBOSE(("3: receiving port.\n"));
	MPI_Recv(port3, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);

	IF_VERBOSE(("2: received port2: <%s>\n", port2));
	/* make sure process 1 and 2 have time to do the connect before this 
	   process attempts to connect */
	MTestSleep(4);
	IF_VERBOSE(("3: connecting.\n"));
	MPI_Comm_connect(port3, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm3);

	MPI_Recv(&data, 1, MPI_INT, 0, 0, comm3, &status);
	if (data != 3)
	{
	    printf("Received %d from root when expecting 3\n", data);
	    fflush(stdout);
	    num_errors++;
	}

	IF_VERBOSE(("3: disconnecting.\n"));
	MPI_Comm_disconnect(&comm3);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Reduce(&num_errors, &total_num_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank == 0)
    {
	if (total_num_errors)
	{
	    printf(" Found %d errors\n", total_num_errors);
	}
	else
	{
	    printf(" No Errors\n");
	}
	fflush(stdout);
    }
    MPI_Finalize();
    return total_num_errors;
}
Author: abhinavvishnu, Project: matex, Lines: 101, Source: multiple_ports2.c

Example 9: main


//......... part of the code omitted here .........
    recvcounts=rdispls=NULL;
    recvcounts = (int *) malloc (numprocs*sizeof(int));
    if(NULL == recvcounts) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }
    
    rdispls = (int *) malloc (numprocs*sizeof(int));
    if(NULL == rdispls) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }

    s_buf1 = r_buf1 = NULL;

    s_buf1 = (char *) malloc(sizeof(char)*max_msg_size + MAX_ALIGNMENT);
    if(NULL == s_buf1) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }
    
    r_buf1 = (char *) malloc(sizeof(char)*max_msg_size * numprocs + MAX_ALIGNMENT);
    if(NULL == r_buf1) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }


    align_size = getpagesize();

    sendbuf = (char *)(((unsigned long) s_buf1 + (align_size - 1)) / align_size
                    * align_size);
    recvbuf = (char *)(((unsigned long) r_buf1 + (align_size - 1)) / align_size
                    * align_size);

    memset(sendbuf, 1, max_msg_size);
    memset(recvbuf, 0, max_msg_size * numprocs);

    for(size=1; size <= max_msg_size; size *= 2) {

        if(size > LARGE_MESSAGE_SIZE) {
            skip = SKIP_LARGE;
            iterations = iterations_large;
        } else {
            skip = SKIP;
            
        }

        MPI_Barrier(MPI_COMM_WORLD);

        disp =0;
        for ( i = 0; i < numprocs; i++) {
            recvcounts[i] = size;
            rdispls[i] = disp;
            disp += size;
        }

        MPI_Barrier(MPI_COMM_WORLD);       
        timer=0.0;
        for(i=0; i < iterations + skip ; i++) {

            t_start = MPI_Wtime();

            MPI_Allgatherv(sendbuf, size, MPI_CHAR, recvbuf, recvcounts, rdispls, MPI_CHAR, MPI_COMM_WORLD);
        
            t_stop = MPI_Wtime();

            if(i >= skip) {
                timer+= t_stop-t_start;
            }
            MPI_Barrier(MPI_COMM_WORLD);
 
        }
        
        MPI_Barrier(MPI_COMM_WORLD);

        latency = (double)(timer * 1e6) / iterations;

        MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0, 
                MPI_COMM_WORLD); 
        MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, 
                MPI_COMM_WORLD); 
        MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0, 
                MPI_COMM_WORLD); 
        avg_time = avg_time/numprocs; 

        print_data(rank, full, size, avg_time, min_time, max_time, iterations);
        MPI_Barrier(MPI_COMM_WORLD);
    }
    
    free(s_buf1);
    free(r_buf1);

    free(recvcounts);
    free(rdispls);

    MPI_Finalize();

    return EXIT_SUCCESS;
}
Author: 01org, Project: opa-mpi-apps, Lines: 101, Source: osu_allgatherv.c

Example 10: main


//......... part of the code omitted here .........
    double *sendbuf = mk_1D_array(nrColon*m, false);
    double *recbuf = mk_1D_array(nrColon*m, false); 


    int *sendcnt = (int *) malloc((size+1) * sizeof(int)); // number of elements to send to each processor
    int *sdispls = (int *) malloc((size+1) * sizeof(int)); // index into sendbuf for each processor

 

    sdispls[0]=0; // processor 0 always starts at index 0
    for(int i = 0;i<size;i++){
        sendcnt[i] = cnt[i]*cnt[rank]; // number of elements I own * number of elements that processor owns
        sdispls[i] = displs[i]*cnt[rank]; // displacement for each processor
    }

    // GRID
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n+1; i++) {
        grid[i] = i * h;
    }




    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n)); //Eigenvalue
      }

  // Initialize the right hand side data 
    
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
        //  b[i][j] = h * h;
            b[i][j] = h * h * func1(grid[i+displs[rank]], grid[j]); // evaluate the right-hand side function, scaled by h*h
        }
    }

    // Calculate Btilde^T = S^-1 * (S * B)^T 
 
    #pragma omp parallel for schedule(guided, 5)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(b[i], &n, z[omp_get_thread_num()], &nn);
    }
    MPItranspose (b, bt,nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);

    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(bt[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Solve Lambda * Xtilde = Btilde

    #pragma omp parallel for schedule(static)

    for (int j=0; j < nrColon; j++) {
       for (int i=0; i < m; i++) {
            bt[j][i] /= (diag[j+displs[rank]]+diag[i]);
        }
    }

    // Calculate X = S^-1 * (S * Xtilde^T)
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(bt[i], &n, z[omp_get_thread_num()], &nn);

    }
    MPItranspose (bt, b, nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);

    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(b[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Calculate maximal value of solution
    double u_max = 0.0;

    #pragma omp parallel for schedule(static) reduction(max:u_max)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
            double temp = b[i][j] - func2(grid[displs[rank]+i], grid[j]);  // compare against the known solution; the difference should be ~0
            if (temp > u_max){
                u_max = temp;
            }
        }
    }
    MPI_Reduce (&u_max, &umaxglob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // find the largest u_max across all processors and store it in umaxglob

    MPI_Finalize();

    if (rank == 0) {
        printf("Nodes = %d \n", size);
        printf("Threads per node = %d \n", omp_get_max_threads());
        printf("u_max = %e\n", umaxglob);  //Printer max feil
        double times = MPI_Wtime()-start; //Stopper klokka
        printf("Time elapsed = %1.16f \n", times); //Pinter tid
    }
    return 0;
}
Author: lapstue, Project: tma4280, Lines: 101, Source: poisson.c

Example 11: grad

void grad(int *n, int nthreads) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  double *data;
  Complex *data_hat;
  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm);

  //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  data=(double*)accfft_alloc(alloc_max);
  data_hat=(Complex*)accfft_alloc(alloc_max);

  accfft_init(nthreads);

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
  accfft_plan * plan=accfft_plan_dft_3d_r2c(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE);
  setup_time+=MPI_Wtime();


  /*  Initialize data */
  initialize(data,n,c_comm);
  MPI_Barrier(c_comm);

  double * gradx=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double * grady=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double * gradz=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double timings[5]={0};

  std::bitset<3> XYZ=0;
  XYZ[0]=1;
  XYZ[1]=1;
  XYZ[2]=1;
  double exec_time=-MPI_Wtime();
  accfft_grad(gradx,grady,gradz,data,plan,XYZ,timings);
  exec_time+=MPI_Wtime();
  /* Check err*/
  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Gradx>>>>>>>>"<<std::endl;
  check_err_grad(gradx,n,c_comm,0);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Grady>>>>>>>>"<<std::endl;
  check_err_grad(grady,n,c_comm,1);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Gradz>>>>>>>>"<<std::endl;
  check_err_grad(gradz,n,c_comm,2);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  /* Compute some timings statistics */
  double g_setup_time,g_timings[5],g_exec_time;

  MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);

  PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
  PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
  PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;

  accfft_free(data);
  accfft_free(data_hat);
  MPI_Barrier(c_comm);
  accfft_free(gradx);
  accfft_free(grady);
  accfft_free(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup();
  MPI_Comm_free(&c_comm);
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------\n"<<std::endl;
  return ;

} // end grad
Author: spiralgen, Project: accfft-scratch, Lines: 92, Source: step5.cpp

Example 12: main

int
main(int argc, char *argv[])
{
    GRID *g;
    DOF *u_h;
    MAT *A, *A0, *B;
    MAP *map;
    INT i;
    size_t nnz, mem, mem_peak;
    VEC *x, *y0, *y1, *y2;
    double t0, t1, dnz, dnz1, mflops, mop;
    char *fn = "../test/cube.dat";
    FLOAT mem_max = 300;
    INT refine = 0;

    phgOptionsRegisterFilename("-mesh_file", "Mesh file", (char **)&fn);
    phgOptionsRegisterInt("-loop_count", "Loop count", &loop_count);
    phgOptionsRegisterInt("-refine", "Refinement level", &refine);
    phgOptionsRegisterFloat("-mem_max", "Maximum memory", &mem_max);

    phgInit(&argc, &argv);
    g = phgNewGrid(-1);
    if (!phgImport(g, fn, FALSE))
	phgError(1, "can't read file \"%s\".\n", fn);
    phgRefineAllElements(g, refine);
    u_h = phgDofNew(g, DOF_DEFAULT, 1, "u_h", DofNoAction);

    while (TRUE) {
	phgPrintf("\n");
	if (phgBalanceGrid(g, 1.2, 1, NULL, 0.))
	    phgPrintf("Repartition mesh, %d submeshes, load imbalance: %lg\n",
			g->nprocs, (double)g->lif);
	map = phgMapCreate(u_h, NULL);
	A = phgMapCreateMat(map, map);
	A->handle_bdry_eqns = TRUE;
	build_matrix(A, u_h);
	phgMatAssemble(A);

	/* Note: A is unsymmetric (A' != A) if boundary entries not removed */
	phgMatRemoveBoundaryEntries(A);

#if 0
	/* test block matrix operation */
	A0 = phgMatCreateBlockMatrix(g->comm, 1, 1, &A, NULL);
#else
	A0 = A;
#endif

	phgPrintf("%d DOF, %d elems, %d submeshes, matrix size: %d, LIF: %lg\n",
			DofGetDataCountGlobal(u_h), g->nleaf_global,
			g->nprocs, A->rmap->nglobal, (double)g->lif);

	/* test PHG mat-vec multiply */
	x = phgMapCreateVec(A->cmap, 1);
	y1 = phgMapCreateVec(A->rmap, 1);
	phgVecRandomize(x, 123);
	phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);

	phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	t0 = phgGetTime(NULL);
	for (i = 0; i < loop_count; i++) {
	    phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);
	}
	t1 = phgGetTime(NULL);
	mflops = phgPerfGetMflops(g, NULL, NULL);
	y0 = phgVecCopy(y1, NULL);
	nnz = A->nnz_d + A->nnz_o;
#if USE_MPI
	dnz1 = nnz;
	MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
	dnz = nnz;
#endif
	mop = loop_count * (dnz + dnz - A->rmap->nlocal) * 1e-6;

	phgPrintf("\n");
	t1 -= t0;
	phgPrintf("   PHG:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF)\n",
			t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops);

	/* test trans(A)*x */
	phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	t0 = phgGetTime(NULL);
	for (i = 0; i < loop_count; i++) {
	    phgMatVec(MAT_OP_T, 1.0, A0, x, 0.0, &y1);
	}
	t1 = phgGetTime(NULL);
	mflops = phgPerfGetMflops(g, NULL, NULL);
	t1 -= t0;
	phgPrintf("  A'*x:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
		  "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
		 (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

	/* time A * trans(A) */
	phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	t0 = phgGetTime(NULL);
	B = phgMatMat(MAT_OP_N, MAT_OP_N, 1.0, A, A, 0.0, NULL);
	t1 = phgGetTime(NULL);
	mflops = phgPerfGetMflops(g, NULL, NULL);
	nnz = B->nnz_d + B->nnz_o;
//......... part of the code omitted here .........
Author: 00liujj, Project: PHG-On-Windows, Lines: 101, Source: mat_test.c

Example 13: MPI_Reduce

void Master::collectStats() {
        int64_t dummy = 0;
        MPI_Reduce(&dummy, &engine.conflicts, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&dummy, &engine.propagations, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&dummy, &engine.opt_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
}
Author: geoffchu, Project: chuffed, Lines: 6, Source: master.c
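Example 13 is the root-side half of a common pattern: the master contributes a zero-valued `dummy` send buffer, so the reduced result on rank 0 is effectively the combination of the workers' values only. For completeness, here is a minimal sketch of what a matching worker-side call could look like; the function name and the assumption that each worker holds these three local counters are illustrative and not taken from the chuffed sources. The receive buffer only matters on the root rank, so non-root ranks may pass NULL, and the order and operation of the three reductions must mirror the master's calls exactly.

#include <mpi.h>
#include <cstdint>

// Hypothetical worker-side counterpart to Master::collectStats():
// each worker contributes its local counters to the reduction rooted at rank 0.
void workerSendStats(int64_t conflicts, int64_t propagations, double opt_time) {
    MPI_Reduce(&conflicts,    NULL, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&propagations, NULL, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&opt_time,     NULL, 1, MPI_DOUBLE,        MPI_MAX, 0, MPI_COMM_WORLD);
}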

Example 14: main


//......... part of the code omitted here .........
      }
    }

    for (phase=1; phase<Num_groups; phase++){
      recv_from = ((group_ID + phase             )%Num_groups);
      send_to   = ((group_ID - phase + Num_groups)%Num_groups);

#ifndef SYNCHRONOUS
      if (shm_ID==0) {
         MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, 
                   recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req);  
      }
#endif

      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) 
          for (j=0; j<Block_order; j++){
	    Work_out(j,i) = A(i,j);
	  }
      }
      else {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order)
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt); 
	      }
      }

      /* NEED A LOAD/STORE FENCE HERE                                            */
      MPI_Barrier(shm_comm);
      if (shm_ID==0) {
#ifndef SYNCHRONOUS  
        MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
                  phase, MPI_COMM_WORLD, &send_req);
        MPI_Wait(&recv_req, &status);
        MPI_Wait(&send_req, &status);
#else
        MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, 
                     phase, Work_in_p, Block_size, MPI_DOUBLE, 
  	             recv_from*group_size, phase, MPI_COMM_WORLD, &status);
#endif
      }
      /* NEED A LOAD FENCE HERE                                                  */ 
      MPI_Barrier(shm_comm);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) = Work_in(i,j);

    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  /*  for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */
  for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
    for (i=0;i<order; i++) { 
      abserr += ABS(B(i,j) - (double)(order*i + j+colstart));
    }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr_tot);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  MPI_Win_free(&shm_win_A);
  MPI_Win_free(&shm_win_B);
  if (Num_groups>1) {
      MPI_Win_free(&shm_win_Work_in);
      MPI_Win_free(&shm_win_Work_out);
  }

  MPI_Info_free(&rma_winfo);

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Author: beginZero, Project: Kernels, Lines: 101, Source: transpose.c

Example 15: main


//......... part of the code omitted here .........
			
/*
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif */
			/* source illumination
			for(ix=0; ix<rnx; ix++){
				for(iz=0; iz<rnz; iz++){
					mig1[itau][ix][iz] = mig1[itau][ix][iz]/(sill[ix][iz]+SF_EPS);
				}
			} */
        } //end of itau
        
        /* output wavefield snapshot */
        if(wantwf){
            for(it=0; it<wfnt; it++){
                if(it%snap==0){
                    sf_floatwrite(fwf[it][0], rnzx, Ffwf);
                    sf_floatwrite(bwf[wfnt-1-it][0], rnzx, Fbwf);
                }
            }
            sf_fileclose(Ffwf);
            sf_fileclose(Fbwf);
        }
        
        /* add all the shot images that are on the same node */
#ifdef _OPENMP
#pragma omp parallel for private(itau, ix, iz)
#endif
        for(itau=0; itau<ntau; itau++){
            for(ix=0; ix<rnx; ix++){
                for(iz=0; iz<rnz; iz++){
                    img1[itau][ix+is*nds][iz] += mig1[itau][ix][iz];
                }
            }
        }
        
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif
        for(ix=0; ix<rnx; ix++){
            for(iz=0; iz<rnz; iz++){
                img2[ix+is*nds][iz] += mig2[ix][iz];
            }
        }
	}
    ////////////////end of ishot
	MPI_Barrier(comm);
    
    cfft2_finalize();
    sf_fileclose(Fdat);
    
    free(ww); free(rr);
	free(*dd); free(dd);
	free(cwave); free(cwavem); free(curr);
    free(*ccr); free(ccr);
    free(*sill); free(sill);
    free(**fwf); free(*fwf); free(fwf);
    free(**bwf); free(*bwf); free(bwf);
    free(**mig1); free(*mig1); free(mig1);
    free(*mig2); free(mig2);
    
    /* sum image */
    if(cpuid==0){
        sendbuf=(float *)MPI_IN_PLACE;
        recvbuf=img1[0][0];
    }else{
        sendbuf=img1[0][0];
        recvbuf=NULL;
    }
    MPI_Reduce(sendbuf, recvbuf, ntau*vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm);
    
    if(cpuid==0){
        sendbuf=MPI_IN_PLACE;
        recvbuf=img2[0];
    }else{
        sendbuf=img2[0];
        recvbuf=NULL;
    }
    MPI_Reduce(sendbuf, recvbuf, vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm);
    
    /* output image */
    if(cpuid==0){
        sf_floatwrite(img1[0][0], ntau*vnx*rnz, Fimg1);
        sf_floatwrite(img2[0], vnx*rnz, Fimg2);
    }
	MPI_Barrier(comm);

	sf_fileclose(Fimg1);
    sf_fileclose(Fimg2);
    free(**img1); free(*img1); free(img1);
    free(*img2); free(img2);
    
	gettimeofday(&tim, NULL);
	tend=tim.tv_sec+(tim.tv_usec/1000000.0);
	sf_warning(">> The computing time is %.3lf minutes <<", (tend-tstart)/60.);

	MPI_Finalize();
	exit(0);
}
Author: housian0724, Project: src, Lines: 101, Source: Mmpilrrtm_ts.c


Note: The MPI_Reduce function examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright in the code belongs to the original authors, and distribution and use should follow the license of the corresponding project. Do not reproduce without permission.