本文整理汇总了C++中MPI_Reduce函数的典型用法代码示例。如果您正苦于以下问题:C++ MPI_Reduce函数的具体用法?C++ MPI_Reduce怎么用?C++ MPI_Reduce使用的例子?那么, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了MPI_Reduce函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: main
/*
* This test attempts collective communication after a process in
* the communicator has failed.
*/
int main(int argc, char **argv)
{
int rank, size, i, rc, errclass, toterrs, errs = 0;
char rbuf[100000];
char *sendbuf;
int deadprocs[1] = {1};
MPI_Group world, newgroup;
MPI_Comm newcomm;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
if (size < 3) {
fprintf( stderr, "Must run with at least 3 processes\n" );
MPI_Abort( MPI_COMM_WORLD, 1 );
}
MPI_Comm_group(MPI_COMM_WORLD, &world);
MPI_Group_excl(world, 1, deadprocs, &newgroup);
MPI_Comm_create_group(MPI_COMM_WORLD, newgroup, 0, &newcomm);
if (rank == 1) {
exit(EXIT_FAILURE);
}
/* try a small send first */
sendbuf = (char *)malloc(10*size*sizeof(char));
if (rank == 0) {
for (i=0;i<size;i++) {
strcpy(sendbuf + i*10, "No Errors");
}
}
rc = MPI_Scatter(sendbuf, 10, MPI_CHAR, rbuf, 10, MPI_CHAR, 0, MPI_COMM_WORLD);
#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
MPI_Error_class(rc, &errclass);
if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
errs++;
}
#endif
/* reset the buffers and try a larger scatter */
free(sendbuf);
memset(rbuf, 0, sizeof(rbuf));
sendbuf = (char *)malloc(100000*size*sizeof(char));
if (rank == 0) {
for (i=0;i<size;i++) {
strcpy(sendbuf + i*100000, "No Errors");
}
}
rc = MPI_Scatter(sendbuf, 100000, MPI_CHAR, rbuf, 100000, MPI_CHAR, 0, MPI_COMM_WORLD);
#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
MPI_Error_class(rc, &errclass);
if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
errs++;
}
#endif
rc = MPI_Reduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, 0, newcomm);
if(rc)
fprintf(stderr, "Failed to get errors from other processes\n");
if (rank == 0) {
if (toterrs) {
printf( " Found %d errors\n", toterrs );
}
else {
printf( " No Errors\n" );
}
fflush(stdout);
}
free(sendbuf);
MPI_Comm_free(&newcomm);
MPI_Group_free(&newgroup);
MPI_Group_free(&world);
MPI_Finalize();
return 0;
}
示例2: main
int main(int argc, char **argv)
{
int rank;
int world_size;
/*
* Initialization
*/
int thread_support;
if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
fprintf(stderr,"MPI_Init_thread failed\n");
exit(1);
}
if (thread_support == MPI_THREAD_FUNNELED)
fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
if (thread_support < MPI_THREAD_FUNNELED)
fprintf(stderr,"Warning: MPI does not have thread support!\n");
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
starpu_srand48((long int)time(NULL));
parse_args(rank, argc, argv);
int ret = starpu_init(NULL);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
/* We disable sequential consistency in this example */
starpu_data_set_default_sequential_consistency_flag(0);
starpu_mpi_init(NULL, NULL, 0);
STARPU_ASSERT(p*q == world_size);
starpu_cublas_init();
int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
/*
* Problem Init
*/
init_matrix(rank);
fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
(int)(allocated_memory/(1024*1024)),
(int)(allocated_memory_extra/(1024*1024)),
(int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
display_grid(rank, nblocks);
TYPE *a_r = NULL;
// STARPU_PLU(display_data_content)(a_r, size);
TYPE *x, *y;
if (check)
{
x = calloc(size, sizeof(TYPE));
STARPU_ASSERT(x);
y = calloc(size, sizeof(TYPE));
STARPU_ASSERT(y);
if (rank == 0)
{
unsigned ind;
for (ind = 0; ind < size; ind++)
x[ind] = (TYPE)starpu_drand48();
}
a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
if (rank == 0)
STARPU_PLU(display_data_content)(a_r, size);
// STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
}
barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
/*
* Report performance
*/
int reduce_ret;
double min_timing = timing;
double max_timing = timing;
double sum_timing = timing;
reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
//.........这里部分代码省略.........
示例3: main
int main(int argc, char** argv)
{
int rank;
int size;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if(argc != 5) {
fprintf(stderr, "argc %d\n", argc);
fprintf(stderr,
"Usage: %s <left> <right> <numSyncFiles> <tolerance>\n"
"where <left> and <right> are different"
"N-procs_case directories\n", argv[0]);
MPI_Finalize();
return 1;
}
char* lpath = argv[1];
char* rpath = argv[2];
int nSyncFiles = atoi(argv[3]);
double tolerance = atof(argv[4]);
int ndof;
int nshg;
int solsize;
double* solution;
std::set<int>* l_timesteps = find_timesteps(lpath, nSyncFiles);
std::set<int>* r_timesteps = find_timesteps(rpath, nSyncFiles);
std::set<int>* timesteps_to_check = new std::set<int>;
std::set_intersection(l_timesteps->begin(), l_timesteps->end(),
r_timesteps->begin(), r_timesteps->end(),
std::inserter(*timesteps_to_check, timesteps_to_check->begin()));
delete l_timesteps;
delete r_timesteps;
if(rank == 0)
printf("Found %d common timesteps\n",
timesteps_to_check->size());
#ifdef DBGONLY
read_solution(&solution, &solsize, &nshg, &ndof,
size, rank, 0, numSyncFiles, "./");
printf("nshg: %d, ndof: %d\n", nshg, ndof);
assert(solsize == ndof*nshg);
#endif
double maxerror = 0.0;
double error;
double gblmaxerror;
for(std::set<int>::iterator i = timesteps_to_check->begin();
i!=timesteps_to_check->end();i++)
{
error = compare_solution(lpath, rpath, *i, size, nSyncFiles);
if(error>maxerror) maxerror = error;
}
delete timesteps_to_check;
MPI_Barrier(MPI_COMM_WORLD);
MPI_Reduce(&maxerror, &gblmaxerror, 1, MPI_DOUBLE, MPI_MAX, 0,
MPI_COMM_WORLD);
if(rank == 0) printf("Maximum difference across all timesteps: %e\n",
gblmaxerror);
MPI_Finalize();
return (gblmaxerror > tolerance);
}
示例4: test_P3DFFT
void test_P3DFFT(int *n, std::ofstream& results, int decomp, int * dims){
int nx,ny,nz,procid,nprocs,ndim;
int istart[3],isize[3],iend[3];
int fstart[3],fsize[3],fend[3];
int p3dfft_mem_conf,nrep;
long int Nlocal,Nglob;
double factor;
double l_timers[12]={0},g_timers[12]={0};
double total_time=0*MPI_Wtime(), setup_time=0;
// rtime_local is timings on each process and _global is the max reduced to root
// 0 is the forward FFT time, 1 is the Hadamard multiplication, 2 is the IFFT time, 3 is the sum of 0-2, and 4 is the setup time
// The communication time is measured by l_timers locally on each process and then reduced to g_timers to the root.
// the sum of first four elements give the comm time
unsigned char op_f[4]="fft", op_b[4]="tff";
int memsize[3];
MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&procid);
nx=n[0]; ny=n[1]; nz=n[2]; ndim=1; nrep=NREP;
if(decomp==1){
dims[0] = 1; dims[1] = nprocs;
}
if(procid == 0)
printf("Using processor grid %d x %d\n",dims[0],dims[1]);
/* Initialize P3DFFT */
MPI_Barrier(MPI_COMM_WORLD);
setup_time -= MPI_Wtime(); //Compute Setup Time.
Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize);
setup_time += MPI_Wtime(); //Compute Setup Time.
PCOUT<<"done with setup"<<std::endl;
Cp3dfft_get_dims(istart,iend,isize,1);
Cp3dfft_get_dims(fstart,fend,fsize,2);
/* Allocate and initialize */
double *A; // Input matrix A
A=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2));
//B=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2));
/* Warmup */
Cp3dfft_ftran_r2c(A,A,op_f);
Cp3dfft_ftran_r2c(A,A,op_f);
MPI_Barrier(MPI_COMM_WORLD);
Cset_timers();
for (int rep=0; rep<nrep; rep++){
initialize_p3dfft(A,n);
MPI_Barrier(MPI_COMM_WORLD);
/* Forward transform */
total_time -= MPI_Wtime();
Cp3dfft_ftran_r2c(A,A,op_f);
total_time += MPI_Wtime();
MPI_Barrier(MPI_COMM_WORLD);
}
Cget_timers(l_timers);
Cp3dfft_btran_c2r(A,A,op_b);
/* Compute Error */
//PCOUT<<"Done With FFTs computing error"<<std::endl;
compute_error_p3dfft(A,n);
/* Gather timing statistics */
double g_total_time, g_comm_time, g_setup_time;
MPI_Reduce(&total_time,&g_total_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
MPI_Reduce(&setup_time,&g_setup_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
MPI_Reduce(&l_timers,&g_timers,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
g_total_time=g_total_time/nrep;
g_comm_time=(g_timers[0]+g_timers[1]+g_timers[2]+g_timers[3])/((double) nrep);
//g_total_time=g_total_time/((double)nrep);
ptrdiff_t size=n[0];size*=n[1]; size*=n[2];
double gflops=2.5*size*( log2(n[2]) + log2(n[0])+ log2(n[1]) )/(g_total_time)/1e9;
if(procid == 0){
std::cout.precision(4);
std::cout<<"P3DFFT Size="<<n[0]<<" "<<n[1]<<" "<<n[2]<<std::endl;;
std::cout<<"0= "<<g_timers[0]<<" 1= "<<g_timers[1]<<" 2= "<<g_timers[2]<<" 3= "<<g_timers[3]<<" 4= "<<g_timers[4]<<std::endl;
std::cout<<"5= "<<g_timers[5]<<" 6= "<<g_timers[6]<<" 7= "<<g_timers[7]<<" 8= "<<g_timers[8]<<" 9= "<<g_timers[9]<<std::endl;
std::cout<<"10= "<<g_timers[10]<<" 11= "<<g_timers[11]<<std::endl;
std::cout<<"\033[1;31m";
std::cout<<"\t"<<"np"<<"\t"<<"Grid"<<"\t"<<"Total"<<'\t'<<"Comm Time"<<"\t"<<"Setup Time"<<"\t"<<"\t"<<"Reps"<<'\t'<<"GFlops"<<std::endl;
std::cout<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
std::cout<<"\033[0m\n"<<std::endl;
results<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
}
/* Free work space */
fftw_free(A);
//.........这里部分代码省略.........
示例5: olsr_main
int olsr_main(int argc, char *argv[])
{
int i;
char log[32];
tw_opt_add(olsr_opts);
tw_init(&argc, &argv);
#if DEBUG
sprintf( log, "olsr-log.%ld", g_tw_mynode );
olsr_event_log = fopen( log, "w+");
if( olsr_event_log == nullptr )
tw_error( TW_LOC, "Failed to Open OLSR Event Log file \n");
#endif
g_tw_mapping = CUSTOM;
g_tw_custom_initial_mapping = &olsr_custom_mapping;
g_tw_custom_lp_global_to_local_map = &olsr_mapping_to_lp;
// nlp_per_pe = OLSR_MAX_NEIGHBORS;// / tw_nnodes();
//g_tw_lookahead = SA_INTERVAL;
SA_range_start = nlp_per_pe;
// Increase nlp_per_pe by nlp_per_pe / OMN
nlp_per_pe += nlp_per_pe / OLSR_MAX_NEIGHBORS;
g_tw_events_per_pe = OLSR_MAX_NEIGHBORS / 2 * nlp_per_pe + 65536;
tw_define_lps(nlp_per_pe, sizeof(olsr_msg_data));
for(i = 0; i < OLSR_END_EVENT; i++)
g_olsr_event_stats[i] = 0;
for (i = 0; i < SA_range_start; i++) {
tw_lp_settype(i, &olsr_lps[0]);
}
for (i = SA_range_start; i < nlp_per_pe; i++) {
tw_lp_settype(i, &olsr_lps[1]);
}
#if DEBUG
printf("g_tw_nlp is %llu\n", g_tw_nlp);
#endif
tw_run();
if( g_tw_synchronization_protocol != 1 )
{
MPI_Reduce( g_olsr_event_stats, g_olsr_root_event_stats, OLSR_END_EVENT, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
}
else {
for (i = 0; i < OLSR_END_EVENT; i++) {
g_olsr_root_event_stats[i] = g_olsr_event_stats[i];
}
}
if (tw_ismaster()) {
for( i = 0; i < OLSR_END_EVENT; i++ )
printf("OLSR Type %s Event Count = %llu \n", event_names[i], g_olsr_root_event_stats[i]);
printf("Complete.\n");
}
tw_end();
return 0;
}
示例6: main
//.........这里部分代码省略.........
//printf("count = %lld\n", count);
if (count != nitem) {
fprintf( stderr, "%d: Wrong count (%lld) on write\n", rank, count );
fflush(stderr);
/* exit */
MPI_Finalize();
exit(1);
}
MPI_File_close(&fh);
MPI_Barrier(MPI_COMM_WORLD);
if(rank == 0) printf("File is written\n\n");
}
double *tmp = (double *)malloc( nitem * sizeof(double) );
memset (tmp, 0, nitem*sizeof(double));
if(use_normalsto == 1) {
MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh );
/* Read nothing (check status) */
memset( &status, 0xff, sizeof(MPI_Status) );
offset = rank * nitem * type_size;
/* start I/O */
stime = MPI_Wtime();
MPI_File_read_at(fh, offset, tmp, nitem, MPI_DOUBLE, &status);
etime = MPI_Wtime();
/* end I/O */
iotime = etime - stime;
if(_debug==1) printf("%d: iotime = %10.4f\n", rank, iotime);
MPI_Reduce(&iotime, &max_iotime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
sum = 0.0; /* reset sum */
/* start computation */
stime = MPI_Wtime();
for(i=0; i<nitem; i++) {
sum += tmp[i];
}
MPI_Reduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
etime = MPI_Wtime();
/* end computation */
comptime = etime - stime;
if(_debug==1) printf("%d: comptime = %10.4f\n", rank, comptime);
MPI_Reduce(&comptime, &max_comptime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(rank == 0) {
elapsed_time = max_comptime + max_iotime;
printf("<<Result (SUM) with normal read>>\n"
"SUM = %10.4f \n"
"Computation time = %10.4f sec\n"
"I/O time = %10.4f sec\n"
"total time = %10.4f sec\n\n",
global_sum, max_comptime, max_iotime, elapsed_time);
}
MPI_File_close(&fh);
}
示例7: trainOneEpochDenseCPU
void trainOneEpochDenseCPU(int itask, float *data, float *numerator,
float *denominator, float *codebook,
unsigned int nSomX, unsigned int nSomY,
unsigned int nDimensions, unsigned int nVectors,
unsigned int nVectorsPerRank, float radius,
float scale, string mapType, string gridType,
bool compact_support, bool gaussian, int *globalBmus) {
unsigned int p1[2] = {0, 0};
unsigned int *bmus = new unsigned int[nVectorsPerRank * 2];
#ifdef _OPENMP
#pragma omp parallel default(shared) private(p1)
#endif
{
#ifdef _OPENMP
#pragma omp for
#endif
#ifdef _WIN32
for (int n = 0; n < nVectorsPerRank; n++) {
#else
for (unsigned int n = 0; n < nVectorsPerRank; n++) {
#endif
if (itask * nVectorsPerRank + n < nVectors) {
/// get the best matching unit
get_bmu_coord(codebook, data, nSomY, nSomX,
nDimensions, p1, n);
bmus[2 * n] = p1[0];
bmus[2 * n + 1] = p1[1];
}
}
}
float *localNumerator = new float[nSomY * nSomX * nDimensions];
float *localDenominator = new float[nSomY * nSomX];
#ifdef _OPENMP
#pragma omp parallel default(shared)
#endif
{
#ifdef _OPENMP
#pragma omp for
#endif
#ifdef _WIN32
for (int som_y = 0; som_y < nSomY; som_y++) {
#else
for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
#endif
for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
localDenominator[som_y * nSomX + som_x] = 0.0;
for (unsigned int d = 0; d < nDimensions; d++)
localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] = 0.0;
}
}
/// Accumulate denoms and numers
#ifdef _OPENMP
#pragma omp for
#endif
#ifdef _WIN32
for (int som_y = 0; som_y < nSomY; som_y++) {
#else
for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
#endif
for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
for (unsigned int n = 0; n < nVectorsPerRank; n++) {
if (itask * nVectorsPerRank + n < nVectors) {
float dist = 0.0f;
if (gridType == "rectangular") {
if (mapType == "planar") {
dist = euclideanDistanceOnPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]);
}
else if (mapType == "toroid") {
dist = euclideanDistanceOnToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY);
}
}
else {
if (mapType == "planar") {
dist = euclideanDistanceOnHexagonalPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]);
}
else if (mapType == "toroid") {
dist = euclideanDistanceOnHexagonalToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY);
}
}
float neighbor_fuct = getWeight(dist, radius, scale, compact_support, gaussian);
for (unsigned int d = 0; d < nDimensions; d++) {
localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] +=
1.0f * neighbor_fuct
* (*(data + n * nDimensions + d));
}
localDenominator[som_y * nSomX + som_x] += neighbor_fuct;
}
}
}
}
}
#ifdef HAVE_MPI
MPI_Reduce(localNumerator, numerator,
nSomY * nSomX * nDimensions, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(localDenominator, denominator,
nSomY * nSomX, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Gather(bmus, nVectorsPerRank * 2, MPI_INT, globalBmus, nVectorsPerRank * 2, MPI_INT, 0, MPI_COMM_WORLD);
//.........这里部分代码省略.........
示例8: main
//.........这里部分代码省略.........
MPI_Send(&data, 1, MPI_INT, 0, 0, comm1);
IF_VERBOSE(("0: sending 2 to process 2.\n"));
data = 2;
MPI_Send(&data, 1, MPI_INT, 0, 0, comm2);
IF_VERBOSE(("0: sending 3 to process 3.\n"));
data = 3;
MPI_Send(&data, 1, MPI_INT, 0, 0, comm3);
IF_VERBOSE(("0: disconnecting.\n"));
MPI_Comm_disconnect(&comm1);
MPI_Comm_disconnect(&comm2);
MPI_Comm_disconnect(&comm3);
}
else if (rank == 1)
{
IF_VERBOSE(("1: receiving port.\n"));
MPI_Recv(port1, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);
IF_VERBOSE(("1: received port1: <%s>\n", port1));
IF_VERBOSE(("1: connecting.\n"));
MPI_Comm_connect(port1, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm1);
MPI_Recv(&data, 1, MPI_INT, 0, 0, comm1, &status);
if (data != 1)
{
printf("Received %d from root when expecting 1\n", data);
fflush(stdout);
num_errors++;
}
IF_VERBOSE(("1: disconnecting.\n"));
MPI_Comm_disconnect(&comm1);
}
else if (rank == 2)
{
IF_VERBOSE(("2: receiving port.\n"));
MPI_Recv(port2, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);
IF_VERBOSE(("2: received port2: <%s>\n", port2));
/* make sure process 1 has time to do the connect before this process
attempts to connect */
MTestSleep(2);
IF_VERBOSE(("2: connecting.\n"));
MPI_Comm_connect(port2, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm2);
MPI_Recv(&data, 1, MPI_INT, 0, 0, comm2, &status);
if (data != 2)
{
printf("Received %d from root when expecting 2\n", data);
fflush(stdout);
num_errors++;
}
IF_VERBOSE(("2: disconnecting.\n"));
MPI_Comm_disconnect(&comm2);
}
else if (rank == 3)
{
IF_VERBOSE(("3: receiving port.\n"));
MPI_Recv(port3, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);
IF_VERBOSE(("2: received port2: <%s>\n", port2));
/* make sure process 1 and 2 have time to do the connect before this
process attempts to connect */
MTestSleep(4);
IF_VERBOSE(("3: connecting.\n"));
MPI_Comm_connect(port3, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm3);
MPI_Recv(&data, 1, MPI_INT, 0, 0, comm3, &status);
if (data != 3)
{
printf("Received %d from root when expecting 3\n", data);
fflush(stdout);
num_errors++;
}
IF_VERBOSE(("3: disconnecting.\n"));
MPI_Comm_disconnect(&comm3);
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Reduce(&num_errors, &total_num_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0)
{
if (total_num_errors)
{
printf(" Found %d errors\n", total_num_errors);
}
else
{
printf(" No Errors\n");
}
fflush(stdout);
}
MPI_Finalize();
return total_num_errors;
}
示例9: main
//.........这里部分代码省略.........
recvcounts=rdispls=NULL;
recvcounts = (int *) malloc (numprocs*sizeof(int));
if(NULL == recvcounts) {
fprintf(stderr, "malloc failed.\n");
exit(1);
}
rdispls = (int *) malloc (numprocs*sizeof(int));
if(NULL == rdispls) {
fprintf(stderr, "malloc failed.\n");
exit(1);
}
s_buf1 = r_buf1 = NULL;
s_buf1 = (char *) malloc(sizeof(char)*max_msg_size + MAX_ALIGNMENT);
if(NULL == s_buf1) {
fprintf(stderr, "malloc failed.\n");
exit(1);
}
r_buf1 = (char *) malloc(sizeof(char)*max_msg_size * numprocs + MAX_ALIGNMENT);
if(NULL == r_buf1) {
fprintf(stderr, "malloc failed.\n");
exit(1);
}
align_size = getpagesize();
sendbuf = (char *)(((unsigned long) s_buf1 + (align_size - 1)) / align_size
* align_size);
recvbuf = (char *)(((unsigned long) r_buf1 + (align_size - 1)) / align_size
* align_size);
memset(sendbuf, 1, max_msg_size);
memset(recvbuf, 0, max_msg_size * numprocs);
for(size=1; size <= max_msg_size; size *= 2) {
if(size > LARGE_MESSAGE_SIZE) {
skip = SKIP_LARGE;
iterations = iterations_large;
} else {
skip = SKIP;
}
MPI_Barrier(MPI_COMM_WORLD);
disp =0;
for ( i = 0; i < numprocs; i++) {
recvcounts[i] = size;
rdispls[i] = disp;
disp += size;
}
MPI_Barrier(MPI_COMM_WORLD);
timer=0.0;
for(i=0; i < iterations + skip ; i++) {
t_start = MPI_Wtime();
MPI_Allgatherv(sendbuf, size, MPI_CHAR, recvbuf, recvcounts, rdispls, MPI_CHAR, MPI_COMM_WORLD);
t_stop = MPI_Wtime();
if(i >= skip) {
timer+= t_stop-t_start;
}
MPI_Barrier(MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
latency = (double)(timer * 1e6) / iterations;
MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0,
MPI_COMM_WORLD);
MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0,
MPI_COMM_WORLD);
MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0,
MPI_COMM_WORLD);
avg_time = avg_time/numprocs;
print_data(rank, full, size, avg_time, min_time, max_time, iterations);
MPI_Barrier(MPI_COMM_WORLD);
}
free(s_buf1);
free(r_buf1);
free(recvcounts);
free(rdispls);
MPI_Finalize();
return EXIT_SUCCESS;
}
示例10: main
//.........这里部分代码省略.........
double *sendbuf = mk_1D_array(nrColon*m, false);
double *recbuf = mk_1D_array(nrColon*m, false);
int *sendcnt = (int *) malloc((size+1) * sizeof(int)); //ant elementer jeg skal sende hver prosessor
int *sdispls = (int *) malloc((size+1) * sizeof(int)); //index i sendbuf for hver prosessor
sdispls[0]=0; //prosessor 0 skal alltid ha fra index 0
for(int i = 0;i<size;i++){
sendcnt[i] = cnt[i]*cnt[rank]; // antt elementer jeg eier * ant element den eier
sdispls[i] = displs[i]*cnt[rank]; //displacement for hver prosessor
}
// GRID
#pragma omp parallel for schedule(static)
for (int i = 0; i < n+1; i++) {
grid[i] = i * h;
}
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < m; i++) {
diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n)); //Eigenvalue
}
// Initialize the right hand side data
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < nrColon; i++) {
for (size_t j = 0; j < m; j++) {
// b[i][j] = h * h;
b[i][j] = h * h * func1(grid[i+displs[rank]], grid[j]); //evaluerer funksjoen * h*h
}
}
// Calculate Btilde^T = S^-1 * (S * B)^T
#pragma omp parallel for schedule(guided, 5)
for (size_t i = 0; i < nrColon; i++) {
fst_(b[i], &n, z[omp_get_thread_num()], &nn);
}
MPItranspose (b, bt,nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < nrColon; i++) {
fstinv_(bt[i], &n, z[omp_get_thread_num()], &nn);
}
// Solve Lambda * Xtilde = Btilde
#pragma omp parallel for schedule(static)
for (int j=0; j < nrColon; j++) {
for (int i=0; i < m; i++) {
bt[j][i] /= (diag[j+displs[rank]]+diag[i]);
}
}
// Calculate X = S^-1 * (S * Xtilde^T)
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < nrColon; i++) {
fst_(bt[i], &n, z[omp_get_thread_num()], &nn);
}
MPItranspose (bt, b, nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < nrColon; i++) {
fstinv_(b[i], &n, z[omp_get_thread_num()], &nn);
}
// Calculate maximal value of solution
double u_max = 0.0, temp;
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < nrColon; i++) {
for (size_t j = 0; j < m; j++) {
temp = b[i][j] - func2(grid[displs[rank]+i], grid[j]); //tester resultat - kjent funksjon, skal bli = 0
if (temp > u_max){
u_max = temp;
}
}
}
MPI_Reduce (&u_max, &umaxglob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); //Finner den største u_max fra de forskjellige prosessorene og setter den til umaxglob
MPI_Finalize();
if (rank == 0) {
printf("Nodes = %d \n", size);
printf("Threads per node = %d \n", omp_get_max_threads());
printf("u_max = %e\n", umaxglob); //Printer max feil
double times = MPI_Wtime()-start; //Stopper klokka
printf("Time elapsed = %1.16f \n", times); //Pinter tid
}
return 0;
}
示例11: grad
void grad(int *n, int nthreads) {
int nprocs, procid;
MPI_Comm_rank(MPI_COMM_WORLD, &procid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* Create Cartesian Communicator */
int c_dims[2]={0};
MPI_Comm c_comm;
accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);
double *data;
Complex *data_hat;
double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
int alloc_max=0;
int isize[3],osize[3],istart[3],ostart[3];
/* Get the local pencil size and the allocation size */
alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm);
//data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
data=(double*)accfft_alloc(alloc_max);
data_hat=(Complex*)accfft_alloc(alloc_max);
accfft_init(nthreads);
/* Create FFT plan */
setup_time=-MPI_Wtime();
accfft_plan * plan=accfft_plan_dft_3d_r2c(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE);
setup_time+=MPI_Wtime();
/* Initialize data */
initialize(data,n,c_comm);
MPI_Barrier(c_comm);
double * gradx=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
double * grady=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
double * gradz=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
double timings[5]={0};
std::bitset<3> XYZ=0;
XYZ[0]=1;
XYZ[1]=1;
XYZ[2]=1;
double exec_time=-MPI_Wtime();
accfft_grad(gradx,grady,gradz,data,plan,XYZ,timings);
exec_time+=MPI_Wtime();
/* Check err*/
PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
PCOUT<<">>>>>>>>Checking Gradx>>>>>>>>"<<std::endl;
check_err_grad(gradx,n,c_comm,0);
PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;
PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
PCOUT<<">>>>>>>>Checking Grady>>>>>>>>"<<std::endl;
check_err_grad(grady,n,c_comm,1);
PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;
PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
PCOUT<<">>>>>>>>Checking Gradz>>>>>>>>"<<std::endl;
check_err_grad(gradz,n,c_comm,2);
PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;
/* Compute some timings statistics */
double g_setup_time,g_timings[5],g_exec_time;
MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;
accfft_free(data);
accfft_free(data_hat);
MPI_Barrier(c_comm);
accfft_free(gradx);
accfft_free(grady);
accfft_free(gradz);
accfft_destroy_plan(plan);
accfft_cleanup();
MPI_Comm_free(&c_comm);
PCOUT<<"-------------------------------------------------------"<<std::endl;
PCOUT<<"-------------------------------------------------------"<<std::endl;
PCOUT<<"-------------------------------------------------------\n"<<std::endl;
return ;
} // end grad
示例12: main
int
main(int argc, char *argv[])
{
GRID *g;
DOF *u_h;
MAT *A, *A0, *B;
MAP *map;
INT i;
size_t nnz, mem, mem_peak;
VEC *x, *y0, *y1, *y2;
double t0, t1, dnz, dnz1, mflops, mop;
char *fn = "../test/cube.dat";
FLOAT mem_max = 300;
INT refine = 0;
phgOptionsRegisterFilename("-mesh_file", "Mesh file", (char **)&fn);
phgOptionsRegisterInt("-loop_count", "Loop count", &loop_count);
phgOptionsRegisterInt("-refine", "Refinement level", &refine);
phgOptionsRegisterFloat("-mem_max", "Maximum memory", &mem_max);
phgInit(&argc, &argv);
g = phgNewGrid(-1);
if (!phgImport(g, fn, FALSE))
phgError(1, "can't read file \"%s\".\n", fn);
phgRefineAllElements(g, refine);
u_h = phgDofNew(g, DOF_DEFAULT, 1, "u_h", DofNoAction);
while (TRUE) {
phgPrintf("\n");
if (phgBalanceGrid(g, 1.2, 1, NULL, 0.))
phgPrintf("Repartition mesh, %d submeshes, load imbalance: %lg\n",
g->nprocs, (double)g->lif);
map = phgMapCreate(u_h, NULL);
A = phgMapCreateMat(map, map);
A->handle_bdry_eqns = TRUE;
build_matrix(A, u_h);
phgMatAssemble(A);
/* Note: A is unsymmetric (A' != A) if boundary entries not removed */
phgMatRemoveBoundaryEntries(A);
#if 0
/* test block matrix operation */
A0 = phgMatCreateBlockMatrix(g->comm, 1, 1, &A, NULL);
#else
A0 = A;
#endif
phgPrintf("%d DOF, %d elems, %d submeshes, matrix size: %d, LIF: %lg\n",
DofGetDataCountGlobal(u_h), g->nleaf_global,
g->nprocs, A->rmap->nglobal, (double)g->lif);
/* test PHG mat-vec multiply */
x = phgMapCreateVec(A->cmap, 1);
y1 = phgMapCreateVec(A->rmap, 1);
phgVecRandomize(x, 123);
phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);
phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */
t0 = phgGetTime(NULL);
for (i = 0; i < loop_count; i++) {
phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);
}
t1 = phgGetTime(NULL);
mflops = phgPerfGetMflops(g, NULL, NULL);
y0 = phgVecCopy(y1, NULL);
nnz = A->nnz_d + A->nnz_o;
#if USE_MPI
dnz1 = nnz;
MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
dnz = nnz;
#endif
mop = loop_count * (dnz + dnz - A->rmap->nlocal) * 1e-6;
phgPrintf("\n");
t1 -= t0;
phgPrintf(" PHG: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF)\n",
t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops);
/* test trans(A)*x */
phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */
t0 = phgGetTime(NULL);
for (i = 0; i < loop_count; i++) {
phgMatVec(MAT_OP_T, 1.0, A0, x, 0.0, &y1);
}
t1 = phgGetTime(NULL);
mflops = phgPerfGetMflops(g, NULL, NULL);
t1 -= t0;
phgPrintf(" A'*x: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
"err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
(double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));
/* time A * trans(A) */
phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */
t0 = phgGetTime(NULL);
B = phgMatMat(MAT_OP_N, MAT_OP_N, 1.0, A, A, 0.0, NULL);
t1 = phgGetTime(NULL);
mflops = phgPerfGetMflops(g, NULL, NULL);
nnz = B->nnz_d + B->nnz_o;
//.........这里部分代码省略.........
示例13: MPI_Reduce
void Master::collectStats() {
int64_t dummy = 0;
MPI_Reduce(&dummy, &engine.conflicts, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(&dummy, &engine.propagations, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(&dummy, &engine.opt_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
}
示例14: main
//.........这里部分代码省略.........
}
}
for (phase=1; phase<Num_groups; phase++){
recv_from = ((group_ID + phase )%Num_groups);
send_to = ((group_ID - phase + Num_groups)%Num_groups);
#ifndef SYNCHRONOUS
if (shm_ID==0) {
MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE,
recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req);
}
#endif
istart = send_to*Block_order;
if (!tiling) {
for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++)
for (j=0; j<Block_order; j++){
Work_out(j,i) = A(i,j);
}
}
else {
for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order)
for (j=0; j<Block_order; j+=Tile_order)
for (it=i; it<MIN(Block_order,i+Tile_order); it++)
for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
Work_out(jt,it) = A(it,jt);
}
}
/* NEED A LOAD/STORE FENCE HERE */
MPI_Barrier(shm_comm);
if (shm_ID==0) {
#ifndef SYNCHRONOUS
MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
phase, MPI_COMM_WORLD, &send_req);
MPI_Wait(&recv_req, &status);
MPI_Wait(&send_req, &status);
#else
MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
phase, Work_in_p, Block_size, MPI_DOUBLE,
recv_from*group_size, phase, MPI_COMM_WORLD, &status);
#endif
}
/* NEED A LOAD FENCE HERE */
MPI_Barrier(shm_comm);
istart = recv_from*Block_order;
/* scatter received block to transposed matrix; no need to tile */
for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
for (i=0; i<Block_order; i++)
B(i,j) = Work_in(i,j);
} /* end of phase loop */
} /* end of iterations */
local_trans_time = wtime() - local_trans_time;
MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
MPI_COMM_WORLD);
abserr = 0.0;
istart = 0;
/* for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */
for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
for (i=0;i<order; i++) {
abserr += ABS(B(i,j) - (double)(order*i + j+colstart));
}
MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
if (my_ID == root) {
if (abserr_tot < epsilon) {
printf("Solution validates\n");
avgtime = trans_time/(double)iterations;
printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
printf("Summed errors: %f \n", abserr_tot);
#endif
}
else {
printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon);
error = 1;
}
}
bail_out(error);
MPI_Win_free(&shm_win_A);
MPI_Win_free(&shm_win_B);
if (Num_groups>1) {
MPI_Win_free(&shm_win_Work_in);
MPI_Win_free(&shm_win_Work_out);
}
MPI_Info_free(&rma_winfo);
MPI_Finalize();
exit(EXIT_SUCCESS);
} /* end of main */
示例15: main
//.........这里部分代码省略.........
/*
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif */
/* source illumination
for(ix=0; ix<rnx; ix++){
for(iz=0; iz<rnz; iz++){
mig1[itau][ix][iz] = mig1[itau][ix][iz]/(sill[ix][iz]+SF_EPS);
}
} */
} //end of itau
/* output wavefield snapshot */
if(wantwf){
for(it=0; it<wfnt; it++){
if(it%snap==0){
sf_floatwrite(fwf[it][0], rnzx, Ffwf);
sf_floatwrite(bwf[wfnt-1-it][0], rnzx, Fbwf);
}
}
sf_fileclose(Ffwf);
sf_fileclose(Fbwf);
}
/* add all the shot images that are on the same node */
#ifdef _OPENMP
#pragma omp parallel for private(itau, ix, iz)
#endif
for(itau=0; itau<ntau; itau++){
for(ix=0; ix<rnx; ix++){
for(iz=0; iz<rnz; iz++){
img1[itau][ix+is*nds][iz] += mig1[itau][ix][iz];
}
}
}
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif
for(ix=0; ix<rnx; ix++){
for(iz=0; iz<rnz; iz++){
img2[ix+is*nds][iz] += mig2[ix][iz];
}
}
}
////////////////end of ishot
MPI_Barrier(comm);
cfft2_finalize();
sf_fileclose(Fdat);
free(ww); free(rr);
free(*dd); free(dd);
free(cwave); free(cwavem); free(curr);
free(*ccr); free(ccr);
free(*sill); free(sill);
free(**fwf); free(*fwf); free(fwf);
free(**bwf); free(*bwf); free(bwf);
free(**mig1); free(*mig1); free(mig1);
free(*mig2); free(mig2);
/* sum image */
if(cpuid==0){
sendbuf=(float *)MPI_IN_PLACE;
recvbuf=img1[0][0];
}else{
sendbuf=img1[0][0];
recvbuf=NULL;
}
MPI_Reduce(sendbuf, recvbuf, ntau*vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm);
if(cpuid==0){
sendbuf=MPI_IN_PLACE;
recvbuf=img2[0];
}else{
sendbuf=img2[0];
recvbuf=NULL;
}
MPI_Reduce(sendbuf, recvbuf, vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm);
/* output image */
if(cpuid==0){
sf_floatwrite(img1[0][0], ntau*vnx*rnz, Fimg1);
sf_floatwrite(img2[0], vnx*rnz, Fimg2);
}
MPI_Barrier(comm);
sf_fileclose(Fimg1);
sf_fileclose(Fimg2);
free(**img1); free(*img1); free(img1);
free(*img2); free(img2);
gettimeofday(&tim, NULL);
tend=tim.tv_sec+(tim.tv_usec/1000000.0);
sf_warning(">> The computing time is %.3lf minutes <<", (tend-tstart)/60.);
MPI_Finalize();
exit(0);
}