本文整理汇总了C++中MPI_Comm_rank函数的典型用法代码示例。如果您正苦于以下问题:C++ MPI_Comm_rank函数的具体用法?C++ MPI_Comm_rank怎么用?C++ MPI_Comm_rank使用的例子?那么, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了MPI_Comm_rank函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: CHECK
void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int new_height = this->layer_param_.image_data_param().new_height();
const int new_width = this->layer_param_.image_data_param().new_width();
const bool is_color = this->layer_param_.image_data_param().is_color();
string root_folder = this->layer_param_.image_data_param().root_folder();
CHECK((new_height == 0 && new_width == 0) ||
(new_height > 0 && new_width > 0)) << "Current implementation requires "
"new_height and new_width to be set at the same time.";
// Read the file with filenames and labels
const string& source = this->layer_param_.image_data_param().source();
LOG(INFO) << "Opening file " << source;
std::ifstream infile(source.c_str());
string filename;
int label;
while (infile >> filename >> label) {
lines_.push_back(std::make_pair(filename, label));
}
if (this->layer_param_.image_data_param().shuffle()) {
// randomly shuffle data
LOG(INFO) << "Shuffling data";
const unsigned int prefetch_rng_seed = caffe_rng_rand();
prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
ShuffleImages();
}
LOG(INFO) << "A total of " << lines_.size() << " images.";
lines_id_ = 0;
#ifdef USE_MPI
//for multi GPU test,all gpu get one part of val dataset
int rank,size=0;
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Comm_size(MPI_COMM_WORLD,&size);
lines_id_ = (lines_.size()/size)*rank;
#endif
// Check if we would need to randomly skip a few data points
if (this->layer_param_.image_data_param().rand_skip()) {
unsigned int skip = caffe_rng_rand() %
this->layer_param_.image_data_param().rand_skip();
LOG(INFO) << "Skipping first " << skip << " data points.";
CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
lines_id_ += skip;
}
// Read an image, and use it to initialize the top blob.
cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
new_height, new_width, is_color);
CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
// Use data_transformer to infer the expected blob shape from a cv_image.
vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
this->transformed_data_.Reshape(top_shape);
// Reshape prefetch_data and top[0] according to the batch_size.
const int batch_size = this->layer_param_.image_data_param().batch_size();
CHECK_GT(batch_size, 0) << "Positive batch size required";
top_shape[0] = batch_size;
for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
this->prefetch_[i].data_.Reshape(top_shape);
}
top[0]->Reshape(top_shape);
LOG(INFO) << "output data size: " << top[0]->num() << ","
<< top[0]->channels() << "," << top[0]->height() << ","
<< top[0]->width();
// label
vector<int> label_shape(1, batch_size);
top[1]->Reshape(label_shape);
for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
this->prefetch_[i].label_.Reshape(label_shape);
}
}
示例2: main
int main (int argc, char *argv[])
{
int myid, num_procs;
int n, N, pi, pj, pk;
double h;
double tol, theta;
int maxit, cycle_type;
int rlx_type, rlx_sweeps, rlx_weight, rlx_omega;
int amg_coarsen_type, amg_agg_levels, amg_rlx_type;
int amg_interp_type, amg_Pmax;
int singular_problem ;
HYPRE_Int time_index;
HYPRE_SStructGrid edge_grid;
HYPRE_SStructGraph A_graph;
HYPRE_SStructMatrix A;
HYPRE_SStructVector b;
HYPRE_SStructVector x;
HYPRE_SStructGrid node_grid;
HYPRE_SStructGraph G_graph;
HYPRE_SStructStencil G_stencil[3];
HYPRE_SStructMatrix G;
HYPRE_SStructVector xcoord, ycoord, zcoord;
HYPRE_Solver solver, precond;
/* Initialize MPI */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
/* Set default parameters */
n = 10;
optionAlpha = 0;
optionBeta = 0;
maxit = 100;
tol = 1e-6;
cycle_type = 13;
rlx_type = 2;
rlx_sweeps = 1;
rlx_weight = 1.0;
rlx_omega = 1.0;
amg_coarsen_type = 10;
amg_agg_levels = 1;
amg_rlx_type = 6;
theta = 0.25;
amg_interp_type = 6;
amg_Pmax = 4;
singular_problem = 0;
/* Parse command line */
{
int arg_index = 0;
int print_usage = 0;
while (arg_index < argc)
{
if ( strcmp(argv[arg_index], "-n") == 0 )
{
arg_index++;
n = atoi(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-a") == 0 )
{
arg_index++;
optionAlpha = atoi(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-b") == 0 )
{
arg_index++;
optionBeta = atoi(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-maxit") == 0 )
{
arg_index++;
maxit = atoi(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-tol") == 0 )
{
arg_index++;
tol = atof(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-type") == 0 )
{
arg_index++;
cycle_type = atoi(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-rlx") == 0 )
{
arg_index++;
rlx_type = atoi(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-rlxn") == 0 )
{
arg_index++;
rlx_sweeps = atoi(argv[arg_index++]);
}
else if ( strcmp(argv[arg_index], "-rlxw") == 0 )
//.........这里部分代码省略.........
示例3: main
int main(int argc,char** argv)
{
int taskid, ntasks;
MPI_Status status;
int ierr,i,j,itask;
int buffsize;
double **sendbuff,*recvbuff,buffsum,buffsums[1024];
double inittime,totaltime,recvtime,recvtimes[1024];
/*===============================================================*/
/* MPI Initialisation. Its important to put this call at the */
/* begining of the program, after variable declarations. */
MPI_Init(&argc, &argv);
/*===============================================================*/
/* Get the number of MPI tasks and the taskid of this task. */
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
/*===============================================================*/
/* Get buffsize value from program arguments. */
buffsize=atoi(argv[1]);
/*===============================================================*/
/* Printing out the description of the example. */
if ( taskid == 0 ){
printf("\n\n\n");
printf("##########################################################\n\n");
printf(" Example 4 \n\n");
printf(" Point-to-point Communication: MPI_Send MPI_Recv \n\n");
printf(" Vector size: %d\n",buffsize);
printf(" Number of tasks: %d\n\n",ntasks);
printf("##########################################################\n\n");
printf(" --> BEFORE COMMUNICATION <--\n\n");
}
if ( taskid == 0 ){
/*=============================================================*/
/* Memory allocation. */
sendbuff=(double **)malloc(sizeof(double *)*ntasks);
sendbuff[0]=(double *)malloc(sizeof(double)*ntasks*buffsize);
for(i=1;i<ntasks;i++)sendbuff[i]=sendbuff[i-1]+buffsize;
/*=============================================================*/
/* Vectors and/or matrices initalisation. */
srand((unsigned)time( NULL ) + taskid);
for(itask=0;itask<ntasks;itask++){
for(i=0;i<buffsize;i++){
sendbuff[itask][i]=(double)rand()/RAND_MAX;
}
}
/*==============================================================*/
/* Print out before communication. */
for(itask=1;itask<ntasks;itask++){
buffsum=0.0;
for(i=0;i<buffsize;i++){
buffsum=buffsum+sendbuff[itask][i];
}
printf("Task %d : Sum of vector sent to %d -> %e \n",
taskid,itask,buffsum);
}
}
else{
/*=============================================================*/
/* Memory allocation. */
recvbuff=(double *)malloc(sizeof(double)*buffsize);
}
/*===============================================================*/
/* Communication. */
inittime = MPI_Wtime();
if ( taskid == 0 ){
for(itask=1 ; itask<ntasks ; itask++){
ierr = MPI_Send(sendbuff[itask],
buffsize,
MPI_DOUBLE,
itask,
0,
MPI_COMM_WORLD);
}
}
else{
ierr = MPI_Recv(recvbuff,
buffsize,
MPI_DOUBLE,
0,
MPI_ANY_TAG,
//.........这里部分代码省略.........
示例4: main
int main(int argc, char *argv[])
{
FILE *parameterfile = NULL;
int j, i, ix = 0, isample = 0, op_id = 0;
char datafilename[206];
char parameterfilename[206];
char conf_filename[50];
char * input_filename = NULL;
char * filename = NULL;
double plaquette_energy;
struct stout_parameters params_smear;
spinor **s, *s_;
#ifdef _KOJAK_INST
#pragma pomp inst init
#pragma pomp inst begin(main)
#endif
#if (defined SSE || defined SSE2 || SSE3)
signal(SIGILL, &catch_ill_inst);
#endif
DUM_DERI = 8;
DUM_MATRIX = DUM_DERI + 5;
#if ((defined BGL && defined XLC) || defined _USE_TSPLITPAR)
NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
#else
NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
#endif
verbose = 0;
g_use_clover_flag = 0;
#ifdef MPI
# ifdef OMP
int mpi_thread_provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
# else
MPI_Init(&argc, &argv);
# endif
MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
#else
g_proc_id = 0;
#endif
process_args(argc,argv,&input_filename,&filename);
set_default_filenames(&input_filename, &filename);
/* Read the input file */
if( (j = read_input(input_filename)) != 0) {
fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
exit(-1);
}
#ifdef OMP
init_openmp();
#endif
/* this DBW2 stuff is not needed for the inversion ! */
if (g_dflgcr_flag == 1) {
even_odd_flag = 0;
}
g_rgi_C1 = 0;
if (Nsave == 0) {
Nsave = 1;
}
if (g_running_phmc) {
NO_OF_SPINORFIELDS = DUM_MATRIX + 8;
}
tmlqcd_mpi_init(argc, argv);
g_dbw2rand = 0;
/* starts the single and double precision random number */
/* generator */
start_ranlux(rlxd_level, random_seed);
/* we need to make sure that we don't have even_odd_flag = 1 */
/* if any of the operators doesn't use it */
/* in this way even/odd can still be used by other operators */
for(j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0;
#ifndef MPI
g_dbw2rand = 0;
#endif
#ifdef _GAUGE_COPY
j = init_gauge_field(VOLUMEPLUSRAND, 1);
#else
j = init_gauge_field(VOLUMEPLUSRAND, 0);
#endif
if (j != 0) {
fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
exit(-1);
}
j = init_geometry_indices(VOLUMEPLUSRAND);
//.........这里部分代码省略.........
示例5: getFileOutputName
//.........这里部分代码省略.........
oss << middlePart ;
if (hasEndDate) oss << splitEnd.getStr(splitFormat);
oss << lastPart ;
StdString keySuffix("CContext_"+CContext::getCurrent()->getId()+"::CFile_"+getFileOutputName()+"::") ;
context->registryOut->setKey(keySuffix+"splitStart", lastSplit);
context->registryOut->setKey(keySuffix+"splitEnd", splitEnd);
}
else oss<<firstPart<<lastPart ;
bool append = !this->append.isEmpty() && this->append.getValue();
bool useClassicFormat = !format.isEmpty() && format == format_attr::netcdf4_classic;
bool useCFConvention = convention.isEmpty() || convention == convention_attr::CF;
bool multifile = true;
if (!type.isEmpty())
{
if (type == type_attr::one_file) multifile = false;
else if (type == type_attr::multiple_file) multifile = true;
}
#ifndef USING_NETCDF_PAR
if (!multifile)
{
info(0) << "!!! Warning -> Using non parallel version of netcdf, switching in multiple_file mode for file : " << filename << " ..." << endl;
multifile = true;
}
#endif
if (multifile)
{
int commSize, commRank;
MPI_Comm_size(fileComm, &commSize);
MPI_Comm_rank(fileComm, &commRank);
if (server->intraCommSize > 1)
{
oss << "_" ;
int width=0; int n = commSize-1;
while (n != 0) { n = n / 10; width++;}
if (!min_digits.isEmpty())
if (width < min_digits) width = min_digits;
oss.width(width);
oss.fill('0');
oss << right << commRank;
}
}
oss << ".nc";
bool isCollective = par_access.isEmpty() || par_access == par_access_attr::collective;
if (isOpen) data_out->closeFile();
data_out = shared_ptr<CDataOutput>(new CNc4DataOutput(this, oss.str(), append, useClassicFormat, useCFConvention,
fileComm, multifile, isCollective, time_counter_name));
isOpen = true;
data_out->writeFile(CFile::get(this));
// Do not recreate the file structure if opening an existing file
if (!data_out->IsInAppendMode())
{
std::vector<CField*>::iterator it, end = this->enabledFields.end();
for (it = this->enabledFields.begin(); it != end; it++)
{
CField* field = *it;
示例6: CreateMesh
PetscErrorCode CreateMesh(MPI_Comm comm, AppCtx *user, DM *dm)
{
PetscInt dim = user->dim;
PetscBool interpolate = user->interpolate;
PetscBool refinementUniform = user->refinementUniform;
PetscReal refinementLimit = user->refinementLimit;
PetscBool cellSimplex = user->cellSimplex;
const char *filename = user->filename;
const char *partitioner = "chaco";
size_t len;
PetscMPIInt rank;
PetscErrorCode ierr;
PetscFunctionBegin;
ierr = PetscLogEventBegin(user->createMeshEvent,0,0,0,0);CHKERRQ(ierr);
ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
ierr = PetscStrlen(filename, &len);CHKERRQ(ierr);
if (len) {
#if defined(PETSC_HAVE_CGNS)
int cgid = -1;
if (!rank) {
ierr = cg_open(filename, CG_MODE_READ, &cgid);CHKERRQ(ierr);
if (cgid <= 0) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "cg_open(\"%s\",...) did not return a valid file ID", filename);
}
ierr = DMPlexCreateCGNS(comm, cgid, interpolate, dm);CHKERRQ(ierr);
if (!rank) {ierr = cg_close(cgid);CHKERRQ(ierr);}
#else
SETERRQ(comm, PETSC_ERR_SUP, "Loading meshes requires CGNS support. Reconfigure using --with-cgns-dir");
#endif
} else if (cellSimplex) {
ierr = DMPlexCreateBoxMesh(comm, dim, interpolate, dm);CHKERRQ(ierr);
} else {
const PetscInt cells[3] = {2, 2, 2};
ierr = DMPlexCreateHexBoxMesh(comm, dim, cells, dm);CHKERRQ(ierr);
}
{
DM refinedMesh = NULL;
DM distributedMesh = NULL;
/* Refine mesh using a volume constraint */
ierr = DMPlexSetRefinementUniform(*dm, PETSC_FALSE);CHKERRQ(ierr);
ierr = DMPlexSetRefinementLimit(*dm, refinementLimit);CHKERRQ(ierr);
ierr = DMRefine(*dm, comm, &refinedMesh);CHKERRQ(ierr);
if (refinedMesh) {
ierr = DMDestroy(dm);CHKERRQ(ierr);
*dm = refinedMesh;
}
/* Distribute mesh over processes */
ierr = DMPlexDistribute(*dm, partitioner, 0, &distributedMesh);CHKERRQ(ierr);
if (distributedMesh) {
ierr = DMDestroy(dm);CHKERRQ(ierr);
*dm = distributedMesh;
}
if (refinementUniform) {
ierr = DMPlexSetRefinementUniform(*dm, refinementUniform);CHKERRQ(ierr);
ierr = DMRefine(*dm, comm, &refinedMesh);CHKERRQ(ierr);
if (refinedMesh) {
ierr = DMDestroy(dm);CHKERRQ(ierr);
*dm = refinedMesh;
}
}
}
ierr = PetscObjectSetName((PetscObject) *dm, "Simplical Mesh");CHKERRQ(ierr);
ierr = DMSetFromOptions(*dm);CHKERRQ(ierr);
ierr = PetscLogEventEnd(user->createMeshEvent,0,0,0,0);CHKERRQ(ierr);
user->dm = *dm;
PetscFunctionReturn(0);
}
示例7: mcfft3_init
int mcfft3_init(int pad1 /* padding on the first axis */,
int nx, int ny, int nz /* input data size */,
int *nx2, int *ny2, int *nz2 /* padded data size */,
int *n_local, int *o_local /* local size & start */)
/*< initialize >*/
{
int i, nth=1;
int cpuid;
MPI_Comm_rank(MPI_COMM_WORLD, &cpuid);
fftwf_mpi_init();
/* axis 1 */
nk = n1 = kiss_fft_next_fast_size(nx*pad1);
/* axis 2 */
n2 = kiss_fft_next_fast_size(ny);
/* axis 3 */
n3 = kiss_fft_next_fast_size(nz);
alloc_local = fftwf_mpi_local_size_2d_transposed(n3, n2*n1, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start);
cc = sf_complexalloc(n1*n2*local_n0);
/* kiss-fft */
#ifdef _OPENMP
#pragma omp parallel
{nth = omp_get_num_threads();}
#endif
cfg1 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
icfg1 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
cfg2 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
icfg2 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
cfg3 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
icfg3 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
for (i=0; i < nth; i++) {
cfg1[i] = kiss_fft_alloc(n1,0,NULL,NULL);
icfg1[i]= kiss_fft_alloc(n1,1,NULL,NULL);
cfg2[i] = kiss_fft_alloc(n2,0,NULL,NULL);
icfg2[i]= kiss_fft_alloc(n2,1,NULL,NULL);
cfg3[i] = kiss_fft_alloc(n3,0,NULL,NULL);
icfg3[i]= kiss_fft_alloc(n3,1,NULL,NULL);
}
ctrace2= (kiss_fft_cpx **) sf_complexalloc2(n2,nth);
ctrace3= (kiss_fft_cpx **) sf_complexalloc2(n3,nth);
//tmp = (kiss_fft_cpx *) sf_complexalloc(alloc_local);
tmp = (kiss_fft_cpx *) sf_alloc(alloc_local,sizeof(kiss_fft_cpx));
tmp2= (sf_complex *) tmp;
/* fftw for transpose */
cfg = fftwf_mpi_plan_many_transpose(n3,n2*n1,2,
FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK,
(float *) tmp,
(float *) tmp,
MPI_COMM_WORLD,
FFTW_MEASURE);
icfg= fftwf_mpi_plan_many_transpose(n2*n1,n3,2,
FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK,
(float *) tmp,
(float *) tmp,
MPI_COMM_WORLD,
FFTW_MEASURE);
if (NULL == cfg || NULL == icfg) sf_error("FFTW failure.");
*nx2 = n1;
*ny2 = n2;
*nz2 = n3;
*n_local = (int) local_n0;
*o_local = (int) local_0_start;
wt = 1.0/(n3*n2*n1);
return (nk*n2*n3);
}
示例8: main
int main(int argc, char **argv)
{
int rank, size;
int i, j, k;
int **table, *buffer;
int begin_row, end_row, send_count, *recv_counts, *displs;
char errmsg[200];
MPI_Init( &argc, &argv );
Test_Init_No_File();
MPI_Comm_rank( MPI_COMM_WORLD, &rank );
MPI_Comm_size( MPI_COMM_WORLD, &size );
/* get buffer space and init table */
buffer = (int *) malloc( (size * BLOCKSIZE) * (size * BLOCKSIZE) * sizeof(int) );
table = (int **)malloc( (size * BLOCKSIZE) * sizeof(int *) );
recv_counts = (int *) malloc( size * sizeof(int) );
displs = (int *) malloc( size * sizeof(int) );
if( !buffer || !table || !recv_counts || !displs ) {
fprintf( stderr, "Out of memory error!\n" );
MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
}
for( i = 0; i < size * BLOCKSIZE; i++ )
table[i] = &(buffer[i*size*BLOCKSIZE]);
/* Determine what rows are my responsibility */
begin_row = rank * BLOCKSIZE;
end_row = (rank + 1) * BLOCKSIZE;
send_count = BLOCKSIZE * size * BLOCKSIZE;
for( i = 0; i < size; i++ ) {
recv_counts[i] = BLOCKSIZE * size * BLOCKSIZE;
displs[i] = i * BLOCKSIZE * size * BLOCKSIZE;
}
/* Paint my rows my color */
for( i = begin_row; i < end_row ; i++ )
for( j = 0; j < size * BLOCKSIZE; j++ )
table[i][j] = rank + 10;
/* Gather everybody's result together - sort of like an */
/* inefficient allgather */
for (i = 0; i < size; i++)
MPI_Gatherv(table[begin_row], send_count, MPI_INT,
table[0], recv_counts, displs, MPI_INT, i,
MPI_COMM_WORLD);
/* Everybody should have the same table now. */
for( i = 0; i < size; i++ )
for( j = 0; j < BLOCKSIZE; j++ )
for( k = 0; k < size * BLOCKSIZE; k++ )
if( table[i*BLOCKSIZE+j][k] != i + 10 ) {
sprintf(errmsg, "[%d] got %d expected %d for %dth entry in row %d\n",
rank, table[i*BLOCKSIZE+j][k], i + 10, k, i*BLOCKSIZE + j);
Test_Message( errmsg );
Test_Failed( NULL );
}
Test_Waitforall();
Test_Global_Summary();
free( buffer );
free( table );
free( recv_counts );
free( displs );
MPI_Finalize();
exit( EXIT_SUCCESS );
}
示例9: main
int main( int argc , char ** argv )
{
int comm_rank = 0 ;
#if defined( KOKKOS_ENABLE_MPI )
MPI_Init( & argc , & argv );
MPI_Comm comm = MPI_COMM_WORLD ;
MPI_Comm_rank( comm , & comm_rank );
#else
MPI_Comm comm = 0 ;
(void) comm ; // suppress warning
#endif
int cmdline[ CMD_COUNT ] ;
for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ;
if ( 0 == comm_rank ) {
for ( int i = 1 ; i < argc ; ++i ) {
if ( 0 == strcasecmp( argv[i] , "threads" ) ) {
cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] );
}
else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) {
cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] );
}
else if ( 0 == strcasecmp( argv[i] , "cores" ) ) {
sscanf( argv[++i] , "%dx%d" ,
cmdline + CMD_USE_NUMA ,
cmdline + CMD_USE_CORE_PER_NUMA );
}
else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) {
cmdline[ CMD_USE_CUDA ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) {
cmdline[ CMD_USE_CUDA ] = 1 ;
cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
}
else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) {
cmdline[ CMD_USE_ROCM ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
sscanf( argv[++i] , "%dx%dx%d" ,
cmdline + CMD_USE_FIXTURE_X ,
cmdline + CMD_USE_FIXTURE_Y ,
cmdline + CMD_USE_FIXTURE_Z );
}
else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) {
sscanf( argv[++i] , "%d..%d" ,
cmdline + CMD_USE_FIXTURE_BEGIN ,
cmdline + CMD_USE_FIXTURE_END );
}
else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) {
cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) {
cmdline[ CMD_USE_ATOMIC ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "trials" ) ) {
cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ;
}
else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) {
cmdline[ CMD_VTUNE ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "print" ) ) {
cmdline[ CMD_PRINT ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "echo" ) ) {
cmdline[ CMD_ECHO ] = 1 ;
}
else {
cmdline[ CMD_ERROR ] = 1 ;
std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ;
}
}
if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); }
}
#if defined( KOKKOS_ENABLE_MPI )
MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm );
#endif
if ( cmdline[ CMD_VTUNE ] ) {
std::stringstream cmd;
pid_t my_os_pid=getpid();
const std::string vtune_loc =
"/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl";
const std::string output_dir = "./vtune/vtune.";
const int p_rank = comm_rank;
cmd << vtune_loc
<< " -collect hotspots -result-dir " << output_dir << p_rank
<< " -target-pid " << my_os_pid << " &";
if (p_rank == 0)
std::cout << cmd.str() << std::endl;
system(cmd.str().c_str());
system("sleep 10");
}
if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) {
//.........这里部分代码省略.........
示例10: main
int main(int argc, char* argv[])
{
bool verb;
int it,iz,im,ikz,ikx,iky,ix,iy,i,j,snap; /* index variables */
int nt,nz,nx,ny, m2, nk, nzx, nz2, nx2, ny2, nzx2, n2, pad1;
float dt;
sf_complex c;
float *rr; /* I/O arrays*/
sf_complex *cwave, *cwavem, *ww;
sf_complex **wave, *curr;
float *rcurr, *rcurr_all;
sf_file Fw,Fr,Fo; /* I/O files */
sf_axis at,az,ax,ay; /* cube axes */
sf_complex **lt, **rt;
sf_file left, right, snaps;
/*MPI related*/
int cpuid,numprocs;
int provided;
int n_local, o_local;
int ozx2;
float *sendbuf, *recvbuf;
int *rcounts, *displs;
MPI_Init_thread(&argc,&argv,MPI_THREAD_FUNNELED,&provided);
threads_ok = provided >= MPI_THREAD_FUNNELED;
sf_init(argc,argv);
MPI_Comm_rank(MPI_COMM_WORLD, &cpuid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(!sf_getbool("verb",&verb)) verb=true; /* verbosity */
/* setup I/O files */
Fw = sf_input ("--input" );
Fo = sf_output("--output");
Fr = sf_input ("ref");
/* Read/Write axes */
at = sf_iaxa(Fw,1); nt = sf_n(at); dt = sf_d(at);
az = sf_iaxa(Fr,1); nz = sf_n(az);
ax = sf_iaxa(Fr,2); nx = sf_n(ax);
ay = sf_iaxa(Fr,3); ny = sf_n(ay);
if (!sf_getint("pad1",&pad1)) pad1=1; /* padding factor on the first axis */
if (!sf_getint("snap",&snap)) snap=0;
/* interval for snapshots */
if (cpuid==0) {
sf_oaxa(Fo,az,1);
sf_oaxa(Fo,ax,2);
sf_oaxa(Fo,ay,3);
sf_settype(Fo,SF_FLOAT);
if (snap > 0) {
snaps = sf_output("snaps");
/* (optional) snapshot file */
sf_oaxa(snaps,az,1);
sf_oaxa(snaps,ax,2);
sf_oaxa(snaps,ay,3);
sf_oaxa(snaps,at,4);
sf_settype(snaps,SF_FLOAT);
sf_putint(snaps,"n4",nt/snap);
sf_putfloat(snaps,"d4",dt*snap);
sf_putfloat(snaps,"o4",0.);
} else {
snaps = NULL;
}
}
//nk = cfft3_init(pad1,nz,nx,ny,&nz2,&nx2,&ny2);
//n_local = ny2;
//o_local = 0;
nk = mcfft3_init(pad1,nz,nx,ny,&nz2,&nx2,&ny2,&n_local,&o_local);
sf_warning("Cpuid=%d,n2=%d,n1=%d,n0=%d,local_n0=%d,local_0_start=%d",cpuid,nz2,nx2,ny2,n_local,o_local);
nzx = nz*nx*ny;
//nzx2 = nz2*nx2*ny2;
nzx2 = n_local*nz2*nx2;
ozx2 = o_local*nz2*nx2;
/* propagator matrices */
left = sf_input("left");
right = sf_input("right");
if (!sf_histint(left,"n1",&n2) || n2 != nzx) sf_error("Need n1=%d in left",nzx);
if (!sf_histint(left,"n2",&m2)) sf_error("Need n2=%d in left",m2);
if (!sf_histint(right,"n1",&n2) || n2 != m2) sf_error("Need n1=%d in right",m2);
if (!sf_histint(right,"n2",&n2) || n2 != nk) sf_error("Need n2=%d in right",nk);
//.........这里部分代码省略.........
示例11: run
void run( MPI_Comm comm , const int cmd[] )
{
int comm_rank = 0 ;
#if defined( KOKKOS_ENABLE_MPI )
MPI_Comm_rank( comm , & comm_rank );
#else
comm = 0 ;
#endif
if ( 0 == comm_rank ) {
if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
else if ( cmd[ CMD_USE_ROCM ] ) { std::cout << "ROCM" ; }
if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
else { std::cout << " , LINEAR-ELEMENT" ; }
if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; }
}
std::vector< std::pair<std::string,std::string> > headers;
headers.push_back(std::make_pair("ELEMS","count"));
headers.push_back(std::make_pair("NODES","count"));
headers.push_back(std::make_pair("NEWTON","iter"));
headers.push_back(std::make_pair("CG","iter"));
headers.push_back(std::make_pair("MAP_RATIO","ratio"));
headers.push_back(std::make_pair("SET_FILL/NODE","millisec"));
headers.push_back(std::make_pair("SCAN/NODE","millisec"));
headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec"));
headers.push_back(std::make_pair("SORT/NODE","millisec"));
headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec"));
headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec"));
headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec"));
headers.push_back(std::make_pair("BOUNDARY/NODE","millisec"));
headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec"));
headers.push_back(std::make_pair("CG/ITER/ROW","millisec"));
headers.push_back(std::make_pair("ERROR","ratio"));
// find print widths
size_t min_width = 10;
std::vector< size_t > widths(headers.size());
for (size_t i=0, ie=headers.size(); i<ie; ++i)
widths[i] = std::max(min_width, headers[i].first.size()+1);
// print column headers
if ( 0 == comm_rank ) {
std::cout << std::endl ;
for (size_t i=0; i<headers.size(); ++i)
std::cout << std::setw(widths[i]) << headers[i].first << " ,";
std::cout << "\b\b " << std::endl;
for (size_t i=0; i<headers.size(); ++i)
std::cout << std::setw(widths[i]) << headers[i].second << " ,";
std::cout << "\b\b " << std::endl;
std::cout << std::scientific;
std::cout.precision(3);
}
if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
for ( int i = cmd[CMD_USE_FIXTURE_BEGIN] ; i < cmd[CMD_USE_FIXTURE_END] * 2 ; i *= 2 ) {
int nelem[3] ;
nelem[0] = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
nelem[1] = 1 + nelem[0] ;
nelem[2] = 2 * nelem[0] ;
const Kokkos::Example::FENL::Perf perf =
cmd[ CMD_USE_FIXTURE_QUADRATIC ]
? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
: Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
;
if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
}
}
else {
int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] ,
cmd[ CMD_USE_FIXTURE_Y ] ,
cmd[ CMD_USE_FIXTURE_Z ] };
const Kokkos::Example::FENL::Perf perf =
cmd[ CMD_USE_FIXTURE_QUADRATIC ]
? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
: Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
;
if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
}
}
示例12: main
int main(int argc,char **argv)
{
PetscMPIInt rank;
PetscInt M = -10,N = -8;
PetscErrorCode ierr;
PetscBool flg = PETSC_FALSE;
DM da;
PetscViewer viewer;
Vec local,global;
PetscScalar value;
DMDABoundaryType bx = DMDA_BOUNDARY_NONE,by = DMDA_BOUNDARY_NONE;
DMDAStencilType stype = DMDA_STENCIL_BOX;
#if defined(PETSC_HAVE_MATLAB_ENGINE)
PetscViewer mviewer;
#endif
ierr = PetscInitialize(&argc,&argv,(char*)0,help);CHKERRQ(ierr);
ierr = PetscViewerDrawOpen(PETSC_COMM_WORLD,0,"",300,0,300,300,&viewer);CHKERRQ(ierr);
#if defined(PETSC_HAVE_MATLAB_ENGINE)
ierr = PetscViewerMatlabOpen(PETSC_COMM_WORLD,"tmp.mat",FILE_MODE_WRITE,&mviewer);CHKERRQ(ierr);
#endif
ierr = PetscOptionsGetBool(PETSC_NULL,"-star_stencil",&flg,PETSC_NULL);CHKERRQ(ierr);
if (flg) stype = DMDA_STENCIL_STAR;
/* Create distributed array and get vectors */
ierr = DMDACreate2d(PETSC_COMM_WORLD,bx,by,stype,M,N,PETSC_DECIDE,PETSC_DECIDE,1,1,PETSC_NULL,PETSC_NULL,&da);CHKERRQ(ierr);
ierr = DMCreateGlobalVector(da,&global);CHKERRQ(ierr);
ierr = DMCreateLocalVector(da,&local);CHKERRQ(ierr);
value = -3.0;
ierr = VecSet(global,value);CHKERRQ(ierr);
ierr = DMGlobalToLocalBegin(da,global,INSERT_VALUES,local);CHKERRQ(ierr);
ierr = DMGlobalToLocalEnd(da,global,INSERT_VALUES,local);CHKERRQ(ierr);
ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
value = rank+1;
ierr = VecScale(local,value);CHKERRQ(ierr);
ierr = DMLocalToGlobalBegin(da,local,ADD_VALUES,global);CHKERRQ(ierr);
ierr = DMLocalToGlobalEnd(da,local,ADD_VALUES,global);CHKERRQ(ierr);
flg = PETSC_FALSE;
ierr = PetscOptionsGetBool(PETSC_NULL, "-view_global", &flg,PETSC_NULL);CHKERRQ(ierr);
if (flg) { /* view global vector in natural ordering */
ierr = VecView(global,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
}
ierr = DMView(da,viewer);CHKERRQ(ierr);
ierr = VecView(global,viewer);CHKERRQ(ierr);
#if defined(PETSC_HAVE_MATLAB_ENGINE)
ierr = DMView(da,mviewer);CHKERRQ(ierr);
ierr = VecView(global,mviewer);CHKERRQ(ierr);
#endif
/* Free memory */
#if defined(PETSC_HAVE_MATLAB_ENGINE)
ierr = PetscViewerDestroy(&mviewer);CHKERRQ(ierr);
#endif
ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr);
ierr = VecDestroy(&local);CHKERRQ(ierr);
ierr = VecDestroy(&global);CHKERRQ(ierr);
ierr = DMDestroy(&da);CHKERRQ(ierr);
ierr = PetscFinalize();
return 0;
}
示例13: HYPRE_LSI_PolySetup
int HYPRE_LSI_PolySetup(HYPRE_Solver solver, HYPRE_ParCSRMatrix A_csr,
HYPRE_ParVector b, HYPRE_ParVector x )
{
int i, j, my_id, startRow, endRow, order;
int pos_diag, neg_diag;
int rowLeng, *colInd, *row_partition;
double *coefs=NULL, rowsum, max_norm, *colVal;
HYPRE_LSI_Poly *poly_ptr = (HYPRE_LSI_Poly *) solver;
#ifndef HYPRE_SEQUENTIAL
double dtemp;
#endif
/* ---------------------------------------------------------------- */
/* initialize structure */
/* ---------------------------------------------------------------- */
order = poly_ptr->order;
coefs = (double *) malloc((order+1) * sizeof(double));
poly_ptr->coefficients = coefs;
/* ---------------------------------------------------------------- */
/* compute matrix norm */
/* ---------------------------------------------------------------- */
HYPRE_ParCSRMatrixGetRowPartitioning( A_csr, &row_partition );
#ifdef HYPRE_SEQUENTIAL
my_id = 0;
#else
MPI_Comm_rank(poly_ptr->comm, &my_id);
#endif
startRow = row_partition[my_id];
endRow = row_partition[my_id+1] - 1;
hypre_TFree( row_partition );
poly_ptr->Nrows = endRow - startRow + 1;
max_norm = 0.0;
pos_diag = neg_diag = 0;
for ( i = startRow; i <= endRow; i++ )
{
HYPRE_ParCSRMatrixGetRow(A_csr, i, &rowLeng, &colInd, &colVal);
rowsum = 0.0;
for (j = 0; j < rowLeng; j++)
{
rowsum += habs(colVal[j]);
if ( colInd[j] == i && colVal[j] > 0.0 ) pos_diag++;
if ( colInd[j] == i && colVal[j] < 0.0 ) neg_diag++;
}
if ( rowsum > max_norm ) max_norm = rowsum;
HYPRE_ParCSRMatrixRestoreRow(A_csr, i, &rowLeng, &colInd, &colVal);
}
#ifndef HYPRE_SEQUENTIAL
MPI_Allreduce(&max_norm, &dtemp, 1, MPI_INT, MPI_MAX, poly_ptr->comm);
#endif
if ( pos_diag == 0 && neg_diag > 0 ) max_norm = - max_norm;
/* ---------------------------------------------------------------- */
/* fill in the coefficient table */
/* ---------------------------------------------------------------- */
switch ( order )
{
case 0: coefs[0] = 1.0; break;
case 1: coefs[0] = 5.0; coefs[1] = -1.0; break;
case 2: coefs[0] = 14.0; coefs[1] = -7.0; coefs[2] = 1.0;
break;
case 3: coefs[0] = 30.0; coefs[1] = -27.0; coefs[2] = 9.0;
coefs[3] = -1.0; break;
case 4: coefs[0] = 55.0; coefs[1] = -77.0; coefs[2] = 44.0;
coefs[3] = -11.0; coefs[4] = 1.0; break;
case 5: coefs[0] = 91.0; coefs[1] = -182.0; coefs[2] = 156.0;
coefs[3] = -65.0; coefs[4] = 13.0; coefs[5] = -1.0;
break;
case 6: coefs[0] = 140.0; coefs[1] = -378.0; coefs[2] = 450.0;
coefs[3] = -275.0; coefs[4] = 90.0; coefs[5] = -15.0;
coefs[6] = 1.0; break;
case 7: coefs[0] = 204.0; coefs[1] = -714.0; coefs[2] = 1122.0;
coefs[3] = -935.0; coefs[4] = 442.0; coefs[5] = -119.0;
coefs[6] = 17.0; coefs[7] = -1.0; break;
case 8: coefs[0] = 285.0; coefs[1] = -1254.0; coefs[2] = 2508.0;
coefs[3] = -2717.0; coefs[4] = 1729.0; coefs[5] = -665.0;
coefs[6] = 152.0; coefs[7] = -19.0; coefs[8] = 1.0;
break;
}
for( i = 0; i <= order; i++ )
coefs[i] *= pow( 4.0 / max_norm, (double) i);
return 0;
}
示例14: MAIN__
int MAIN__()
{
int argc=1;
char * name = "c_example";
char ** argv ;
#else
int main(int argc, char ** argv)
{
#endif
DMUMPS_STRUC_C id;
int n = 6;
int nz = 21;
int irnL[] = {1,2,3,4,5,6,2,3,4,5,6,3,4,5,6,4,5,6,5,6,6};
int jcnL[] = {1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,5,5,6};
int irnS[] = {1,2,3,2,3,3};
int jcnS[] = {1,1,1,2,2,3};
int *irn = irnL;
int *jcn = jcnL;
double a[21];
double rhs[6];
int myid, ierr, numP;
#if defined(MAIN_COMP)
argv = &name;
#endif
ierr = MPI_Init(&argc, &argv);
ierr = MPI_Comm_rank(MPI_COMM_WORLD, &myid);
ierr = MPI_Comm_size(MPI_COMM_WORLD, &numP);
double nump = numP * 1.0;
/* Define A and rhs */
rhs[0]=2.0;rhs[1]=2.0;rhs[2]=2.0;rhs[3]=2.0;rhs[4]=2.0;rhs[5]=2.0;
a[0]=4169.95/nump;
a[1]=0.0;
a[2]=10075.0/nump;
a[3]=-4030;
a[4]=0.0;
a[5]=0.0;
a[6]=10084.0/nump;
a[7]=-1612.0/nump;
a[8]=0.0;
a[9]=-8.9556;
a[10]=-1612;
a[11]=1354080.0/nump;
a[12]=0.0;
a[13]=1612.0;
a[14]=193440.0;
a[15]=4169.93;
a[16]=0.0;
a[17]=10075;
a[18]=10084;
a[19]=1612;
a[20]=1354000;
if (myid != 0) {
nz = 6;
a[0]=4169.95/nump;
a[1]=0.0;
a[2]=10075.0/nump;
a[3]=10084.0/nump;
a[4]=-1612.0/nump;
a[5]=1354080.0/nump;
irn = irnS;
jcn = jcnS;
}
#define ICNTL(I) icntl[(I)-1] /* macro s.t. indices match documentation */
/* Initialize a MUMPS instance. Use MPI_COMM_WORLD */
id.job=JOB_INIT; id.par=1; id.sym=2; id.comm_fortran=USE_COMM_WORLD;
/* parallel solver; distributed i/p matrix A */
id.ICNTL(5)=0; id.ICNTL(18)=3;
dmumps_c(&id);
/* Define the problem on the host */
/*
id.n = n; id.nz =nz; id.irn=irn; id.jcn=jcn;
id.a = a; id.rhs = rhs;
*/
/* parallel solver; distributed i/p matrix A */
id.ICNTL(5)=0; id.ICNTL(18)=3;
id.n = n; id.nz_loc =nz; id.irn_loc=irn; id.jcn_loc=jcn;
id.a_loc = a; id.rhs = rhs;
/* No outputs */
id.ICNTL(1)=-1; id.ICNTL(2)=-1; id.ICNTL(3)=-1; id.ICNTL(4)=0;
//.........这里部分代码省略.........
示例15: MPI_Comm_rank
std::unique_ptr<const bfly::PotentialField<R,d,q>>
transform
( const bfly::Context<R,d,q>& context,
const Plan<d>& plan,
const Amplitude<R,d>& amplitude,
const Phase<R,d>& phase,
const Box<R,d>& sBox,
const Box<R,d>& tBox,
const vector<Source<R,d>>& mySources )
{
#ifdef TIMING
bfly::ResetTimers();
bfly::timer.Start();
#endif
typedef complex<R> C;
const size_t q_to_d = Pow<q,d>::val;
// Extract our communicator and its size
MPI_Comm comm = plan.GetComm();
int rank, numProcesses;
MPI_Comm_rank( comm, &rank );
MPI_Comm_size( comm, &numProcesses );
// Get the problem-specific parameters
const size_t N = plan.GetN();
const size_t log2N = Log2( N );
const array<size_t,d>& myInitialSBoxCoords =
plan.GetMyInitialSourceBoxCoords();
const array<size_t,d>& log2InitialSBoxesPerDim =
plan.GetLog2InitialSourceBoxesPerDim();
array<size_t,d> mySBoxCoords = myInitialSBoxCoords;
array<size_t,d> log2SBoxesPerDim = log2InitialSBoxesPerDim;
Box<R,d> mySBox;
for( size_t j=0; j<d; ++j )
{
mySBox.widths[j] = sBox.widths[j] / (1u<<log2SBoxesPerDim[j]);
mySBox.offsets[j] = sBox.offsets[j] + mySBox.widths[j]*mySBoxCoords[j];
}
array<size_t,d> myTBoxCoords, log2TBoxesPerDim;
myTBoxCoords.fill(0);
log2TBoxesPerDim.fill(0);
Box<R,d> myTBox;
myTBox = tBox;
const size_t bootstrap = plan.GetBootstrapSkip();
// Compute the number of source and target boxes that our process is
// responsible for initializing weights in
size_t log2WeightGridSize = 0;
size_t log2LocalSBoxes = 0;
size_t log2LocalTBoxes = 0;
array<size_t,d> log2LocalSBoxesPerDim, log2LocalTBoxesPerDim;
log2LocalTBoxesPerDim.fill(0);
for( size_t j=0; j<d; ++j )
{
if( log2N-log2SBoxesPerDim[j] >= bootstrap )
log2LocalSBoxesPerDim[j]= (log2N-log2SBoxesPerDim[j]) - bootstrap;
else
log2LocalSBoxesPerDim[j] = 0;
log2LocalTBoxesPerDim[j] = bootstrap;
log2LocalSBoxes += log2LocalSBoxesPerDim[j];
log2LocalTBoxes += log2LocalTBoxesPerDim[j];
log2WeightGridSize += log2N-log2SBoxesPerDim[j];
}
// Initialize the weights using Lagrangian interpolation on the
// smooth component of the kernel.
WeightGridList<R,d,q> weightGridList( 1u<<log2WeightGridSize );
#ifdef TIMING
bfly::initializeWeightsTimer.Start();
#endif
bfly::InitializeWeights
( context, plan, phase, sBox, tBox, mySBox,
log2LocalSBoxes, log2LocalSBoxesPerDim, mySources, weightGridList );
#ifdef TIMING
bfly::initializeWeightsTimer.Stop();
#endif
// Now cut the target domain if necessary
for( size_t j=0; j<d; ++j )
{
if( log2LocalSBoxesPerDim[j] == 0 )
{
log2LocalTBoxesPerDim[j] -= bootstrap - (log2N-log2SBoxesPerDim[j]);
log2LocalTBoxes -= bootstrap - (log2N-log2SBoxesPerDim[j]);
}
}
// Start the main recursion loop
if( bootstrap == log2N/2 )
{
#ifdef TIMING
bfly::M2LTimer.Start();
#endif
bfly::M2L
( context, plan, amplitude, phase, sBox, tBox, mySBox, myTBox,
log2LocalSBoxes, log2LocalTBoxes,
log2LocalSBoxesPerDim, log2LocalTBoxesPerDim, weightGridList );
#ifdef TIMING
bfly::M2LTimer.Stop();
//.........这里部分代码省略.........