当前位置: 首页>>代码示例>>C++>>正文


C++ op_timers_core函数代码示例

本文整理汇总了C++中op_timers_core函数的典型用法代码示例。如果您正苦于以下问题：C++ op_timers_core函数的具体用法是什么？C++ op_timers_core怎么用？有哪些C++ op_timers_core的使用例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了op_timers_core函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: op_par_loop_res

// Host stub for the "res" loop (sequential execution).
//
// Gathers the four argument descriptors into an array, starts the MPI halo
// exchange and then calls the user kernel res() once for every index below
// the count returned by op_mpi_halo_exchanges().  arg0 is addressed by the
// loop index itself, arg1 and arg2 are addressed through entries 1 and 0 of
// the matching row of arg1's map, and arg3 is passed as one shared pointer.
//
//   name        label stored in the OP_kernels[0] timing record
//   set         iteration set; set->size and set->core_size are read
//   arg0..arg3  data descriptors handed on to res()
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3){

  int nargs = 4;
  op_arg args[4] = {arg0, arg1, arg2, arg3};

  // start the clocks; record 0 of OP_kernels is used by this loop
  double cpu_begin, cpu_end, wall_begin, wall_end;
  op_timing_realloc(0);
  op_timers_core(&cpu_begin, &wall_begin);

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: res\n");
  }

  int exec_count = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {
    // typed views of the raw data buffers
    double *dat0 = (double *)arg0.data;
    double *dat1 = (double *)arg1.data;
    double *dat2 = (double *)arg2.data;
    double *dat3 = (double *)arg3.data;

    int elem = 0;
    while (elem < exec_count) {
      // op_mpi_wait_all is called when the index reaches core_size
      // (presumably the first element that needs halo data - confirm)
      if (elem == set->core_size) {
        op_mpi_wait_all(nargs, args);
      }

      // both indirect arguments are looked up in arg1's map
      int idx_for_arg1 = arg1.map_data[elem * arg1.map->dim + 1];
      int idx_for_arg2 = arg1.map_data[elem * arg1.map->dim + 0];

      res(dat0 + elem, dat1 + idx_for_arg1, dat2 + idx_for_arg2, dat3);

      elem++;
    }
  }

  // the loop above never reaches the wait when nothing ran or when the
  // executed range ends exactly at core_size, so issue it here instead
  if (exec_count == 0 || exec_count == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }

  op_mpi_set_dirtybit(nargs, args);

  // stop the clocks and fold this invocation into the kernel record
  op_timers_core(&cpu_end, &wall_end);

  const float n_elems = (float)set->size;
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_end - wall_begin;
  OP_kernels[0].transfer += n_elems * arg1.size;
  OP_kernels[0].transfer += n_elems * arg2.size * 2.0f;
  OP_kernels[0].transfer += n_elems * arg0.size;
  OP_kernels[0].transfer += n_elems * arg3.size;
  OP_kernels[0].transfer += n_elems * arg1.map->dim * 4.0f;
}
开发者ID:OP2,项目名称:OP2-Common,代码行数:61,代码来源:res_seqkernel.cpp

示例2: op_par_loop_save_soln

// Host stub for the direct "save_soln" loop.
//
// Splits the index range [0, set->size) into nthreads contiguous chunks and
// hands each chunk to op_x86_save_soln(), then flags every OP_ARG_DAT
// argument through set_dirtybit() and updates timing record 0.
//
//   name        label stored in the OP_kernels[0] timing record
//   set         iteration set; only set->size is read here
//   arg0, arg1  data descriptors whose .data buffers are passed to the kernel
void op_par_loop_save_soln(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int nargs   = 2;
  op_arg args[2] = {arg0,arg1};

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  save_soln \n");
  }

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // execute plan: chunk thr covers [start, finish); consecutive chunks
  // share their boundary, so together they tile [0, set->size) exactly

#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_save_soln( (double *) arg0.data,
                      (double *) arg1.data,
                      start, finish );
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i<nargs; i++) {
    if (args[i].argtype == OP_ARG_DAT) {
      set_dirtybit(args[i]);
    }
  }

  // perform any global operations
  // - NONE

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (double)set->size * arg0.size;
  OP_kernels[0].transfer += (double)set->size * arg1.size;
}
开发者ID:ioz9,项目名称:OP2-Common,代码行数:56,代码来源:save_soln_mpi_kernel.cpp

示例3: op_par_loop_save_soln_cpu

// Host stub for the direct "save_soln" loop (threaded CPU variant).
//
// Splits the index range [0, set->size) into nthreads contiguous chunks and
// calls the user kernel save_soln() for every index, passing the slices of
// arg0 and arg1 that start at offset 4*n doubles.
//
//   name        label stored in the OP_kernels[0] timing record
//   set         iteration set; only set->size is read here
//   arg0, arg1  data descriptors whose .data buffers are passed to save_soln()
void op_par_loop_save_soln_cpu(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1){

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);


  if (OP_diags>2) {
    // terminate the line so the diagnostic does not run into the next output
    printf(" kernel routine w/o indirection:  save_soln\n");
  }

  // the returned count is not needed: the loop below is bounded by set->size
  op_mpi_halo_exchanges(set, nargs, args);
  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  if (set->size >0) {

    // execute plan: chunk thr covers [start, finish)
    #pragma omp parallel for
    for ( int thr=0; thr<nthreads; thr++ ){
      int start  = (set->size* thr)/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      for ( int n=start; n<finish; n++ ){
        save_soln(
          &((double*)arg0.data)[4*n],
          &((double*)arg1.data)[4*n]);
      }
    }
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
}
开发者ID:pushkarjain1991,项目名称:OP2-Common,代码行数:55,代码来源:save_soln_cpu_kernel.cpp

示例4: op_par_loop_update

// Host stub for the direct "update" loop (sequential execution).
//
// Calls the user kernel update() once for every index below the count
// returned by op_mpi_halo_exchanges().  arg0, arg1 and arg2 are addressed per
// element as float; arg3 and arg4 are passed as whole buffers and are
// afterwards handed to op_mpi_reduce_float().
//
//   name        label stored in the OP_kernels[1] timing record
//   set         iteration set; only set->size is read here
//   arg0..arg4  data descriptors forwarded to update()
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4){

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);


  if (OP_diags>2) {
    // terminate the line so the diagnostic does not run into the next output
    printf(" kernel routine w/o indirection:  update\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      update(
        &((float*)arg0.data)[1*n],
        &((float*)arg1.data)[1*n],
        &((float*)arg2.data)[1*n],
        (float*)arg3.data,
        (float*)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce_float(&arg3,(float*)arg3.data);
  op_mpi_reduce_float(&arg4,(float*)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
  OP_kernels[1].transfer += (float)set->size * arg0.size;
  OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f;
}
开发者ID:OP2,项目名称:OP2-Common,代码行数:55,代码来源:update_seqkernel.cpp

示例5: op_par_loop_res_calc

void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9 ){


  int    nargs   = 13;
  op_arg args[13];

  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 4; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for (int v = 1; v < 4; v++) {
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for (int v = 1; v < 4; v++) {
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  int    ninds   = 3;
  int    inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {


    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
         (double *)arg0.data,
         (double *)arg4.data,
         (double *)arg9.data,
         Plan->ind_map,
         Plan->loc_map,
         (double *)arg8.data,
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);

      block_offset += nblocks;
    }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
//.........这里部分代码省略.........
开发者ID:xyuan,项目名称:OP2-Common,代码行数:101,代码来源:res_calc_kernel.cpp

示例6: op_par_loop_update

// Host stub for the direct "update" loop with one global reduction (arg4).
//
// Splits the index range [0, set->size) into nthreads contiguous chunks and
// hands each chunk to op_x86_update().  Every chunk accumulates into its own
// 64-double slot of a stack scratch array; the slots are then summed into
// arg4's host buffer, dirty bits are set for OP_ARG_DAT arguments and
// global_reduce() is called for OP_ARG_GBL arguments.
//
//   name        label stored in the OP_kernels[4] timing record
//   set         iteration set; only set->size is read here
//   arg0..arg3  data descriptors whose .data buffers are passed to the kernel
//   arg4        reduction target; arg4.data is incremented with the partial sums
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  int nargs   = 5;
  op_arg args[5] = {arg0,arg1,arg2,arg3,arg4};

  double *arg4h = (double *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  update \n");
  }

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // The scratch array below provides one 64-double slot per chunk and is
  // indexed with thr*64, so more than 64 chunks would write past its end.
  // The loops iterate over chunk numbers rather than OpenMP thread ids, so
  // capping the chunk count is safe whatever the real team size is.
  if (nthreads > 64) {
    nthreads = 64;
  }

  // allocate and initialise arrays for global reduction

  double arg4_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++) {
    for (int d=0; d<1; d++) {
      arg4_l[d+thr*64]=ZERO_double;
    }
  }

  // execute plan: chunk thr covers [start, finish)

#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_update( (double *) arg0.data,
                   (double *) arg1.data,
                   (double *) arg2.data,
                   (double *) arg3.data,
                   arg4_l + thr*64,
                   start, finish );
  }

  // combine reduction data

  for (int thr=0; thr<nthreads; thr++) {
    for (int d=0; d<1; d++) {
      arg4h[d] += arg4_l[d+thr*64];
    }
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i<nargs; i++) {
    if (args[i].argtype == OP_ARG_DAT) {
      set_dirtybit(args[i]);
    }
  }

  // perform any global operations
  for (int i = 0; i<nargs; i++) {
    if (args[i].argtype == OP_ARG_GBL) {
      global_reduce(&args[i]);
    }
  }

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (double)set->size * arg0.size;
  OP_kernels[4].transfer += (double)set->size * arg1.size;
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;
}
开发者ID:ioz9,项目名称:OP2-Common,代码行数:79,代码来源:update_mpi_kernel.cpp

示例7: op_par_loop_res_calc

// host stub function
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9,
  op_arg arg13){

  int nargs = 17;
  op_arg args[17];

  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }

  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }


  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  int  ninds   = 4;
  int  inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_0
    int nthread = OP_BLOCK_SIZE_0;
  #else
    int nthread = OP_block_size;
  #endif


  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
     int map0size = arg0.map->dim * set_size1;

    double* data8 = (double*)arg8.data_d;
    int dat8size = (arg8.opt?1:0) * getSetSizeFromOpArg(&arg8) * arg8.dat->dim;
    double *data0 = (double *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
    double *data4 = (double *)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    double *data9 = (double *)arg9.data_d;
    int dat9size =
        (arg9.opt ? 1 : 0) * getSetSizeFromOpArg(&arg9) * arg9.dat->dim;
    double *data13 = (double *)arg13.data_d;
    int dat13size =
        (arg13.opt ? 1 : 0) * getSetSizeFromOpArg(&arg13) * arg13.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
//.........这里部分代码省略.........
开发者ID:OP2,项目名称:OP2-Common,代码行数:101,代码来源:res_calc_omp4kernel.cpp

示例8: op_par_loop_EvolveValuesRK2_1

// Host stub for the direct "EvolveValuesRK2_1" loop.
//
// Splits the index range [0, set->size) into nthreads contiguous chunks and
// hands each chunk to op_x86_EvolveValuesRK2_1(), then sets dirty bits and
// updates timing record 0.
//
//   name        label stored in the OP_kernels[0] timing record
//   set         iteration set; only set->size is read here
//   arg0..arg4  data descriptors whose .data buffers are passed to the kernel
void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){


  int    nargs   = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  EvolveValuesRK2_1\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // Take the start timestamp unconditionally.  The elapsed time below is
  // accumulated even when the set is empty, so skipping this call for an
  // empty set would add (wall_t2 - 0), i.e. the absolute clock reading, to
  // the kernel's total.
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    // execute plan: chunk thr covers [start, finish)

#pragma omp parallel for
    for (int thr=0; thr<nthreads; thr++) {
      int start  = (set->size* thr   )/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      op_x86_EvolveValuesRK2_1( (float *) arg0.data,
                                (float *) arg1.data,
                                (float *) arg2.data,
                                (float *) arg3.data,
                                (float *) arg4.data,
                                start, finish );
    }

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg2.size;
  OP_kernels[0].transfer += (float)set->size * arg3.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
}
开发者ID:xyliuucl,项目名称:Volna-1,代码行数:72,代码来源:EvolveValuesRK2_1_kernel.cpp

示例9: op_par_loop_res_calc

// Host stub for the indirect "res_calc" loop (plan-based execution).
//
// Obtains an execution plan from op_plan_get() and walks it colour by
// colour; within one colour every block is passed to op_x86_res_calc()
// inside an OpenMP parallel loop.  block_offset is advanced by the number of
// blocks of each finished colour.
//
//   name        label passed to op_plan_get() and stored in OP_kernels[2]
//   set         iteration set passed to op_plan_get()
//   arg0..arg7  data descriptors; the .data buffers of arg0, arg2, arg4 and
//               arg6 are passed to the kernel together with the plan's
//               ind_maps[0..3] and loc_maps[0..7]
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6,
  op_arg arg7 ){


  int    nargs   = 8;
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};

  // inds[i] is the indirection slot of argument i: arguments are paired
  // (0,1) (2,3) (4,5) (6,7) onto slots 0..3
  int    ninds   = 4;
  int    inds[8] = {0,0,1,1,2,2,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc \n");
  }

  // get plan

  #ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers (plan construction above is not part of the timing)

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // execute plan

  int block_offset = 0;

  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
      op_x86_res_calc( blockIdx,
        (double *)arg0.data, Plan->ind_maps[0],
        (double *)arg2.data, Plan->ind_maps[1],
        (double *)arg4.data, Plan->ind_maps[2],
        (double *)arg6.data, Plan->ind_maps[3],
        Plan->loc_maps[0],
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        Plan->loc_maps[3],
        Plan->loc_maps[4],
        Plan->loc_maps[5],
        Plan->loc_maps[6],
        Plan->loc_maps[7],
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);
    }

    block_offset += nblocks;
  }

  // combine reduction data

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name      = name;
  OP_kernels[2].count    += 1;
  OP_kernels[2].time     += wall_t2 - wall_t1;
  OP_kernels[2].transfer  += Plan->transfer;
  OP_kernels[2].transfer2 += Plan->transfer2;
}
开发者ID:ioz9,项目名称:OP2-Common,代码行数:90,代码来源:res_calc_kernel.cpp

示例10: op_par_loop_adt_calc

// Host stub for the indirect "adt_calc" loop (OpenMP 4 offload variant).
//
// Gathers the six argument descriptors, starts the halo exchange through
// op_mpi_halo_exchanges_cuda(), obtains a staged plan from
// op_plan_get_stage() and calls adt_calc_omp4_kernel() once per plan colour.
// Timing and transfer figures are accumulated in OP_kernels[1].
//
//   name        label passed to op_plan_get_stage() and stored in OP_kernels[1]
//   set         iteration set; set->size, set->exec_size and set->core_size are read
//   arg0..arg5  data descriptors; only the device-side members of arg0, arg4
//               and arg5 (map_data_d / data_d) are handed to the kernel
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers; name and call count are recorded up front,
  // the elapsed time is added at the very end
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;

  // inds[i] is the indirection slot of argument i: arguments 0-3 share slot 0,
  // arguments 4 and 5 carry -1 (presumably "accessed directly" - confirm)
  int  ninds   = 1;
  int  inds[6] = {0,0,0,0,-1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // start the halo exchange; the returned count is only used after the
  // colour loop to decide whether the wait still has to be issued
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  // compile-time overrides take precedence over the run-time defaults
  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_1
    int nthread = OP_BLOCK_SIZE_1;
  #else
    int nthread = OP_block_size;
  #endif


  // ncolors stays 0 when the set is empty and no plan is fetched
  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    //Set up typed device pointers for OpenMP, each paired with its element count
    int *map0 = arg0.map_data_d;
     int map0size = arg0.map->dim * set_size1;

    float* data4 = (float*)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    float* data5 = (float*)arg5.data_d;
    int dat5size = getSetSizeFromOpArg(&arg5) * arg5.dat->dim;
    float *data0 = (float *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan
    for ( int col=0; col<Plan->ncolors; col++ ){
      // the wait is issued just before the second colour (col == 1) is run;
      // with a single colour this branch is never taken, and the check
      // after the loop covers that case via ncolors == 1
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      // half-open range [start, end) read from row 0 of the plan's col_offsets
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      // second-to-last argument: (end-start-1)/part_size + 1 when part_size is
      // non-zero, otherwise (end-start-1)/nthread
      adt_calc_omp4_kernel(map0, map0size, data4, dat4size, data5, dat5size,
                           data0, dat0size, col_reord, set_size1, start, end,
                           part_size != 0 ? (end - start - 1) / part_size + 1
                                          : (end - start - 1) / nthread,
                           nthread);
    }
    OP_kernels[1].transfer  += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  // issue the wait here when the colour loop did not: nothing was exchanged
  // for execution, the executed range equals core_size, or only one colour ran
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // deviceSync() is only called at diagnostic level 2 and above, before the
  // stop timestamp is taken
  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time     += wall_t2 - wall_t1;
}
开发者ID:OP2,项目名称:OP2-Common,代码行数:97,代码来源:adt_calc_omp4kernel.cpp

示例11: op_timers

// Timer entry point: forwards both output pointers unchanged to
// op_timers_core() and does nothing else.
//   cpu  forwarded as the first argument (the loop stubs that call
//        op_timers_core pass their cpu_t* variable in this position)
//   et   forwarded as the second argument (the loop stubs pass their
//        wall_t* variable in this position)
void op_timers(double * cpu, double * et)
{
  op_timers_core(cpu,et);
}
开发者ID:ioz9,项目名称:OP2-Common,代码行数:4,代码来源:op_cuda_decl.c

示例12: op_timers

// Timer entry point for the MPI build: calls MPI_Barrier on MPI_COMM_WORLD
// and then forwards both output pointers unchanged to op_timers_core().
//   cpu  forwarded as the first argument of op_timers_core
//   et   forwarded as the second argument of op_timers_core
void op_timers(double * cpu, double * et)
{
  // the barrier returns before the timers are read (presumably so that all
  // ranks take their timestamps from a common point - confirm)
  MPI_Barrier(MPI_COMM_WORLD);
  op_timers_core(cpu,et);
}
开发者ID:ioz9,项目名称:OP2-Common,代码行数:5,代码来源:op_mpi_decl.c

示例13: op_par_loop_update

// Host stub for the direct "update" loop (OpenMP 4 offload variant).
//
// Starts the halo exchange, seeds a local accumulator from arg3's host
// buffer, runs update_omp4_kernel() over the device buffers of arg0..arg2,
// writes the accumulator back to the host buffer and passes it to
// op_mpi_reduce_double().  Timing and transfer figures go to OP_kernels[8].
//
//   name        label stored in the OP_kernels[8] timing record
//   set         iteration set; only set->size is read here
//   arg0..arg2  data descriptors whose data_d buffers are passed to the kernel
//   arg3        reduction target; arg3.data[0] is read before and written
//               after the kernel call
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3){

  double*arg3h = (double *)arg3.data;
  int nargs = 4;
  op_arg args[4];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;

  // initialise timers; name and call count are recorded up front
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(8);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[8].name      = name;
  OP_kernels[8].count    += 1;


  if (OP_diags>2) {
    // terminate the line so the diagnostic does not run into the next output
    printf(" kernel routine w/o indirection:  update\n");
  }

  op_mpi_halo_exchanges_cuda(set, nargs, args);

  // compile-time overrides take precedence over the run-time defaults
  #ifdef OP_PART_SIZE_8
    int part_size = OP_PART_SIZE_8;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_8
    int nthread = OP_BLOCK_SIZE_8;
  #else
    int nthread = OP_block_size;
  #endif

  // local accumulator, seeded with the current host value
  double arg3_l = arg3h[0];

  if (set->size >0) {

    //Set up typed device pointers for OpenMP, each paired with its element count

    double* data0 = (double*)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
    double* data1 = (double*)arg1.data_d;
    int dat1size = getSetSizeFromOpArg(&arg1) * arg1.dat->dim;
    double* data2 = (double*)arg2.data_d;
    int dat2size = getSetSizeFromOpArg(&arg2) * arg2.dat->dim;
    // second-to-last argument: (size-1)/part_size + 1 when part_size is
    // non-zero, otherwise (size-1)/nthread
    update_omp4_kernel(
      data0,
      dat0size,
      data1,
      dat1size,
      data2,
      dat2size,
      &arg3_l,
      set->size,
      part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread,
      nthread);

  }

  // combine reduction data
  arg3h[0] = arg3_l;
  op_mpi_reduce_double(&arg3,arg3h);
  op_mpi_set_dirtybit_cuda(nargs, args);

  // deviceSync() is only called at diagnostic level 2 and above
  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[8].time     += wall_t2 - wall_t1;
  OP_kernels[8].transfer += (float)set->size * arg0.size * 2.0f;
  OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[8].transfer += (float)set->size * arg2.size;
}
开发者ID:OP2,项目名称:OP2-Common,代码行数:80,代码来源:update_omp4kernel.cpp

示例14: op_par_loop_res_calc

void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int *arg1h = (int *)arg1.data;

  int    nargs   = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  int    ninds   = 1;
  int    inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction

  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_int;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
         (double *)arg0.data,
         Plan->ind_map,
         Plan->loc_map,
         &arg1_l[64*omp_get_thread_num()],
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);


  // combine reduction data
    if (col == Plan->ncolors_owned-1) {
      for (int thr=0; thr<nthreads; thr++)
        for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];
    }

      block_offset += nblocks;
    }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data
//.........这里部分代码省略.........
开发者ID:Benjamin-git,项目名称:OP2-Common,代码行数:101,代码来源:res_calc_kernel.cpp

示例15: op_par_loop_res

// Host stub for the indirect "res" loop (plan-based OpenMP execution).
//
// Starts the halo exchange, obtains an execution plan from op_plan_get() and
// walks it colour by colour; within one colour every block is passed to
// op_x86_res() inside an OpenMP parallel loop.  Timing and the plan's
// transfer figures are accumulated in OP_kernels[0].
//
//   name        label passed to op_plan_get() and stored in OP_kernels[0]
//   set         iteration set; only set->size is read here
//   arg0..arg3  data descriptors whose .data buffers are passed to the kernel
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  int    nargs   = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  // inds[i] is the indirection slot of argument i: arg1 uses slot 0 and arg2
  // slot 1; arg0 and arg3 carry -1 (presumably "accessed directly" - confirm)
  int    ninds   = 2;
  int    inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers (the interval includes plan construction below)

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      // op_mpi_wait_all is called just before colour ncolors_core is run
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
        op_x86_res( blockIdx,
          (float *)arg1.data,
          (float *)arg2.data,
          Plan->ind_map,
          Plan->loc_map,
          (float *)arg0.data,
          (float *)arg3.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);
      }

      block_offset += nblocks;
    }

    // record 0 is (re)allocated before it is written for the first time
    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record; the realloc call is repeated because the one
  // inside the branch above is skipped for an empty set

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
开发者ID:xyuan,项目名称:OP2-Common,代码行数:87,代码来源:res_kernel.cpp


注:本文中的op_timers_core函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。