本文整理汇总了C++中ops_register_args函数的典型用法代码示例。如果您正苦于以下问题：C++ ops_register_args函数的具体用法？C++ ops_register_args怎么用？C++ ops_register_args使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了ops_register_args函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: ops_par_loop_reset_field_kernel1_execute
/*
 * Host stub for the 3D "reset_field_kernel1" loop.
 *
 * Reads the block, dimension count, iteration range and four arguments from
 * the kernel descriptor, then for every point in [start, end) copies
 * density1 into density0 and energy1 into energy0.  When OPS_diags > 1 the
 * call count, timer deltas and transfer estimate are accumulated in
 * OPS_kernels[139].
 *
 * desc - kernel descriptor supplying block, dim, range and args[0..3].
 */
void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;

  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg arg3 = desc->args[3];
  ops_arg args[4] = {arg0, arg1, arg2, arg3};

  // Timer samples filled in by ops_timers_core
  double t1, t2, c1, c2;

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 4, range, 139))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[139].count++;
    ops_timers_core(&c2, &t2);
  }

  // range is laid out as {x_lo, x_hi, y_lo, y_hi, z_lo, z_hi}
  int start[3] = {range[0], range[2], range[4]};
  int end[3] = {range[1], range[3], range[5]};

#ifdef OPS_DEBUG
  ops_register_args(args, "reset_field_kernel1");
#endif

  // Typed base pointers: each dat's raw data advanced by its base_offset
  int base0 = args[0].dat->base_offset;
  double *__restrict__ density0 = (double *)(args[0].data + base0);
  int base1 = args[1].dat->base_offset;
  const double *__restrict__ density1 = (double *)(args[1].data + base1);
  int base2 = args[2].dat->base_offset;
  double *__restrict__ energy0 = (double *)(args[2].data + base2);
  int base3 = args[3].dat->base_offset;
  const double *__restrict__ energy1 = (double *)(args[3].data + base3);

  // x/y extents of each dat.
  // NOTE(review): not read directly below; presumably consumed by the
  // OPS_ACC* macros (defined elsewhere) together with n_x/n_y/n_z, so the
  // names must stay as they are - confirm against the macro definitions.
  int xdim0_reset_field_kernel1 = args[0].dat->size[0];
  int ydim0_reset_field_kernel1 = args[0].dat->size[1];
  int xdim1_reset_field_kernel1 = args[1].dat->size[0];
  int ydim1_reset_field_kernel1 = args[1].dat->size[1];
  int xdim2_reset_field_kernel1 = args[2].dat->size[0];
  int ydim2_reset_field_kernel1 = args[2].dat->size[1];
  int xdim3_reset_field_kernel1 = args[3].dat->size[0];
  int ydim3_reset_field_kernel1 = args[3].dat->size[1];

  // Everything since the first timer sample is booked as mpi_time
  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[139].mpi_time += t1 - t2;
  }

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(density0, density1, energy0, energy1)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {
        density0[OPS_ACC0(0, 0, 0)] = density1[OPS_ACC1(0, 0, 0)];
        energy0[OPS_ACC2(0, 0, 0)] = energy1[OPS_ACC3(0, 0, 0)];
      }
    }
  }

  if (OPS_diags > 1) {
    // Time spent in the loop nest
    ops_timers_core(&c2, &t2);
    OPS_kernels[139].time += t2 - t1;

    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[139].mpi_time += t1 - t2;
    OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3);
  }
}
示例2: ops_par_loop_update_halo_kernel1_fr2
// host stub function
void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range,
ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3,
ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) {
//Timing
double t1,t2,c1,c2;
ops_timers_core(&c1,&t1);
int offs[8][3];
ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7};
ops_timing_realloc(51,"update_halo_kernel1_fr2");
OPS_kernels[51].count++;
//compute locally allocated range for the sub-block
int start[3];
int end[3];
#ifdef OPS_MPI
sub_block_list sb = OPS_sub_block_list[block->index];
if (!sb->owned) return;
for ( int n=0; n<3; n++ ){
start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n];
if (start[n] >= range[2*n]) {
start[n] = 0;
}
else {
start[n] = range[2*n] - start[n];
}
if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n];
if (end[n] >= range[2*n+1]) {
end[n] = range[2*n+1] - sb->decomp_disp[n];
}
else {
end[n] = sb->decomp_size[n];
}
if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n]))
end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]);
}
#else //OPS_MPI
for ( int n=0; n<3; n++ ){
start[n] = range[2*n];end[n] = range[2*n+1];
}
#endif //OPS_MPI
#ifdef OPS_DEBUG
ops_register_args(args, "update_halo_kernel1_fr2");
#endif
offs[0][0] = args[0].stencil->stride[0]*1; //unit step in x dimension
offs[0][1] = off3D(1, &start[0],
&end[0],args[0].dat->size, args[0].stencil->stride) - offs[0][0];
offs[0][2] = off3D(2, &start[0],
&end[0],args[0].dat->size, args[0].stencil->stride) - offs[0][1] - offs[0][0];
offs[1][0] = args[1].stencil->stride[0]*1; //unit step in x dimension
offs[1][1] = off3D(1, &start[0],
&end[0],args[1].dat->size, args[1].stencil->stride) - offs[1][0];
offs[1][2] = off3D(2, &start[0],
&end[0],args[1].dat->size, args[1].stencil->stride) - offs[1][1] - offs[1][0];
offs[2][0] = args[2].stencil->stride[0]*1; //unit step in x dimension
offs[2][1] = off3D(1, &start[0],
&end[0],args[2].dat->size, args[2].stencil->stride) - offs[2][0];
offs[2][2] = off3D(2, &start[0],
&end[0],args[2].dat->size, args[2].stencil->stride) - offs[2][1] - offs[2][0];
offs[3][0] = args[3].stencil->stride[0]*1; //unit step in x dimension
offs[3][1] = off3D(1, &start[0],
&end[0],args[3].dat->size, args[3].stencil->stride) - offs[3][0];
offs[3][2] = off3D(2, &start[0],
&end[0],args[3].dat->size, args[3].stencil->stride) - offs[3][1] - offs[3][0];
offs[4][0] = args[4].stencil->stride[0]*1; //unit step in x dimension
offs[4][1] = off3D(1, &start[0],
&end[0],args[4].dat->size, args[4].stencil->stride) - offs[4][0];
offs[4][2] = off3D(2, &start[0],
&end[0],args[4].dat->size, args[4].stencil->stride) - offs[4][1] - offs[4][0];
offs[5][0] = args[5].stencil->stride[0]*1; //unit step in x dimension
offs[5][1] = off3D(1, &start[0],
&end[0],args[5].dat->size, args[5].stencil->stride) - offs[5][0];
offs[5][2] = off3D(2, &start[0],
&end[0],args[5].dat->size, args[5].stencil->stride) - offs[5][1] - offs[5][0];
offs[6][0] = args[6].stencil->stride[0]*1; //unit step in x dimension
offs[6][1] = off3D(1, &start[0],
&end[0],args[6].dat->size, args[6].stencil->stride) - offs[6][0];
offs[6][2] = off3D(2, &start[0],
&end[0],args[6].dat->size, args[6].stencil->stride) - offs[6][1] - offs[6][0];
int off0_0 = offs[0][0];
int off0_1 = offs[0][1];
int off0_2 = offs[0][2];
//.........这里部分代码省略.........
示例3: ops_par_loop_update_halo_kernel1_t1
// host stub function
void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block,
int dim, int *range, ops_arg arg0,
ops_arg arg1, ops_arg arg2,
ops_arg arg3, ops_arg arg4,
ops_arg arg5, ops_arg arg6,
ops_arg arg7) {
// Timing
double t1, t2, c1, c2;
char *p_a[8];
int offs[8][3];
ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args, 8, range, 14))
return;
#endif
if (OPS_diags > 1) {
ops_timing_realloc(14, "update_halo_kernel1_t1");
OPS_kernels[14].count++;
ops_timers_core(&c2, &t2);
}
// compute locally allocated range for the sub-block
int start[3];
int end[3];
#ifdef OPS_MPI
sub_block_list sb = OPS_sub_block_list[block->index];
#endif
#ifdef OPS_DEBUG
ops_register_args(args, "update_halo_kernel1_t1");
#endif
int arg_idx[3];
int arg_idx_base[3];
#ifdef OPS_MPI
if (compute_ranges(args, 8, block, range, start, end, arg_idx) < 0)
return;
#else // OPS_MPI
for (int n = 0; n < 3; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
arg_idx[n] = start[n];
}
#endif // OPS_MPI
for (int n = 0; n < 3; n++) {
arg_idx_base[n] = arg_idx[n];
}
offs[0][0] = args[0].stencil->stride[0] * 1; // unit step in x dimension
offs[0][1] =
off3D(1, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) -
offs[0][0];
offs[0][2] =
off3D(2, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) -
offs[0][1] - offs[0][0];
offs[1][0] = args[1].stencil->stride[0] * 1; // unit step in x dimension
offs[1][1] =
off3D(1, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) -
offs[1][0];
offs[1][2] =
off3D(2, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) -
offs[1][1] - offs[1][0];
offs[2][0] = args[2].stencil->stride[0] * 1; // unit step in x dimension
offs[2][1] =
off3D(1, &start[0], &end[0], args[2].dat->size, args[2].stencil->stride) -
offs[2][0];
offs[2][2] =
off3D(2, &start[0], &end[0], args[2].dat->size, args[2].stencil->stride) -
offs[2][1] - offs[2][0];
offs[3][0] = args[3].stencil->stride[0] * 1; // unit step in x dimension
offs[3][1] =
off3D(1, &start[0], &end[0], args[3].dat->size, args[3].stencil->stride) -
offs[3][0];
offs[3][2] =
off3D(2, &start[0], &end[0], args[3].dat->size, args[3].stencil->stride) -
offs[3][1] - offs[3][0];
offs[4][0] = args[4].stencil->stride[0] * 1; // unit step in x dimension
offs[4][1] =
off3D(1, &start[0], &end[0], args[4].dat->size, args[4].stencil->stride) -
offs[4][0];
offs[4][2] =
off3D(2, &start[0], &end[0], args[4].dat->size, args[4].stencil->stride) -
offs[4][1] - offs[4][0];
offs[5][0] = args[5].stencil->stride[0] * 1; // unit step in x dimension
offs[5][1] =
off3D(1, &start[0], &end[0], args[5].dat->size, args[5].stencil->stride) -
offs[5][0];
offs[5][2] =
off3D(2, &start[0], &end[0], args[5].dat->size, args[5].stencil->stride) -
offs[5][1] - offs[5][0];
//.........这里部分代码省略.........
示例4: ops_par_loop_update_halo_kernel3_plus_4_a
// host stub function
void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name,
ops_block block, int dim,
int *range, ops_arg arg0,
ops_arg arg1, ops_arg arg2) {
// Timing
double t1, t2, c1, c2;
char *p_a[3];
int offs[3][2];
ops_arg args[3] = {arg0, arg1, arg2};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args, 3, range, 33))
return;
#endif
if (OPS_diags > 1) {
ops_timing_realloc(33, "update_halo_kernel3_plus_4_a");
OPS_kernels[33].count++;
ops_timers_core(&c2, &t2);
}
// compute locally allocated range for the sub-block
int start[2];
int end[2];
#ifdef OPS_MPI
sub_block_list sb = OPS_sub_block_list[block->index];
#endif
#ifdef OPS_DEBUG
ops_register_args(args, "update_halo_kernel3_plus_4_a");
#endif
int arg_idx[2];
int arg_idx_base[2];
#ifdef OPS_MPI
if (compute_ranges(args, 3, block, range, start, end, arg_idx) < 0)
return;
#else // OPS_MPI
for (int n = 0; n < 2; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
arg_idx[n] = start[n];
}
#endif // OPS_MPI
for (int n = 0; n < 2; n++) {
arg_idx_base[n] = arg_idx[n];
}
offs[0][0] = args[0].stencil->stride[0] * 1; // unit step in x dimension
offs[0][1] =
off2D(1, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) -
offs[0][0];
offs[1][0] = args[1].stencil->stride[0] * 1; // unit step in x dimension
offs[1][1] =
off2D(1, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) -
offs[1][0];
int off0_0 = offs[0][0];
int off0_1 = offs[0][1];
int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size);
int off1_0 = offs[1][0];
int off1_1 = offs[1][1];
int dat1 = (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size);
// set up initial pointers and exchange halos if necessary
int base0 = args[0].dat->base_offset +
(OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) *
start[0] * args[0].stencil->stride[0];
base0 = base0 +
(OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) *
args[0].dat->size[0] * start[1] * args[0].stencil->stride[1];
p_a[0] = (char *)args[0].data + base0;
int base1 = args[1].dat->base_offset +
(OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) *
start[0] * args[1].stencil->stride[0];
base1 = base1 +
(OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) *
args[1].dat->size[0] * start[1] * args[1].stencil->stride[1];
p_a[1] = (char *)args[1].data + base1;
p_a[2] = args[2].data;
// initialize global variable with the dimension of dats
xdim0 = args[0].dat->size[0];
xdim1 = args[1].dat->size[0];
// Halo Exchanges
ops_H_D_exchanges_host(args, 3);
ops_halo_exchanges(args, 3, range);
ops_H_D_exchanges_host(args, 3);
if (OPS_diags > 1) {
ops_timers_core(&c1, &t1);
OPS_kernels[33].mpi_time += t1 - t2;
}
//.........这里部分代码省略.........
示例5: ops_par_loop_advec_cell_kernel2_zdir
// host stub function
void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block,
int dim, int *range, ops_arg arg0,
ops_arg arg1, ops_arg arg2,
ops_arg arg3) {
// Timing
double t1, t2, c1, c2;
int offs[4][3];
ops_arg args[4] = {arg0, arg1, arg2, arg3};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args, 4, range, 118))
return;
#endif
if (OPS_diags > 1) {
ops_timing_realloc(118, "advec_cell_kernel2_zdir");
OPS_kernels[118].count++;
ops_timers_core(&c1, &t1);
}
#ifdef OPS_MPI
sub_block_list sb = OPS_sub_block_list[block->index];
#endif
// compute locally allocated range for the sub-block
int start[3];
int end[3];
int arg_idx[3];
#ifdef OPS_MPI
if (!sb->owned)
return;
for (int n = 0; n < 3; n++) {
start[n] = sb->decomp_disp[n];
end[n] = sb->decomp_disp[n] + sb->decomp_size[n];
if (start[n] >= range[2 * n]) {
start[n] = 0;
} else {
start[n] = range[2 * n] - start[n];
}
if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0)
start[n] = range[2 * n];
if (end[n] >= range[2 * n + 1]) {
end[n] = range[2 * n + 1] - sb->decomp_disp[n];
} else {
end[n] = sb->decomp_size[n];
}
if (sb->id_p[n] == MPI_PROC_NULL &&
(range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n]))
end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]);
if (end[n] < start[n])
end[n] = start[n];
}
#else
for (int n = 0; n < 3; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
}
#endif
#ifdef OPS_DEBUG
ops_register_args(args, "advec_cell_kernel2_zdir");
#endif
offs[0][0] = args[0].stencil->stride[0] * 1; // unit step in x dimension
offs[0][1] =
off3D(1, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) -
offs[0][0];
offs[0][2] =
off3D(2, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) -
offs[0][1] - offs[0][0];
offs[1][0] = args[1].stencil->stride[0] * 1; // unit step in x dimension
offs[1][1] =
off3D(1, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) -
offs[1][0];
offs[1][2] =
off3D(2, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) -
offs[1][1] - offs[1][0];
offs[2][0] = args[2].stencil->stride[0] * 1; // unit step in x dimension
offs[2][1] =
off3D(1, &start[0], &end[0], args[2].dat->size, args[2].stencil->stride) -
offs[2][0];
offs[2][2] =
off3D(2, &start[0], &end[0], args[2].dat->size, args[2].stencil->stride) -
offs[2][1] - offs[2][0];
offs[3][0] = args[3].stencil->stride[0] * 1; // unit step in x dimension
offs[3][1] =
off3D(1, &start[0], &end[0], args[3].dat->size, args[3].stencil->stride) -
offs[3][0];
offs[3][2] =
off3D(2, &start[0], &end[0], args[3].dat->size, args[3].stencil->stride) -
offs[3][1] - offs[3][0];
int off0_0 = offs[0][0];
//.........这里部分代码省略.........
示例6: ops_par_loop_initialise_chunk_kernel_z_execute
/*
 * Host stub for the 3D "initialise_chunk_kernel_z" loop.
 *
 * Reads the block, dimension count, iteration range and three arguments
 * from the kernel descriptor.  For every point in [start, end) it writes
 *   vertexz  = min_z + d_z * (zz - z_min)
 *   vertexdz = d_z
 * where d_z, min_z and z_min are derived from the global `grid` and `field`
 * objects.  When OPS_diags > 1 the call count, timer deltas and transfer
 * estimate are accumulated in OPS_kernels[5].
 *
 * desc - kernel descriptor supplying block, dim, range and args[0..2].
 */
void ops_par_loop_initialise_chunk_kernel_z_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[3] = {arg0, arg1, arg2};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 3, range, 5))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[5].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "initialise_chunk_kernel_z");
#endif

  // Typed base pointers: each dat's raw data advanced by its base_offset
  int base0 = args[0].dat->base_offset;
  double *__restrict__ vertexz = (double *)(args[0].data + base0);
  int base1 = args[1].dat->base_offset;
  const int *__restrict__ zz = (int *)(args[1].data + base1);
  int base2 = args[2].dat->base_offset;
  double *__restrict__ vertexdz = (double *)(args[2].data + base2);

  // x/y extents of each dat.
  // NOTE(review): not read directly below; presumably consumed by the
  // OPS_ACC* macros (defined elsewhere) together with n_x/n_y/n_z, so the
  // names must stay as they are - confirm against the macro definitions.
  int xdim0_initialise_chunk_kernel_z = args[0].dat->size[0];
  int ydim0_initialise_chunk_kernel_z = args[0].dat->size[1];
  int xdim1_initialise_chunk_kernel_z = args[1].dat->size[0];
  int ydim1_initialise_chunk_kernel_z = args[1].dat->size[1];
  int xdim2_initialise_chunk_kernel_z = args[2].dat->size[0];
  int ydim2_initialise_chunk_kernel_z = args[2].dat->size[1];

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[5].mpi_time += t1 - t2;
  }

  // These values depend only on `grid` and `field`, never on the loop
  // indices, so they are computed once here (still inside the timed kernel
  // section) instead of once per grid point inside the simd loop.
  // NOTE(review): assumes `grid` and `field` are plain globals that are not
  // modified while this loop runs - confirm.
  const int z_min = field.z_min - 2;
  const double d_z = (grid.zmax - grid.zmin) / (double)grid.z_cells;
  const double min_z = grid.zmin + d_z * field.back;

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(vertexz, zz, vertexdz)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {
        vertexz[OPS_ACC0(0, 0, 0)] =
            min_z + d_z * (zz[OPS_ACC1(0, 0, 0)] - z_min);
        vertexdz[OPS_ACC2(0, 0, 0)] = d_z;
      }
    }
  }

  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[5].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[5].mpi_time += t1 - t2;
    OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2);
  }
}
示例7: ops_par_loop_tea_leaf_init_zero2_kernel_execute
/*
 * Host stub for the 2D "tea_leaf_init_zero2_kernel" loop.
 *
 * Reads the block, dimension count, iteration range and two arguments from
 * the kernel descriptor, then stores 0.0 into both p and z at every point
 * in [start, end).  When OPS_diags > 1 the call count, timer deltas and
 * transfer estimate are accumulated in OPS_kernels[16].
 *
 * desc - kernel descriptor supplying block, dim, range and args[0..1].
 */
void ops_par_loop_tea_leaf_init_zero2_kernel_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;

  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg args[2] = {arg0, arg1};

  // Timer samples filled in by ops_timers_core
  double t1, t2, c1, c2;

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 2, range, 16))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[16].count++;
    ops_timers_core(&c2, &t2);
  }

  // range is laid out as {x_lo, x_hi, y_lo, y_hi}
  int start[2] = {range[0], range[2]};
  int end[2] = {range[1], range[3]};

#ifdef OPS_DEBUG
  ops_register_args(args, "tea_leaf_init_zero2_kernel");
#endif

  // Typed base pointers: each dat's raw data advanced by its base_offset
  int base0 = args[0].dat->base_offset;
  double *__restrict__ p = (double *)(args[0].data + base0);
  int base1 = args[1].dat->base_offset;
  double *__restrict__ z = (double *)(args[1].data + base1);

  // x extent of each dat.
  // NOTE(review): not read directly below; presumably consumed by the
  // OPS_ACC* macros (defined elsewhere) together with n_x/n_y, so the names
  // must stay as they are - confirm against the macro definitions.
  int xdim0_tea_leaf_init_zero2_kernel = args[0].dat->size[0];
  int xdim1_tea_leaf_init_zero2_kernel = args[1].dat->size[0];

  // Everything since the first timer sample is booked as mpi_time
  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[16].mpi_time += t1 - t2;
  }

#pragma omp parallel for
  for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(p, z)
#else
#pragma simd
#endif
    for (int n_x = start[0]; n_x < end[0]; n_x++) {
      p[OPS_ACC0(0, 0)] = 0.0;
      z[OPS_ACC1(0, 0)] = 0.0;
    }
  }

  if (OPS_diags > 1) {
    // Time spent in the loop nest
    ops_timers_core(&c2, &t2);
    OPS_kernels[16].time += t2 - t1;

    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[16].mpi_time += t1 - t2;
    OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1);
  }
}
示例8: ops_par_loop_advec_mom_kernel_y2
// host stub function
void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range,
ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) {
char *p_a[4];
int offs[4][2];
ops_arg args[4] = { arg0, arg1, arg2, arg3};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args,4,range,18)) return;
#endif
ops_timing_realloc(18,"advec_mom_kernel_y2");
OPS_kernels[18].count++;
//compute locally allocated range for the sub-block
int start[2];
int end[2];
#ifdef OPS_MPI
sub_block_list sb = OPS_sub_block_list[block->index];
if (!sb->owned) return;
for ( int n=0; n<2; n++ ){
start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n];
if (start[n] >= range[2*n]) {
start[n] = 0;
}
else {
start[n] = range[2*n] - start[n];
}
if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n];
if (end[n] >= range[2*n+1]) {
end[n] = range[2*n+1] - sb->decomp_disp[n];
}
else {
end[n] = sb->decomp_size[n];
}
if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n]))
end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]);
}
#else //OPS_MPI
for ( int n=0; n<2; n++ ){
start[n] = range[2*n];end[n] = range[2*n+1];
}
#endif //OPS_MPI
#ifdef OPS_DEBUG
ops_register_args(args, "advec_mom_kernel_y2");
#endif
offs[0][0] = args[0].stencil->stride[0]*1; //unit step in x dimension
offs[0][1] = off2D(1, &start[0],
&end[0],args[0].dat->size, args[0].stencil->stride) - offs[0][0];
offs[1][0] = args[1].stencil->stride[0]*1; //unit step in x dimension
offs[1][1] = off2D(1, &start[0],
&end[0],args[1].dat->size, args[1].stencil->stride) - offs[1][0];
offs[2][0] = args[2].stencil->stride[0]*1; //unit step in x dimension
offs[2][1] = off2D(1, &start[0],
&end[0],args[2].dat->size, args[2].stencil->stride) - offs[2][0];
offs[3][0] = args[3].stencil->stride[0]*1; //unit step in x dimension
offs[3][1] = off2D(1, &start[0],
&end[0],args[3].dat->size, args[3].stencil->stride) - offs[3][0];
//Timing
double t1,t2,c1,c2;
ops_timers_core(&c2,&t2);
int off0_0 = offs[0][0];
int off0_1 = offs[0][1];
int dat0 = args[0].dat->elem_size;
int off1_0 = offs[1][0];
int off1_1 = offs[1][1];
int dat1 = args[1].dat->elem_size;
int off2_0 = offs[2][0];
int off2_1 = offs[2][1];
int dat2 = args[2].dat->elem_size;
int off3_0 = offs[3][0];
int off3_1 = offs[3][1];
int dat3 = args[3].dat->elem_size;
//set up initial pointers and exchange halos if necessary
int d_m[OPS_MAX_DIM];
#ifdef OPS_MPI
for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d];
#else //OPS_MPI
for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d];
#endif //OPS_MPI
int base0 = dat0 * 1 *
(start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]);
base0 = base0+ dat0 *
args[0].dat->size[0] *
(start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]);
p_a[0] = (char *)args[0].data + base0;
//.........这里部分代码省略.........
示例9: ops_par_loop_ideal_gas_kernel_execute
// host stub function
void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) {
ops_block block = desc->block;
int dim = desc->dim;
int *range = desc->range;
ops_arg arg0 = desc->args[0];
ops_arg arg1 = desc->args[1];
ops_arg arg2 = desc->args[2];
ops_arg arg3 = desc->args[3];
// Timing
double t1, t2, c1, c2;
ops_arg args[4] = {arg0, arg1, arg2, arg3};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args, 4, range, 8))
return;
#endif
if (OPS_diags > 1) {
OPS_kernels[8].count++;
ops_timers_core(&c2, &t2);
}
// compute locally allocated range for the sub-block
int start[2];
int end[2];
for (int n = 0; n < 2; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
}
#ifdef OPS_DEBUG
ops_register_args(args, "ideal_gas_kernel");
#endif
// set up initial pointers and exchange halos if necessary
int base0 = args[0].dat->base_offset;
const double *__restrict__ density = (double *)(args[0].data + base0);
int base1 = args[1].dat->base_offset;
const double *__restrict__ energy = (double *)(args[1].data + base1);
int base2 = args[2].dat->base_offset;
double *__restrict__ pressure = (double *)(args[2].data + base2);
int base3 = args[3].dat->base_offset;
double *__restrict__ soundspeed = (double *)(args[3].data + base3);
// initialize global variable with the dimension of dats
int xdim0_ideal_gas_kernel = args[0].dat->size[0];
int xdim1_ideal_gas_kernel = args[1].dat->size[0];
int xdim2_ideal_gas_kernel = args[2].dat->size[0];
int xdim3_ideal_gas_kernel = args[3].dat->size[0];
if (OPS_diags > 1) {
ops_timers_core(&c1, &t1);
OPS_kernels[8].mpi_time += t1 - t2;
}
#pragma omp parallel for
for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(density, energy, pressure, soundspeed)
#else
#pragma simd
#endif
for (int n_x = start[0]; n_x < end[0]; n_x++) {
double sound_speed_squared, v, pressurebyenergy, pressurebyvolume;
v = 1.0 / density[OPS_ACC0(0, 0)];
pressure[OPS_ACC2(0, 0)] =
(1.4 - 1.0) * density[OPS_ACC0(0, 0)] * energy[OPS_ACC1(0, 0)];
pressurebyenergy = (1.4 - 1.0) * density[OPS_ACC0(0, 0)];
pressurebyvolume =
-1 * density[OPS_ACC0(0, 0)] * pressure[OPS_ACC2(0, 0)];
sound_speed_squared =
v * v *
(pressure[OPS_ACC2(0, 0)] * pressurebyenergy - pressurebyvolume);
soundspeed[OPS_ACC3(0, 0)] = sqrt(sound_speed_squared);
}
}
if (OPS_diags > 1) {
ops_timers_core(&c2, &t2);
OPS_kernels[8].time += t2 - t1;
}
if (OPS_diags > 1) {
// Update kernel record
ops_timers_core(&c1, &t1);
OPS_kernels[8].mpi_time += t1 - t2;
OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0);
OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1);
OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2);
OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3);
}
//.........这里部分代码省略.........
示例10: ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute
// host stub function
void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute(
ops_kernel_descriptor *desc) {
ops_block block = desc->block;
int dim = desc->dim;
int *range = desc->range;
ops_arg arg0 = desc->args[0];
ops_arg arg1 = desc->args[1];
ops_arg arg2 = desc->args[2];
ops_arg arg3 = desc->args[3];
ops_arg arg4 = desc->args[4];
// Timing
double t1, t2, c1, c2;
ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args, 5, range, 136))
return;
#endif
if (OPS_diags > 1) {
OPS_kernels[136].count++;
ops_timers_core(&c2, &t2);
}
// compute locally allocated range for the sub-block
int start[3];
int end[3];
for (int n = 0; n < 3; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
}
#ifdef OPS_DEBUG
ops_register_args(args, "advec_mom_kernel_post_pre_advec_z");
#endif
// set up initial pointers and exchange halos if necessary
int base0 = args[0].dat->base_offset;
double *__restrict__ node_mass_post = (double *)(args[0].data + base0);
int base1 = args[1].dat->base_offset;
const double *__restrict__ post_vol = (double *)(args[1].data + base1);
int base2 = args[2].dat->base_offset;
const double *__restrict__ density1 = (double *)(args[2].data + base2);
int base3 = args[3].dat->base_offset;
double *__restrict__ node_mass_pre = (double *)(args[3].data + base3);
int base4 = args[4].dat->base_offset;
const double *__restrict__ node_flux = (double *)(args[4].data + base4);
// initialize global variable with the dimension of dats
int xdim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[0];
int ydim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[1];
int xdim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[0];
int ydim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[1];
int xdim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[0];
int ydim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[1];
int xdim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[0];
int ydim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[1];
int xdim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[0];
int ydim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[1];
if (OPS_diags > 1) {
ops_timers_core(&c1, &t1);
OPS_kernels[136].mpi_time += t1 - t2;
}
#pragma omp parallel for collapse(2)
for (int n_z = start[2]; n_z < end[2]; n_z++) {
for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre, \
node_flux)
#else
#pragma simd
#endif
for (int n_x = start[0]; n_x < end[0]; n_x++) {
node_mass_post[OPS_ACC0(0, 0, 0)] =
0.125 *
(density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] +
density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] +
density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] +
density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] +
density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] +
density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] +
density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] +
density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]);
node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] -
node_flux[OPS_ACC4(0, 0, -1)] +
node_flux[OPS_ACC4(0, 0, 0)];
}
//.........这里部分代码省略.........
示例11: ops_par_loop_advec_mom_kernel1_z_nonvector_execute
// host stub function
void ops_par_loop_advec_mom_kernel1_z_nonvector_execute(
ops_kernel_descriptor *desc) {
ops_block block = desc->block;
int dim = desc->dim;
int *range = desc->range;
ops_arg arg0 = desc->args[0];
ops_arg arg1 = desc->args[1];
ops_arg arg2 = desc->args[2];
ops_arg arg3 = desc->args[3];
ops_arg arg4 = desc->args[4];
// Timing
double t1, t2, c1, c2;
ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args, 5, range, 137))
return;
#endif
if (OPS_diags > 1) {
OPS_kernels[137].count++;
ops_timers_core(&c2, &t2);
}
// compute locally allocated range for the sub-block
int start[3];
int end[3];
for (int n = 0; n < 3; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
}
#ifdef OPS_DEBUG
ops_register_args(args, "advec_mom_kernel1_z_nonvector");
#endif
// set up initial pointers and exchange halos if necessary
int base0 = args[0].dat->base_offset;
const double *__restrict__ node_flux = (double *)(args[0].data + base0);
int base1 = args[1].dat->base_offset;
const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1);
int base2 = args[2].dat->base_offset;
double *__restrict__ mom_flux = (double *)(args[2].data + base2);
int base3 = args[3].dat->base_offset;
const double *__restrict__ celldz = (double *)(args[3].data + base3);
int base4 = args[4].dat->base_offset;
const double *__restrict__ vel1 = (double *)(args[4].data + base4);
// initialize global variable with the dimension of dats
int xdim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[0];
int ydim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[1];
int xdim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[0];
int ydim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[1];
int xdim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[0];
int ydim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[1];
int xdim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[0];
int ydim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[1];
int xdim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[0];
int ydim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[1];
if (OPS_diags > 1) {
ops_timers_core(&c1, &t1);
OPS_kernels[137].mpi_time += t1 - t2;
}
#pragma omp parallel for collapse(2)
for (int n_z = start[2]; n_z < end[2]; n_z++) {
for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldz, vel1)
#else
#pragma simd
#endif
for (int n_x = start[0]; n_x < end[0]; n_x++) {
double sigma, wind, width;
double vdiffuw, vdiffdw, auw, adw, limiter;
int upwind, donor, downwind, dif;
double advec_vel_temp;
if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) {
upwind = 2;
donor = 1;
downwind = 0;
dif = donor;
} else {
upwind = -1;
donor = 0;
downwind = 1;
dif = upwind;
}
//.........这里部分代码省略.........
示例12: ops_par_loop_initialise_chunk_kernel_cellx_execute
// Host-side executor for the initialise_chunk_kernel_cellx loop.
//
// Pulls the block, dimension, iteration range and the three dat arguments out
// of `desc`, then sweeps the 2D range writing cellx and celldx from vertexx
// and the global `grid`. When OPS_diags > 1 the call count, timings and
// transfer estimate are accumulated into OPS_kernels[12].
void ops_par_loop_initialise_chunk_kernel_cellx_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;

  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg args[3] = {arg0, arg1, arg2};

  // Outputs of ops_timers_core, reused for each measured interval.
  double t1, t2, c1, c2;

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 3, range, 12))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[12].count++;
    ops_timers_core(&c2, &t2);
  }

  // Iteration bounds per dimension: range holds {lo, hi} pairs.
  int start[2] = {range[0], range[2]};
  int end[2] = {range[1], range[3]};

#ifdef OPS_DEBUG
  ops_register_args(args, "initialise_chunk_kernel_cellx");
#endif

  // Data pointers, each shifted by its dat's base offset.
  int vertexx_off = args[0].dat->base_offset;
  int cellx_off = args[1].dat->base_offset;
  int celldx_off = args[2].dat->base_offset;
  const double *__restrict__ vertexx = (double *)(args[0].data + vertexx_off);
  double *__restrict__ cellx = (double *)(args[1].data + cellx_off);
  double *__restrict__ celldx = (double *)(args[2].data + celldx_off);

  // x-extent of each dat; these names are kept as-is because the OPS_ACCn
  // indexing macros are defined outside this function and may refer to them.
  int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0];
  int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0];
  int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0];

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[12].mpi_time += t1 - t2;
  }

#pragma omp parallel for
  for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(vertexx, cellx, celldx)
#else
#pragma simd
#endif
    for (int n_x = start[0]; n_x < end[0]; n_x++) {
      // Uniform cell width derived from the global grid description.
      double cell_width = (grid.xmax - grid.xmin) / (double)grid.x_cells;

      // Cell value is the average of the two vertexx entries addressed by
      // OPS_ACC0(0, 0) and OPS_ACC0(1, 0).
      cellx[OPS_ACC1(0, 0)] =
          0.5 * (vertexx[OPS_ACC0(0, 0)] + vertexx[OPS_ACC0(1, 0)]);
      celldx[OPS_ACC2(0, 0)] = cell_width;
    }
  }

  if (OPS_diags > 1) {
    // Close the compute interval.
    ops_timers_core(&c2, &t2);
    OPS_kernels[12].time += t2 - t1;

    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[12].mpi_time += t1 - t2;

    ops_arg *transferred[3] = {&arg0, &arg1, &arg2};
    for (int a = 0; a < 3; a++) {
      OPS_kernels[12].transfer +=
          ops_compute_transfer(dim, start, end, transferred[a]);
    }
  }
}
示例13: ops_par_loop_PdV_kernel_predict
// Host stub for the PdV_kernel_predict parallel loop (14 ops_arg arguments).
// The part shown here bumps the call counter of kernel record 5, derives the
// local 3D iteration bounds (start/end) from `range`, and fills offs[i][d]
// with a step value per argument i and dimension d. The remainder of the
// function is omitted in this listing.
void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range,
ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3,
ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7,
ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11,
ops_arg arg12, ops_arg arg13) {
char *p_a[14];
int offs[14][3];
ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13};
// NOTE(review): presumably makes sure OPS_kernels has a slot 5 named
// "PdV_kernel_predict" before it is indexed below - confirm against OPS.
ops_timing_realloc(5,"PdV_kernel_predict");
OPS_kernels[5].count++;
//compute locally allocated range for the sub-block
int start[3];
int end[3];
#ifdef OPS_MPI
// Fetch the sub-block entry for this block; return early when it is not owned.
sub_block_list sb = OPS_sub_block_list[block->index];
if (!sb->owned) return;
for ( int n=0; n<3; n++ ){
// Begin with the sub-block extent [decomp_disp, decomp_disp + decomp_size).
start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n];
// Lower bound relative to decomp_disp: 0 when the sub-block begins at or
// after range[2*n], otherwise the distance from decomp_disp to range[2*n].
if (start[n] >= range[2*n]) {
start[n] = 0;
}
else {
start[n] = range[2*n] - start[n];
}
// When id_m[n] is MPI_PROC_NULL and the range start is negative, the range
// start is used as-is (presumably: no neighbour on the low side - confirm).
if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n];
// Upper bound relative to decomp_disp: cut at range[2*n+1] when the sub-block
// reaches it, otherwise the full decomp_size.
if (end[n] >= range[2*n+1]) {
end[n] = range[2*n+1] - sb->decomp_disp[n];
}
else {
end[n] = sb->decomp_size[n];
}
// When id_p[n] is MPI_PROC_NULL and the range extends past the sub-block,
// add the overshoot to the upper bound.
if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n]))
end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]);
}
#else //OPS_MPI
// Without MPI the bounds are the {lo, hi} pairs of range, unchanged.
for ( int n=0; n<3; n++ ){
start[n] = range[2*n];end[n] = range[2*n+1];
}
#endif //OPS_MPI
#ifdef OPS_DEBUG
ops_register_args(args, "PdV_kernel_predict");
#endif
// The same three assignments repeat for each argument i:
//   offs[i][0] = stencil stride in x,
//   offs[i][1] = off3D(1, ...) minus offs[i][0],
//   offs[i][2] = off3D(2, ...) minus offs[i][1] and offs[i][0].
// NOTE(review): off3D is defined elsewhere; its exact meaning is not visible
// here.
offs[0][0] = args[0].stencil->stride[0]*1; //unit step in x dimension
offs[0][1] = off3D(1, &start[0],
&end[0],args[0].dat->size, args[0].stencil->stride) - offs[0][0];
offs[0][2] = off3D(2, &start[0],
&end[0],args[0].dat->size, args[0].stencil->stride) - offs[0][1] - offs[0][0];
offs[1][0] = args[1].stencil->stride[0]*1; //unit step in x dimension
offs[1][1] = off3D(1, &start[0],
&end[0],args[1].dat->size, args[1].stencil->stride) - offs[1][0];
offs[1][2] = off3D(2, &start[0],
&end[0],args[1].dat->size, args[1].stencil->stride) - offs[1][1] - offs[1][0];
offs[2][0] = args[2].stencil->stride[0]*1; //unit step in x dimension
offs[2][1] = off3D(1, &start[0],
&end[0],args[2].dat->size, args[2].stencil->stride) - offs[2][0];
offs[2][2] = off3D(2, &start[0],
&end[0],args[2].dat->size, args[2].stencil->stride) - offs[2][1] - offs[2][0];
offs[3][0] = args[3].stencil->stride[0]*1; //unit step in x dimension
offs[3][1] = off3D(1, &start[0],
&end[0],args[3].dat->size, args[3].stencil->stride) - offs[3][0];
offs[3][2] = off3D(2, &start[0],
&end[0],args[3].dat->size, args[3].stencil->stride) - offs[3][1] - offs[3][0];
offs[4][0] = args[4].stencil->stride[0]*1; //unit step in x dimension
offs[4][1] = off3D(1, &start[0],
&end[0],args[4].dat->size, args[4].stencil->stride) - offs[4][0];
offs[4][2] = off3D(2, &start[0],
&end[0],args[4].dat->size, args[4].stencil->stride) - offs[4][1] - offs[4][0];
offs[5][0] = args[5].stencil->stride[0]*1; //unit step in x dimension
offs[5][1] = off3D(1, &start[0],
&end[0],args[5].dat->size, args[5].stencil->stride) - offs[5][0];
offs[5][2] = off3D(2, &start[0],
&end[0],args[5].dat->size, args[5].stencil->stride) - offs[5][1] - offs[5][0];
offs[6][0] = args[6].stencil->stride[0]*1; //unit step in x dimension
offs[6][1] = off3D(1, &start[0],
&end[0],args[6].dat->size, args[6].stencil->stride) - offs[6][0];
offs[6][2] = off3D(2, &start[0],
&end[0],args[6].dat->size, args[6].stencil->stride) - offs[6][1] - offs[6][0];
offs[7][0] = args[7].stencil->stride[0]*1; //unit step in x dimension
offs[7][1] = off3D(1, &start[0],
&end[0],args[7].dat->size, args[7].stencil->stride) - offs[7][0];
offs[7][2] = off3D(2, &start[0],
&end[0],args[7].dat->size, args[7].stencil->stride) - offs[7][1] - offs[7][0];
offs[8][0] = args[8].stencil->stride[0]*1; //unit step in x dimension
offs[8][1] = off3D(1, &start[0],
//......... part of the code is omitted here .........
示例14: ops_par_loop_field_summary_kernel
// Host stub for the field_summary_kernel parallel loop (11 ops_arg arguments).
// The part shown here handles the optional checkpointing hook, records
// diagnostics in kernel record 49 when OPS_diags > 1, derives the local 2D
// iteration bounds (start/end) from `range`, and fills offs[i][d] with a step
// value per argument i and dimension d. The remainder of the function is
// omitted in this listing.
void ops_par_loop_field_summary_kernel(char const *name, ops_block block,
int dim, int *range, ops_arg arg0,
ops_arg arg1, ops_arg arg2, ops_arg arg3,
ops_arg arg4, ops_arg arg5, ops_arg arg6,
ops_arg arg7, ops_arg arg8, ops_arg arg9,
ops_arg arg10) {
// Timing
double t1, t2, c1, c2;
int offs[11][2];
ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5,
arg6, arg7, arg8, arg9, arg10};
#ifdef CHECKPOINTING
// Return without running the loop when ops_checkpointing_before yields false.
if (!ops_checkpointing_before(args, 11, range, 49))
return;
#endif
if (OPS_diags > 1) {
// NOTE(review): presumably makes sure OPS_kernels has a slot 49 named
// "field_summary_kernel" before it is indexed below - confirm against OPS.
ops_timing_realloc(49, "field_summary_kernel");
OPS_kernels[49].count++;
ops_timers_core(&c1, &t1);
}
#ifdef OPS_MPI
// Sub-block entry for this block, used by the range computation below.
sub_block_list sb = OPS_sub_block_list[block->index];
#endif
// compute locally allocated range for the sub-block
int start[2];
int end[2];
int arg_idx[2];
#ifdef OPS_MPI
// Nothing to do when this sub-block is not owned.
if (!sb->owned)
return;
for (int n = 0; n < 2; n++) {
// Begin with the sub-block extent [decomp_disp, decomp_disp + decomp_size).
start[n] = sb->decomp_disp[n];
end[n] = sb->decomp_disp[n] + sb->decomp_size[n];
// Lower bound relative to decomp_disp: 0 when the sub-block begins at or
// after range[2 * n], otherwise the distance from decomp_disp to range[2 * n].
if (start[n] >= range[2 * n]) {
start[n] = 0;
} else {
start[n] = range[2 * n] - start[n];
}
// When id_m[n] is MPI_PROC_NULL and the range start is negative, the range
// start is used as-is (presumably: no neighbour on the low side - confirm).
if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0)
start[n] = range[2 * n];
// Upper bound relative to decomp_disp: cut at range[2 * n + 1] when the
// sub-block reaches it, otherwise the full decomp_size.
if (end[n] >= range[2 * n + 1]) {
end[n] = range[2 * n + 1] - sb->decomp_disp[n];
} else {
end[n] = sb->decomp_size[n];
}
// When id_p[n] is MPI_PROC_NULL and the range extends past the sub-block,
// add the overshoot to the upper bound.
if (sb->id_p[n] == MPI_PROC_NULL &&
(range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n]))
end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]);
// Never let the upper bound fall below the lower bound.
if (end[n] < start[n])
end[n] = start[n];
}
#else
// Without MPI the bounds are the {lo, hi} pairs of range, unchanged.
for (int n = 0; n < 2; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
}
#endif
#ifdef OPS_DEBUG
ops_register_args(args, "field_summary_kernel");
#endif
// The same two assignments repeat for each argument i:
//   offs[i][0] = stencil stride in x,
//   offs[i][1] = off2D(1, ...) minus offs[i][0].
// NOTE(review): off2D is defined elsewhere; its exact meaning is not visible
// here.
offs[0][0] = args[0].stencil->stride[0] * 1; // unit step in x dimension
offs[0][1] =
off2D(1, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) -
offs[0][0];
offs[1][0] = args[1].stencil->stride[0] * 1; // unit step in x dimension
offs[1][1] =
off2D(1, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) -
offs[1][0];
offs[2][0] = args[2].stencil->stride[0] * 1; // unit step in x dimension
offs[2][1] =
off2D(1, &start[0], &end[0], args[2].dat->size, args[2].stencil->stride) -
offs[2][0];
offs[3][0] = args[3].stencil->stride[0] * 1; // unit step in x dimension
offs[3][1] =
off2D(1, &start[0], &end[0], args[3].dat->size, args[3].stencil->stride) -
offs[3][0];
offs[4][0] = args[4].stencil->stride[0] * 1; // unit step in x dimension
offs[4][1] =
off2D(1, &start[0], &end[0], args[4].dat->size, args[4].stencil->stride) -
offs[4][0];
offs[5][0] = args[5].stencil->stride[0] * 1; // unit step in x dimension
offs[5][1] =
off2D(1, &start[0], &end[0], args[5].dat->size, args[5].stencil->stride) -
offs[5][0];
//......... part of the code is omitted here .........
示例15: ops_par_loop_test_kernel
// host stub function
void ops_par_loop_test_kernel(char const *name, ops_block block, int dim,
int *range, ops_arg arg0, ops_arg arg1) {
// Timing
double t1, t2, c1, c2;
char *p_a[2];
int offs[2][1];
ops_arg args[2] = {arg0, arg1};
#ifdef CHECKPOINTING
if (!ops_checkpointing_before(args, 2, range, 14))
return;
#endif
if (OPS_diags > 1) {
ops_timing_realloc(14, "test_kernel");
OPS_kernels[14].count++;
ops_timers_core(&c2, &t2);
}
// compute locally allocated range for the sub-block
int start[1];
int end[1];
#ifdef OPS_MPI
sub_block_list sb = OPS_sub_block_list[block->index];
#endif
#ifdef OPS_DEBUG
ops_register_args(args, "test_kernel");
#endif
int arg_idx[1];
int arg_idx_base[1];
#ifdef OPS_MPI
if (compute_ranges(args, 2, block, range, start, end, arg_idx) < 0)
return;
#else // OPS_MPI
for (int n = 0; n < 1; n++) {
start[n] = range[2 * n];
end[n] = range[2 * n + 1];
arg_idx[n] = start[n];
}
#endif // OPS_MPI
for (int n = 0; n < 1; n++) {
arg_idx_base[n] = arg_idx[n];
}
offs[0][0] = args[0].stencil->stride[0] * 1; // unit step in x dimension
int off0_0 = offs[0][0];
int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size);
// set up initial pointers and exchange halos if necessary
int base0 = args[0].dat->base_offset +
(OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) *
start[0] * args[0].stencil->stride[0];
p_a[0] = (char *)args[0].data + base0;
#ifdef OPS_MPI
p_a[1] = ((ops_reduction)args[1].data)->data +
((ops_reduction)args[1].data)->size * block->index;
#else
p_a[1] = ((ops_reduction)args[1].data)->data;
#endif
// initialize global variable with the dimension of dats
xdim0 = args[0].dat->size[0];
// Halo Exchanges
ops_H_D_exchanges_host(args, 2);
ops_halo_exchanges(args, 2, range);
ops_H_D_exchanges_host(args, 2);
if (OPS_diags > 1) {
ops_timers_core(&c1, &t1);
OPS_kernels[14].mpi_time += t1 - t2;
}
int n_x;
#pragma novector
for (n_x = start[0];
n_x < start[0] + ((end[0] - start[0]) / SIMD_VEC) * SIMD_VEC;
n_x += SIMD_VEC) {
// call kernel function, passing in pointers to data -vectorised
for (int i = 0; i < SIMD_VEC; i++) {
test_kernel((double *)p_a[0] + i * 1 * 1, (double *)p_a[1]);
}
// shift pointers to data x direction
p_a[0] = p_a[0] + (dat0 * off0_0) * SIMD_VEC;
}
for (int n_x = start[0] + ((end[0] - start[0]) / SIMD_VEC) * SIMD_VEC;
n_x < end[0]; n_x++) {
// call kernel function, passing in pointers to data - remainder
test_kernel((double *)p_a[0], (double *)p_a[1]);
// shift pointers to data x direction
p_a[0] = p_a[0] + (dat0 * off0_0);
//.........这里部分代码省略.........