本文整理汇总了C++中Binary::init方法的典型用法代码示例。如果您正苦于以下问题:C++ Binary::init方法的具体用法?C++ Binary::init怎么用?C++ Binary::init使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Binary
的用法示例。
在下文中一共展示了Binary::init方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: get_scan_dim_kernels
static Kernel get_scan_dim_kernels(int kerIdx, int dim, bool isFinalPass, uint threads_y)
{
std::string ref_name =
std::string("scan_") +
std::to_string(dim) +
std::string("_") +
std::to_string(isFinalPass) +
std::string("_") +
std::string(dtype_traits<Ti>::getName()) +
std::string("_") +
std::string(dtype_traits<To>::getName()) +
std::string("_") +
std::to_string(op) +
std::string("_") +
std::to_string(threads_y) +
std::string("_") +
std::to_string(int(inclusive_scan));
int device = getActiveDeviceId();
kc_entry_t entry = kernelCache(device, ref_name);
if (entry.prog==0 && entry.ker==0) {
Binary<To, op> scan;
ToNumStr<To> toNumStr;
std::ostringstream options;
options << " -D To=" << dtype_traits<To>::getName()
<< " -D Ti=" << dtype_traits<Ti>::getName()
<< " -D T=To"
<< " -D dim=" << dim
<< " -D DIMY=" << threads_y
<< " -D THREADS_X=" << THREADS_X
<< " -D init=" << toNumStr(scan.init())
<< " -D " << binOpName<op>()
<< " -D CPLX=" << af::iscplx<Ti>()
<< " -D isFinalPass=" << (int)(isFinalPass)
<< " -D inclusive_scan=" << inclusive_scan;
if (std::is_same<Ti, double>::value ||
std::is_same<Ti, cdouble>::value) {
options << " -D USE_DOUBLE";
}
const char *ker_strs[] = {ops_cl, scan_dim_cl};
const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
cl::Program prog;
buildProgram(prog, 2, ker_strs, ker_lens, options.str());
entry.prog = new Program(prog);
entry.ker = new Kernel[2];
entry.ker[0] = Kernel(*entry.prog, "scan_dim_kernel");
entry.ker[1] = Kernel(*entry.prog, "bcast_dim_kernel");
addKernelToCache(device, ref_name, entry);
}
return entry.ker[kerIdx];
}
示例2: reduce_all
To reduce_all(const Array<Ti> &in)
{
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
To out = reduce.init();
// Decrement dimension of select dimension
af::dim4 dims = in.dims();
af::dim4 strides = in.strides();
const Ti *inPtr = in.get();
for(dim_t l = 0; l < dims[3]; l++) {
dim_t off3 = l * strides[3];
for(dim_t k = 0; k < dims[2]; k++) {
dim_t off2 = k * strides[2];
for(dim_t j = 0; j < dims[1]; j++) {
dim_t off1 = j * strides[1];
for(dim_t i = 0; i < dims[0]; i++) {
dim_t idx = i + off1 + off2 + off3;
To val = transform(inPtr[idx]);
out = reduce(val, out);
}
}
}
}
return out;
}
示例3: operator
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
const Ti *in , const dim4 &istrides, const dim4 &idims,
const int dim)
{
dim_t stride = istrides[dim];
To out_val = reduce.init();
for (dim_t i = 0; i < idims[dim]; i++) {
To in_val = transform(in[i * stride]);
out_val = reduce(in_val, out_val);
}
*out = out_val;
}
示例4: operator
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
const Ti *in , const dim4 &istrides, const dim4 &idims,
const int dim, bool change_nan, double nanval)
{
dim_t stride = istrides[dim];
To out_val = reduce.init();
for (dim_t i = 0; i < idims[dim]; i++) {
To in_val = transform(in[i * stride]);
if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
out_val = reduce(in_val, out_val);
}
*out = out_val;
}
示例5: get_scan_dim_kernels
static Kernel* get_scan_dim_kernels(int kerIdx)
{
try {
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
static std::map<int, Program*> scanProgs;
static std::map<int, Kernel*> scanKerns;
static std::map<int, Kernel*> bcastKerns;
int device= getActiveDeviceId();
std::call_once(compileFlags[device], [device] () {
Binary<To, op> scan;
ToNum<To> toNum;
std::ostringstream options;
options << " -D To=" << dtype_traits<To>::getName()
<< " -D Ti=" << dtype_traits<Ti>::getName()
<< " -D T=To"
<< " -D dim=" << dim
<< " -D DIMY=" << threads_y
<< " -D THREADS_X=" << THREADS_X
<< " -D init=" << toNum(scan.init())
<< " -D " << binOpName<op>()
<< " -D CPLX=" << af::iscplx<Ti>()
<< " -D isFinalPass=" << (int)(isFinalPass);
if (std::is_same<Ti, double>::value ||
std::is_same<Ti, cdouble>::value) {
options << " -D USE_DOUBLE";
}
const char *ker_strs[] = {ops_cl, scan_dim_cl};
const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
cl::Program prog;
buildProgram(prog, 2, ker_strs, ker_lens, options.str());
scanProgs[device] = new Program(prog);
scanKerns[device] = new Kernel(*scanProgs[device], "scan_dim_kernel");
bcastKerns[device] = new Kernel(*scanProgs[device], "bcast_dim_kernel");
});
return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
} catch (cl::Error err) {
CL_TO_AF_ERROR(err);
throw;
}
}
示例6: operator
void operator()(To *out, const dim4 ostrides, const dim4 odims,
const Ti *in , const dim4 istrides, const dim4 idims,
const int dim)
{
dim_type istride = istrides[dim];
dim_type ostride = ostrides[dim];
Transform<Ti, To, op> transform;
// FIXME: Change the name to something better
Binary<To, op> scan;
To out_val = scan.init();
for (dim_type i = 0; i < idims[dim]; i++) {
To in_val = transform(in[i * istride]);
out_val = scan(in_val, out_val);
out[i * ostride] = out_val;
}
}
示例7: get_scan_dim_kernels
static Kernel get_scan_dim_kernels(int kerIdx)
{
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
static Program scanProgs[DeviceManager::MAX_DEVICES];
static Kernel scanKerns[DeviceManager::MAX_DEVICES];
static Kernel bcastKerns[DeviceManager::MAX_DEVICES];
int device= getActiveDeviceId();
std::call_once(compileFlags[device], [device] () {
Binary<To, op> scan;
ToNum<To> toNum;
std::ostringstream options;
options << " -D To=" << dtype_traits<To>::getName()
<< " -D Ti=" << dtype_traits<Ti>::getName()
<< " -D T=To"
<< " -D dim=" << dim
<< " -D DIMY=" << threads_y
<< " -D THREADS_X=" << THREADS_X
<< " -D init=" << toNum(scan.init())
<< " -D " << binOpName<op>()
<< " -D CPLX=" << af::iscplx<Ti>()
<< " -D isFinalPass=" << (int)(isFinalPass);
const char *ker_strs[] = {ops_cl, scan_dim_cl};
const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
buildProgram(scanProgs[device], 2, ker_strs, ker_lens, options.str());
scanKerns[device] = Kernel(scanProgs[device], "scan_dim_kernel");
bcastKerns[device] = Kernel(scanProgs[device], "bcast_dim_kernel");
});
return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
}
示例8: reduce_dim_kernel
__global__
static void reduce_dim_kernel(Param<To> out,
CParam <Ti> in,
uint blocks_x, uint blocks_y, uint offset_dim,
bool change_nan, To nanval)
{
const uint tidx = threadIdx.x;
const uint tidy = threadIdx.y;
const uint tid = tidy * THREADS_X + tidx;
const uint zid = blockIdx.x / blocks_x;
const uint wid = blockIdx.y / blocks_y;
const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
const uint xid = blockIdx_x * blockDim.x + tidx;
const uint yid = blockIdx_y; // yid of output. updated for input later.
uint ids[4] = {xid, yid, zid, wid};
const Ti *iptr = in.ptr;
To *optr = out.ptr;
// There is only one element per block for out
// There are blockDim.y elements per block for in
// Hence increment ids[dim] just after offseting out and before offsetting in
optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
const uint blockIdx_dim = ids[dim];
ids[dim] = ids[dim] * blockDim.y + tidy;
iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
const uint id_dim_in = ids[dim];
const uint istride_dim = in.strides[dim];
bool is_valid =
(ids[0] < in.dims[0]) &&
(ids[1] < in.dims[1]) &&
(ids[2] < in.dims[2]) &&
(ids[3] < in.dims[3]);
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
__shared__ To s_val[THREADS_X * DIMY];
To out_val = reduce.init();
for (int id = id_dim_in; is_valid && (id < in.dims[dim]); id += offset_dim * blockDim.y) {
To in_val = transform(*iptr);
if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = reduce(in_val, out_val);
iptr = iptr + offset_dim * blockDim.y * istride_dim;
}
s_val[tid] = out_val;
To *s_ptr = s_val + tid;
__syncthreads();
if (DIMY == 8) {
if (tidy < 4) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 4]);
__syncthreads();
}
if (DIMY >= 4) {
if (tidy < 2) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 2]);
__syncthreads();
}
if (DIMY >= 2) {
if (tidy < 1) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 1]);
__syncthreads();
}
if (tidy == 0 && is_valid &&
(blockIdx_dim < out.dims[dim])) {
*optr = *s_ptr;
}
}
示例9: scan_dim_nonfinal_kernel
__global__
static void scan_dim_nonfinal_kernel(Param<To> out,
Param<To> tmp,
Param<char> tflg,
Param<int> tlid,
CParam<Ti> in,
CParam<Tk> key,
int dim,
uint blocks_x,
uint blocks_y,
uint lim,
bool inclusive_scan)
{
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int tid = tidy * THREADS_X + tidx;
const int zid = blockIdx.x / blocks_x;
const int wid = blockIdx.y / blocks_y;
const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
const int xid = blockIdx_x * blockDim.x + tidx;
const int yid = blockIdx_y; // yid of output. updated for input later.
int ids[4] = {xid, yid, zid, wid};
const Ti *iptr = in.ptr;
const Tk *kptr = key.ptr;
To *optr = out.ptr;
To *tptr = tmp.ptr;
char *tfptr = tflg.ptr;
int *tiptr = tlid.ptr;
// There is only one element per block for out
// There are blockDim.y elements per block for in
// Hence increment ids[dim] just after offseting out and before offsetting in
tptr += ids[3] * tmp.strides[3] + ids[2] * tmp.strides[2] + ids[1] * tmp.strides[1] + ids[0];
tfptr += ids[3] * tflg.strides[3] + ids[2] * tflg.strides[2] + ids[1] * tflg.strides[1] + ids[0];
tiptr += ids[3] * tlid.strides[3] + ids[2] * tlid.strides[2] + ids[1] * tlid.strides[1] + ids[0];
const int blockIdx_dim = ids[dim];
ids[dim] = ids[dim] * blockDim.y * lim + tidy;
optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
kptr += ids[3] * key.strides[3] + ids[2] * key.strides[2] + ids[1] * key.strides[1] + ids[0];
int id_dim = ids[dim];
const int out_dim = out.dims[dim];
bool is_valid =
(ids[0] < out.dims[0]) &&
(ids[1] < out.dims[1]) &&
(ids[2] < out.dims[2]) &&
(ids[3] < out.dims[3]);
const int ostride_dim = out.strides[dim];
const int istride_dim = in.strides[dim];
__shared__ char s_flg[THREADS_X * DIMY * 2];
__shared__ To s_val[THREADS_X * DIMY * 2];
__shared__ char s_ftmp[THREADS_X];
__shared__ To s_tmp[THREADS_X];
__shared__ int boundaryid[THREADS_X];
To *sptr = s_val + tid;
char *sfptr = s_flg + tid;
Transform<Ti, To, op> transform;
Binary<To, op> binop;
const To init = binop.init();
To val = init;
const bool isLast = (tidy == (DIMY - 1));
if (isLast) {
s_tmp[tidx] = val;
s_ftmp[tidx] = 0;
boundaryid[tidx] = -1;
}
__syncthreads();
char flag = 0;
for (int k = 0; k < lim; k++) {
if (id_dim < out_dim) {
flag = calculate_head_flags_dim(kptr, id_dim, key.strides[dim]);
} else {
flag = 0;
}
//Load val from global in
if (inclusive_scan) {
if (id_dim >= out_dim) {
val = init;
} else {
val = transform(*iptr);
}
} else {
if ((id_dim == 0) || (id_dim >= out_dim) || flag) {
val = init;
} else {
val = transform(*(iptr - istride_dim));
//.........这里部分代码省略.........
示例10: scan_dim_kernel
__global__
static void scan_dim_kernel(Param<To> out,
Param<To> tmp,
CParam<Ti> in,
uint blocks_x,
uint blocks_y,
uint blocks_dim,
uint lim)
{
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int tid = tidy * THREADS_X + tidx;
const int zid = blockIdx.x / blocks_x;
const int wid = blockIdx.y / blocks_y;
const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
const int xid = blockIdx_x * blockDim.x + tidx;
const int yid = blockIdx_y; // yid of output. updated for input later.
int ids[4] = {xid, yid, zid, wid};
const Ti *iptr = in.ptr;
To *optr = out.ptr;
To *tptr = tmp.ptr;
// There is only one element per block for out
// There are blockDim.y elements per block for in
// Hence increment ids[dim] just after offseting out and before offsetting in
tptr += ids[3] * tmp.strides[3] + ids[2] * tmp.strides[2] + ids[1] * tmp.strides[1] + ids[0];
const int blockIdx_dim = ids[dim];
ids[dim] = ids[dim] * blockDim.y * lim + tidy;
optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
int id_dim = ids[dim];
const int out_dim = out.dims[dim];
bool is_valid =
(ids[0] < out.dims[0]) &&
(ids[1] < out.dims[1]) &&
(ids[2] < out.dims[2]) &&
(ids[3] < out.dims[3]);
const int ostride_dim = out.strides[dim];
const int istride_dim = in.strides[dim];
__shared__ To s_val[THREADS_X * DIMY * 2];
__shared__ To s_tmp[THREADS_X];
To *sptr = s_val + tid;
Transform<Ti, To, op> transform;
Binary<To, op> binop;
const To init = binop.init();
To val = init;
const bool isLast = (tidy == (DIMY - 1));
for (int k = 0; k < lim; k++) {
if (isLast) s_tmp[tidx] = val;
bool cond = (is_valid) && (id_dim < out_dim);
val = cond ? transform(*iptr) : init;
*sptr = val;
__syncthreads();
int start = 0;
#pragma unroll
for (int off = 1; off < DIMY; off *= 2) {
if (tidy >= off) val = binop(val, sptr[(start - off) * THREADS_X]);
start = DIMY - start;
sptr[start * THREADS_X] = val;
__syncthreads();
}
val = binop(val, s_tmp[tidx]);
__syncthreads();
if (cond) *optr = val;
id_dim += blockDim.y;
iptr += blockDim.y * istride_dim;
optr += blockDim.y * ostride_dim;
}
if (!isFinalPass &&
is_valid &&
(blockIdx_dim < tmp.dims[dim]) &&
isLast) {
*tptr = val;
}
}
示例11: ireduce_dim_kernel
__global__
static void ireduce_dim_kernel(Param<T> out, uint *olptr,
CParam <T> in, const uint *ilptr,
uint blocks_x, uint blocks_y, uint offset_dim)
{
const uint tidx = threadIdx.x;
const uint tidy = threadIdx.y;
const uint tid = tidy * THREADS_X + tidx;
const uint zid = blockIdx.x / blocks_x;
const uint wid = blockIdx.y / blocks_y;
const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
const uint xid = blockIdx_x * blockDim.x + tidx;
const uint yid = blockIdx_y; // yid of output. updated for input later.
uint ids[4] = {xid, yid, zid, wid};
const T *iptr = in.ptr;
T *optr = out.ptr;
// There is only one element per block for out
// There are blockDim.y elements per block for in
// Hence increment ids[dim] just after offseting out and before offsetting in
optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
olptr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
const uint blockIdx_dim = ids[dim];
ids[dim] = ids[dim] * blockDim.y + tidy;
iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
if (!is_first) ilptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
const uint id_dim_in = ids[dim];
const uint istride_dim = in.strides[dim];
bool is_valid =
(ids[0] < in.dims[0]) &&
(ids[1] < in.dims[1]) &&
(ids[2] < in.dims[2]) &&
(ids[3] < in.dims[3]);
Binary<T, op> ireduce;
T val = ireduce.init();
uint idx = id_dim_in;
if (is_valid && id_dim_in < in.dims[dim]) {
val = *iptr;
if (!is_first) idx = *ilptr;
}
MinMaxOp<op, T> Op(val, idx);
const uint id_dim_in_start = id_dim_in + offset_dim * blockDim.y;
__shared__ T s_val[THREADS_X * DIMY];
__shared__ uint s_idx[THREADS_X * DIMY];
for (int id = id_dim_in_start;
is_valid && (id < in.dims[dim]);
id += offset_dim * blockDim.y) {
iptr = iptr + offset_dim * blockDim.y * istride_dim;
if (!is_first) {
ilptr = ilptr + offset_dim * blockDim.y * istride_dim;
Op(*iptr, *ilptr);
} else {
Op(*iptr, id);
}
}
s_val[tid] = Op.m_val;
s_idx[tid] = Op.m_idx;
T *s_vptr = s_val + tid;
uint *s_iptr = s_idx + tid;
__syncthreads();
if (DIMY == 8) {
if (tidy < 4) {
Op(s_vptr[THREADS_X * 4], s_iptr[THREADS_X * 4]);
*s_vptr = Op.m_val;
*s_iptr = Op.m_idx;
}
__syncthreads();
}
if (DIMY >= 4) {
if (tidy < 2) {
Op(s_vptr[THREADS_X * 2], s_iptr[THREADS_X * 2]);
*s_vptr = Op.m_val;
*s_iptr = Op.m_idx;
}
__syncthreads();
}
if (DIMY >= 2) {
if (tidy < 1) {
Op(s_vptr[THREADS_X * 1], s_iptr[THREADS_X * 1]);
*s_vptr = Op.m_val;
//.........这里部分代码省略.........
示例12: ireduce_first_kernel
__global__
static void ireduce_first_kernel(Param<T> out, uint *olptr,
CParam<T> in, const uint *ilptr,
uint blocks_x, uint blocks_y, uint repeat)
{
const uint tidx = threadIdx.x;
const uint tidy = threadIdx.y;
const uint tid = tidy * blockDim.x + tidx;
const uint zid = blockIdx.x / blocks_x;
const uint wid = blockIdx.y / blocks_y;
const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
const uint xid = blockIdx_x * blockDim.x * repeat + tidx;
const uint yid = blockIdx_y * blockDim.y + tidy;
const T *iptr = in.ptr;
T *optr = out.ptr;
iptr += wid * in.strides[3] + zid * in.strides[2] + yid * in.strides[1];
optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];
if (!is_first) ilptr += wid * in.strides[3] + zid * in.strides[2] + yid * in.strides[1];
olptr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];
if (yid >= in.dims[1] ||
zid >= in.dims[2] ||
wid >= in.dims[3]) return;
int lim = min((int)(xid + repeat * DIMX), in.dims[0]);
Binary<T, op> ireduce;
T val = ireduce.init();
uint idx = xid;
if (xid < lim) {
val = iptr[xid];
if (!is_first) idx = ilptr[xid];
}
MinMaxOp<op, T> Op(val, idx);
__shared__ T s_val[THREADS_PER_BLOCK];
__shared__ uint s_idx[THREADS_PER_BLOCK];
for (int id = xid + DIMX; id < lim; id += DIMX) {
Op(iptr[id], (!is_first) ? ilptr[id] : id);
}
s_val[tid] = Op.m_val;
s_idx[tid] = Op.m_idx;
__syncthreads();
T *s_vptr = s_val + tidy * DIMX;
uint *s_iptr = s_idx + tidy * DIMX;
if (DIMX == 256) {
if (tidx < 128) {
Op(s_vptr[tidx + 128], s_iptr[tidx + 128]);
s_vptr[tidx] = Op.m_val;
s_iptr[tidx] = Op.m_idx;
}
__syncthreads();
}
if (DIMX >= 128) {
if (tidx < 64) {
Op(s_vptr[tidx + 64], s_iptr[tidx + 64]);
s_vptr[tidx] = Op.m_val;
s_iptr[tidx] = Op.m_idx;
}
__syncthreads();
}
if (DIMX >= 64) {
if (tidx < 32) {
Op(s_vptr[tidx + 32], s_iptr[tidx + 32]);
s_vptr[tidx] = Op.m_val;
s_iptr[tidx] = Op.m_idx;
}
__syncthreads();
}
warp_reduce<T, op>(s_vptr, s_iptr, tidx);
if (tidx == 0) {
optr[blockIdx_x] = s_vptr[0];
olptr[blockIdx_x] = s_iptr[0];
}
}
示例13: mean_first_launcher
void mean_first_launcher(Param out, Param owt,
Param in, Param inWeight,
const int threads_x,
const uint groups_x,
const uint groups_y)
{
bool input_weight = ((inWeight.info.dims[0] *
inWeight.info.dims[1] *
inWeight.info.dims[2] *
inWeight.info.dims[3]) != 0);
bool output_weight = (( owt.info.dims[0] *
owt.info.dims[1] *
owt.info.dims[2] *
owt.info.dims[3]) != 0);
std::string ref_name =
std::string("mean_0_") +
std::string(dtype_traits<Ti>::getName()) +
std::string("_") +
std::string(dtype_traits<Tw>::getName()) +
std::string("_") +
std::string(dtype_traits<To>::getName()) +
std::string("_") +
std::to_string(threads_x) +
std::string("_") +
std::to_string(input_weight) +
std::string("_") +
std::to_string(output_weight);
int device = getActiveDeviceId();
kc_entry_t entry = kernelCache(device, ref_name);
if (entry.prog==0 && entry.ker==0) {
Binary<To, af_add_t> mean;
ToNumStr<To> toNumStr;
ToNumStr<Tw> twNumStr;
Transform<uint, Tw, af_add_t> transform_weight;
std::ostringstream options;
options << " -D Ti=" << dtype_traits<Ti>::getName()
<< " -D Tw=" << dtype_traits<Tw>::getName()
<< " -D To=" << dtype_traits<To>::getName()
<< " -D DIMX=" << threads_x
<< " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
<< " -D init_To=" << toNumStr(mean.init())
<< " -D init_Tw=" << twNumStr(transform_weight(0))
<< " -D one_Tw=" << twNumStr(transform_weight(1));
if (input_weight) { options << " -D INPUT_WEIGHT"; }
if (output_weight) { options << " -D OUTPUT_WEIGHT"; }
if (std::is_same<Ti, double>::value ||
std::is_same<Ti, cdouble>::value ||
std::is_same<To, double>::value) {
options << " -D USE_DOUBLE";
}
const char *ker_strs[] = {mean_ops_cl, mean_first_cl};
const int ker_lens[] = {mean_ops_cl_len, mean_first_cl_len};
Program prog;
buildProgram(prog, 2, ker_strs, ker_lens, options.str());
entry.prog = new Program(prog);
entry.ker = new Kernel(*entry.prog, "mean_first_kernel");
addKernelToCache(device, ref_name, entry);
}
NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
NDRange global(groups_x * in.info.dims[2] * local[0],
groups_y * in.info.dims[3] * local[1]);
uint repeat = divup(in.info.dims[0], (local[0] * groups_x));
if (input_weight && output_weight) {
auto meanOp = KernelFunctor<
Buffer, KParam,
Buffer, KParam,
Buffer, KParam,
Buffer, KParam,
uint, uint, uint>(*entry.ker);
meanOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
*owt.data, owt.info,
*in.data, in.info,
*inWeight.data, inWeight.info,
groups_x, groups_y, repeat);
} else if (!input_weight && !output_weight) {
auto meanOp = KernelFunctor<
Buffer, KParam,
Buffer, KParam,
uint, uint, uint>(*entry.ker);
meanOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
*in.data, in.info,
groups_x, groups_y, repeat);
} else if ( input_weight && !output_weight) {
//.........这里部分代码省略.........
示例14: scan_first_kernel
__global__
static void scan_first_kernel(Param<To> out,
Param<To> tmp,
CParam<Ti> in,
uint blocks_x,
uint blocks_y,
uint lim)
{
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int zid = blockIdx.x / blocks_x;
const int wid = blockIdx.y / blocks_y;
const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
const int xid = blockIdx_x * blockDim.x * lim + tidx;
const int yid = blockIdx_y * blockDim.y + tidy;
bool cond_yzw = (yid < out.dims[1]) && (zid < out.dims[2]) && (wid < out.dims[3]);
if (!cond_yzw) return; // retire warps early
const Ti *iptr = in.ptr;
To *optr = out.ptr;
To *tptr = tmp.ptr;
iptr += wid * in.strides[3] + zid * in.strides[2] + yid * in.strides[1];
optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];
tptr += wid * tmp.strides[3] + zid * tmp.strides[2] + yid * tmp.strides[1];
const int DIMY = THREADS_PER_BLOCK / DIMX;
const int SHARED_MEM_SIZE = (2 * DIMX + 1) * (DIMY);
__shared__ To s_val[SHARED_MEM_SIZE];
__shared__ To s_tmp[DIMY];
To *sptr = s_val + tidy * (2 * DIMX + 1);
Transform<Ti, To, op> transform;
Binary<To, op> binop;
const To init = binop.init();
int id = xid;
To val = init;
const bool isLast = (tidx == (DIMX - 1));
for (int k = 0; k < lim; k++) {
if (isLast) s_tmp[tidy] = val;
bool cond = ((id < out.dims[0]));
val = cond ? transform(iptr[id]) : init;
sptr[tidx] = val;
__syncthreads();
int start = 0;
#pragma unroll
for (int off = 1; off < DIMX; off *= 2) {
if (tidx >= off) val = binop(val, sptr[(start - off) + tidx]);
start = DIMX - start;
sptr[start + tidx] = val;
__syncthreads();
}
val = binop(val, s_tmp[tidy]);
if (cond) optr[id] = val;
id += blockDim.x;
__syncthreads();
}
if (!isFinalPass && isLast) {
tptr[blockIdx_x] = val;
}
}