本文整理汇总了C++中SharedMemory::getPointer方法的典型用法代码示例。如果您正苦于以下问题：C++ SharedMemory::getPointer方法的具体用法？C++ SharedMemory::getPointer怎么用？C++ SharedMemory::getPointer使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类SharedMemory的用法示例。
在下文中一共展示了SharedMemory::getPointer方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: susanKernel
// SUSAN-style response kernel.
//
// Each thread owns one "nucleus" pixel at (gx, gy), offset from the image
// origin by `edge`.  The block first stages a shrdLen x shrdLen tile of the
// input (block footprint plus a `radius`-wide apron) into the buffer returned
// by SharedMemory<T>, then every thread sums exp(-((m - c)/t)^6) over the
// pixels m of its circular window (i*i + j*j < radius*radius) and writes
// g - sum when the sum is below g, otherwise 0.
//
// Parameters:
//   out, in      : output / input images, indexed as [y*idim0 + x]
//   idim0, idim1 : image extent along x and y
//   radius       : window radius in pixels
//   t            : brightness difference threshold (divisor inside the exponent)
//   g            : geometric threshold the accumulated sum is compared with
//   edge         : border width excluded from the output on every side
//
// NOTE(review): T, BLOCK_X and BLOCK_Y are not declared in this excerpt
// (presumably a template parameter and compile-time block dimensions); the
// tile loops stride by BLOCK_X/BLOCK_Y, so the launch is assumed to use
// blockDim == (BLOCK_X, BLOCK_Y) and to provide shrdLen*shrdLen elements of
// scratch space — confirm against the host wrapper.
__global__
void susanKernel(T* out, const T* in,
                 const unsigned idim0, const unsigned idim1,
                 const unsigned radius, const float t, const float g,
                 const unsigned edge)
{
    const int rSqrd   = radius*radius;
    const int windLen = 2*radius+1;
    const int shrdLen = BLOCK_X + windLen-1;

    SharedMemory<T> shared;
    T* shrdMem = shared.getPointer();

    const unsigned lx = threadIdx.x;
    const unsigned ly = threadIdx.y;
    const unsigned gx = blockDim.x * blockIdx.x + lx + edge;
    const unsigned gy = blockDim.y * blockIdx.y + ly + edge;

    // Nucleus value for this thread; zero when the nucleus lies outside the
    // image.  It is kept in a register and NOT re-read from the tile: other
    // threads may already be storing to the same tile slot in the fill loop
    // below, and there is no barrier in between.
    const unsigned nucleusIdx = (ly+radius)*shrdLen + lx+radius;
    const T m_0 = (gx<idim0 && gy<idim1) ? in[gy*idim0+gx] : T(0);
    shrdMem[nucleusIdx] = m_0;

    // Cooperative tile fill.  The int-vs-unsigned comparisons are deliberate:
    // a negative source coordinate converts to a huge unsigned value, fails
    // the bound check and the slot receives m_0 instead of an out-of-range
    // read.
#pragma unroll
    for (int b=ly, gy2=gy; b<shrdLen; b+=BLOCK_Y, gy2+=BLOCK_Y) {
        int j = gy2-radius;
#pragma unroll
        for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
            int i = gx2-radius;
            shrdMem[b*shrdLen+a] = (i<idim0 && j<idim1 ? in[j*idim0+i] : m_0);
        }
    }
    __syncthreads();

    // Only pixels at least `edge` away from the far border produce output.
    // The explicit idim > edge tests keep the unsigned subtraction from
    // wrapping around when the image is smaller than the border, which would
    // otherwise let the guard pass and write outside `out`.
    if (idim0 > edge && idim1 > edge &&
        gx < idim0 - edge && gy < idim1 - edge) {
        const unsigned idx = gy*idim0 + gx;
        const int   r = static_cast<int>(radius);
        const float c = m_0;
        float nM = 0.0f;
#pragma unroll
        for (int p=0; p<windLen; ++p) {
#pragma unroll
            for (int q=0; q<windLen; ++q) {
                // Offsets of the probed pixel relative to the nucleus.
                const int i = p - r;
                const int j = q - r;
                if (i*i + j*j < rSqrd) {
                    // Tile coordinates of the probed pixel: nucleus (lx+r, ly+r)
                    // shifted by (i, j).
                    const int a = lx + p;
                    const int b = ly + q;
                    const float m = shrdMem[b * shrdLen + a];
                    const float exp_pow = powf((m - c)/t, 6.0f);
                    nM += expf(-exp_pow);
                }
            }
        }
        out[idx] = nM < g ? g - nM : T(0);
    }
}
示例2: histogramKernel
// Histogram kernel: every thread bins up to THRD_LOAD strided input elements.
//
// Counts are accumulated with atomicAdd either into the per-block buffer
// obtained from SharedMemory<outType> (when nbins <= MAX_BINS) and merged into
// `out` at the end, or directly into `out` (when nbins > MAX_BINS).
//
// Parameters:
//   out, in        : output histogram / input data descriptors
//   len            : number of input elements handled per batch slice
//   nbins          : number of histogram bins
//   minval, maxval : value range mapped onto [0, nbins)
//   nBBS           : number of blocks along grid x per batch slice
//
// NOTE(review): inType, outType, isLinear, THRD_LOAD, MAX_BINS and minimum()
// are declared outside this excerpt.
static __global__
void histogramKernel(Param<outType> out, CParam<inType> in,
                     int len, int nbins, float minval, float maxval, int nBBS)
{
    SharedMemory<outType> shared;
    outType *blockBins = shared.getPointer();

    // Select the batch slice this block works on and offset both pointers.
    const unsigned batch = blockIdx.x / nBBS;
    const inType *src = in.ptr + batch * in.strides[2] + blockIdx.y * in.strides[3];
    outType *dst = out.ptr + batch * out.strides[2] + blockIdx.y * out.strides[3];

    // Element range covered by this block, visited with a blockDim.x stride.
    const int first = (blockIdx.x-batch*nBBS) * THRD_LOAD * blockDim.x + threadIdx.x;
    const int last  = minimum((first + THRD_LOAD * blockDim.x), len);
    const float binWidth = (maxval-minval) / (float)nbins;

    // Too many bins for the per-block buffer: count straight into `out`.
    const bool binsInBlock = !(nbins > MAX_BINS);

    // The per-block buffer only needs zeroing when it is actually used.
    if (binsInBlock) {
        for (int k = threadIdx.x; k < nbins; k += blockDim.x) {
            blockBins[k] = 0;
        }
        __syncthreads();
    }

    for (int pos = first; pos < last; pos += blockDim.x) {
        // Translate the flat position into a memory offset for strided input.
        int elem;
        if (isLinear) {
            elem = pos;
        } else {
            elem = ((pos % in.dims[0]) + (pos / in.dims[0])*in.strides[1]);
        }

        // Map the value to a bin and clamp it into [0, nbins-1].
        int bin = (int)((src[elem] - minval) / binWidth);
        if (bin < 0) {
            bin = 0;
        }
        if (bin >= nbins) {
            bin = nbins-1;
        }

        if (binsInBlock) {
            atomicAdd((blockBins + bin), 1);
        } else {
            atomicAdd((dst + bin), 1);
        }
    }

    // Counts already live in `out` when the per-block buffer was bypassed.
    // The flag depends only on kernel arguments, so the whole block leaves
    // together and no thread is stranded at the barrier below.
    if (!binsInBlock) {
        return;
    }

    __syncthreads();
    for (int k = threadIdx.x; k < nbins; k += blockDim.x) {
        atomicAdd((dst + k), blockBins[k]);
    }
}
示例3: morphKernel
// 2D morphology kernel (dilation when isDilation is true, erosion otherwise).
//
// The block stages its footprint plus a halo of windLen/2 pixels on every
// side into the buffer returned by SharedMemory<T>; each thread then folds
// the tile values under the positive entries of the structuring element
// (cFilter) with max() or min() and writes the result for pixel (gx, gy).
//
// Parameters:
//   out, in      : output / input image descriptors
//   nBBS0, nBBS1 : number of blocks along grid x / y per batch slice
//
// NOTE(review): T, isDilation, windLen, cFilter, load2ShrdMem, Binary and
// lIdx are declared outside this excerpt.
static __global__ void morphKernel(Param<T> out, CParam<T> in,
                                   int nBBS0, int nBBS1)
{
    SharedMemory<T> shared;
    T *tile = shared.getPointer();

    // Tile geometry.  The row stride carries one element more than the padded
    // block width (presumably to stagger rows across shared-memory banks —
    // TODO confirm).
    const int halo       = windLen/2;
    const int padding    = 2*halo;
    const int tileStride = blockDim.x + padding + 1;
    const int tileRows   = blockDim.y + padding;

    // Batch offsets along the third and fourth dimensions.
    const unsigned batch2 = blockIdx.x / nBBS0;
    const unsigned batch3 = blockIdx.y / nBBS1;
    const T *src = (const T *) in.ptr + (batch2 * in.strides[2] + batch3 * in.strides[3]);
    T *dst = (T * )out.ptr + (batch2 * out.strides[2] + batch3 * out.strides[3]);

    const int lx = threadIdx.x;
    const int ly = threadIdx.y;

    // Pixel owned by this thread within the current batch slice.
    const int gx = blockDim.x * (blockIdx.x-batch2*nBBS0) + lx;
    const int gy = blockDim.y * (blockIdx.y-batch3*nBBS1) + ly;

    // Cooperative load: threads stride over the tile by the block dimensions.
    for (int row = ly, srcY = gy; row < tileRows; row += blockDim.y, srcY += blockDim.y) {
        for (int col = lx, srcX = gx; col < tileStride; col += blockDim.x, srcX += blockDim.x) {
            load2ShrdMem<T, isDilation>(tile, src, col, row, tileStride,
                                        in.dims[0], in.dims[1],
                                        srcX-halo, srcY-halo, in.strides[1], in.strides[0]);
        }
    }
    __syncthreads();

    const T *mask = (const T *)cFilter;
    T acc = isDilation ? Binary<T, af_max_t>().init() : Binary<T, af_min_t>().init();

    // Window row wy / column wx maps to tile position (ly + wy, lx + wx),
    // i.e. the thread's own pixel at (ly + halo, lx + halo) shifted by
    // (wy - halo, wx - halo).
#pragma unroll
    for (int wy = 0; wy < windLen; ++wy) {
        const T *maskRow = mask + wy*windLen;
        const T *tileRow = tile + (ly + wy)*tileStride + lx;
#pragma unroll
        for (int wx = 0; wx < windLen; ++wx) {
            // Only positive structuring-element entries take part.
            if (!(maskRow[wx] > (T)0)) {
                continue;
            }
            T cur = tileRow[wx];
            if (isDilation) {
                acc = max(acc, cur);
            } else {
                acc = min(acc, cur);
            }
        }
    }

    // Threads whose pixel falls outside the image helped with the load but
    // have nothing to store.
    if (gx<in.dims[0] && gy<in.dims[1]) {
        const int outIdx = lIdx(gx, gy, out.strides[1], out.strides[0]);
        dst[outIdx] = acc;
    }
}
示例4: morph3DKernel
// 3D morphology kernel (dilation when isDilation is true, erosion otherwise).
//
// The block stages its 3D footprint plus a halo of windLen/2 voxels on every
// side into the buffer returned by SharedMemory<T>; each thread then folds
// the tile values under the non-zero entries of the structuring element
// (cFilter) with max() or min() and writes the result for voxel (gx, gy, gz).
//
// Parameters:
//   out, in : output / input volume descriptors
//   nBBS    : number of blocks along grid x per batch slice
//
// NOTE(review): T, isDilation, windLen, cFilter, load2ShrdVolume and Binary
// are declared outside this excerpt.
static __global__ void morph3DKernel(Param<T> out, CParam<T> in, int nBBS)
{
    SharedMemory<T> shared;
    T *tile = shared.getPointer();

    // Tile geometry.  The row stride carries one element more than the padded
    // block width (presumably to stagger rows across shared-memory banks —
    // TODO confirm).
    const int halo       = windLen/2;
    const int padding    = 2*halo;
    const int maskSlice  = windLen*windLen;
    const int tileStride = blockDim.x + padding + 1;
    const int tileRows   = blockDim.y + padding;
    const int tileDepth  = blockDim.z + padding;
    const int tileSlice  = tileStride * tileRows;

    // Batch offset along the fourth dimension.
    const unsigned batchId = blockIdx.x / nBBS;
    const T *src = (const T *) in.ptr + (batchId * in.strides[3]);
    T *dst = (T * )out.ptr + (batchId * out.strides[3]);

    const int lx = threadIdx.x;
    const int ly = threadIdx.y;
    const int lz = threadIdx.z;

    // Voxel owned by this thread within the current batch slice.
    const int gx = blockDim.x * (blockIdx.x-batchId*nBBS) + lx;
    const int gy = blockDim.y * blockIdx.y + ly;
    const int gz = blockDim.z * blockIdx.z + lz;

    // Cooperative load: threads stride over the tile by the block dimensions.
    for (int pz = lz, srcZ = gz; pz < tileDepth; pz += blockDim.z, srcZ += blockDim.z) {
        for (int py = ly, srcY = gy; py < tileRows; py += blockDim.y, srcY += blockDim.y) {
            for (int px = lx, srcX = gx; px < tileStride; px += blockDim.x, srcX += blockDim.x) {
                load2ShrdVolume<T, isDilation>(tile, src, px, py, pz, tileStride, tileSlice,
                                               in.dims[0], in.dims[1], in.dims[2],
                                               srcX-halo, srcY-halo, srcZ-halo,
                                               in.strides[2], in.strides[1], in.strides[0]);
            }
        }
    }
    __syncthreads();

    const T *mask = (const T *)cFilter;
    T acc = isDilation ? Binary<T, af_max_t>().init() : Binary<T, af_min_t>().init();

    // Window index (wz, wy, wx) maps to tile position (lz + wz, ly + wy,
    // lx + wx), i.e. the thread's own voxel at (l + halo) shifted by
    // (w - halo) along each axis.
#pragma unroll
    for (int wz = 0; wz < windLen; ++wz) {
        const T *maskPlane = mask + wz*maskSlice;
        const T *tilePlane = tile + (lz + wz)*tileSlice;
#pragma unroll
        for (int wy = 0; wy < windLen; ++wy) {
            const T *maskRow = maskPlane + wy*windLen;
            const T *tileRow = tilePlane + (ly + wy)*tileStride + lx;
#pragma unroll
            for (int wx = 0; wx < windLen; ++wx) {
                // Only non-zero structuring-element entries take part.
                if (!maskRow[wx]) {
                    continue;
                }
                T cur = tileRow[wx];
                if (isDilation) {
                    acc = max(acc, cur);
                } else {
                    acc = min(acc, cur);
                }
            }
        }
    }

    // Threads whose voxel falls outside the volume helped with the load but
    // have nothing to store.
    if (gx<in.dims[0] && gy<in.dims[1] && gz<in.dims[2]) {
        const int outIdx = gz * out.strides[2] +
                           gy * out.strides[1] +
                           gx * out.strides[0];
        dst[outIdx] = acc;
    }
}
示例5: meanshiftKernel
// Mean-shift filter kernel.  NOTE: this listing is an excerpt — the function
// body continues past the last line shown here, so only the visible part is
// documented.
//
// Visible behaviour: the block stages a padded tile of the input into the
// buffer returned by SharedMemory<T>, threads outside the image exit after the
// barrier, and each remaining thread runs up to `iter` iterations in which it
// scans a (2*radius+1)^2 window around its pixel and accumulates the colours
// and window offsets of all pixels whose squared colour distance to the
// current centre is at most `cvar`.
//
// Parameters (as used in the visible part):
//   out, in      : output / input image descriptors
//   space_       : not referenced in the visible part
//   radius       : half-width of the search window
//   cvar         : upper bound on the squared colour distance for a hit
//   iter         : maximum number of iterations
//   nBBS0, nBBS1 : number of blocks along grid x / y per batch slice
//
// NOTE(review): T, channels, load2ShrdMem and lIdx are declared outside this
// excerpt.
static __global__
void meanshiftKernel(Param<T> out, CParam<T> in,
float space_, int radius, float cvar,
uint iter, int nBBS0, int nBBS1)
{
SharedMemory<T> shared;
T * shrdMem = shared.getPointer();
// calculate necessary offset and window parameters
const int padding = 2*radius + 1;
const int shrdLen = blockDim.x + padding;
// element distance between consecutive channel planes inside the tile
const int schStride = shrdLen*(blockDim.y + padding);
// the variable ichStride only takes effect when there is more than one
// channel; in the other cases, the expression in question
// will not use the variable
const int ichStride = in.strides[2];
// gfor batch offsets
unsigned b2 = blockIdx.x / nBBS0;
unsigned b3 = blockIdx.y / nBBS1;
const T* iptr = (const T *) in.ptr + (b2 * in.strides[2] + b3 * in.strides[3]);
T* optr = (T * )out.ptr + (b2 * out.strides[2] + b3 * out.strides[3]);
const int lx = threadIdx.x;
const int ly = threadIdx.y;
// pixel owned by this thread within the current batch slice
const int gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
const int gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
// pull image to local memory
// NOTE(review): both loops are bounded by shrdLen (derived from blockDim.x);
// this matches the schStride row count only when blockDim.x == blockDim.y —
// confirm against the launch configuration
for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
// step along the columns of the tile by the block width
for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
load2ShrdMem<T, channels>(shrdMem, iptr, a, b, shrdLen, schStride,
in.dims[0], in.dims[1], gx2-radius, gy2-radius, ichStride,
in.strides[1], in.strides[0]);
}
}
// tile coordinates of the pixel owned by this thread
int i = lx + radius;
int j = ly + radius;
__syncthreads();
// threads outside the image only helped with the tile load
if (gx>=in.dims[0] || gy>=in.dims[1])
return;
float means[channels];
float centers[channels];
float tmpclrs[channels];
// clear means and load the centre colour of this pixel from the tile
#pragma unroll
for(int ch=0; ch<channels; ++ch) {
means[ch] = 0.0f;
centers[ch] = shrdMem[lIdx(i, j, shrdLen, 1)+ch*schStride];
}
// start of the mean-shift iterations
for(uint it=0; it<iter; ++it) {
// per-iteration totals: number of hits and summed window offsets
int count = 0;
int shift_x = 0;
int shift_y = 0;
for(int wj=-radius; wj<=radius; ++wj) {
// hits found in the current window row
int hit_count = 0;
for(int wi=-radius; wi<=radius; ++wi) {
int tj = j + wj;
int ti = i + wi;
// squared colour distance between the window pixel and the centre
float norm = 0.0f;
#pragma unroll
for(int ch=0; ch<channels; ++ch) {
tmpclrs[ch] = shrdMem[lIdx(ti, tj, shrdLen, 1)+ch*schStride];
norm += (centers[ch]-tmpclrs[ch]) * (centers[ch]-tmpclrs[ch]);
}
// a hit: accumulate the colour and the horizontal offset
if (norm<= cvar) {
#pragma unroll
for(int ch=0; ch<channels; ++ch)
means[ch] += tmpclrs[ch];
shift_x += wi;
++hit_count;
}
}
count+= hit_count;
// every hit in this row shares the same vertical offset wj
shift_y += wj*hit_count;
}
// no pixel within the colour bound: stop iterating
if (count==0) {
break;
}
const float fcount = 1.f/count;
// average horizontal offset of the hits; +0.5f before the int truncation
const int mean_x = (int)(shift_x*fcount+0.5f);
//.........这里部分代码省略.........
示例6: bilateralKernel
// Bilateral filter kernel.
//
// The buffer returned by SharedMemory<outType> holds two regions: the padded
// image tile at offset 0 and, starting at `gaussOff`, a window_size x
// window_size table of spatial weights exp(-(x^2 + y^2) / (2*sigma_space^2)).
// Each in-image thread combines the spatial weight with a range weight
// (gaussian1d of the colour difference) over its window and writes the
// normalised weighted sum.
//
// Parameters:
//   out, in      : output / input image descriptors
//   sigma_space  : spatial standard deviation; the window radius is
//                  max((int)(1.5 * sigma_space), 1)
//   sigma_color  : range (colour) standard deviation
//   gaussOff     : element offset of the spatial-weight table in the buffer
//   nBBS0, nBBS1 : number of blocks along grid x / y per batch slice
//
// NOTE(review): inType, outType, THREADS_X, THREADS_Y, load2ShrdMem and
// gaussian1d are declared outside this excerpt.  The buffer is assumed to
// provide window_size*window_size elements from gaussOff onwards — the filter
// loop below already reads that many — confirm against the host wrapper.
static __global__
void bilateralKernel(Param<outType> out, CParam<inType> in,
                     float sigma_space, float sigma_color,
                     int gaussOff, int nBBS0, int nBBS1)
{
    SharedMemory<outType> shared;
    outType *localMem = shared.getPointer();
    outType *gauss2d  = localMem + gaussOff;

    const int radius      = max((int)(sigma_space * 1.5f), 1);
    const int padding     = 2 * radius;
    const int window_size = padding + 1;
    const int shrdLen     = THREADS_X + padding;
    const float variance_range = sigma_color * sigma_color;
    const float variance_space = sigma_space * sigma_space;

    // gfor batch offsets
    const unsigned b2 = blockIdx.x / nBBS0;
    const unsigned b3 = blockIdx.y / nBBS1;
    const inType* iptr = (const inType *) in.ptr + (b2 * in.strides[2] + b3 * in.strides[3] );
    outType* optr = (outType * )out.ptr + (b2 * out.strides[2] + b3 * out.strides[3]);

    const int lx = threadIdx.x;
    const int ly = threadIdx.y;
    const int gx = THREADS_X * (blockIdx.x-b2*nBBS0) + lx;
    const int gy = THREADS_Y * (blockIdx.y-b3*nBBS1) + ly;

    // Generate the spatial-weight table for the block.  A block-stride loop is
    // used so the whole table is written even when window_size exceeds the
    // block dimensions; with one entry per thread the remaining entries would
    // stay uninitialised yet still be read by the filter loop below.
    for (int wy = ly; wy < window_size; wy += blockDim.y) {
        const int y = wy - radius;
        for (int wx = lx; wx < window_size; wx += blockDim.x) {
            const int x = wx - radius;
            gauss2d[wy*window_size+wx] = exp( ((x*x) + (y*y)) / (-2.f * variance_space));
        }
    }

    // pull image to local memory; threads stride over the padded tile
    for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
        for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
            load2ShrdMem<inType, outType>(localMem, iptr, a, b, shrdLen, in.dims[0], in.dims[1],
                                          gx2-radius, gy2-radius, in.strides[1], in.strides[0]);
        }
    }
    __syncthreads();

    if (gx<in.dims[0] && gy<in.dims[1]) {
        // Tile coordinates of the pixel owned by this thread.
        const int cx = lx + radius;
        const int cy = ly + radius;
        const outType center_color = localMem[cy*shrdLen+cx];
        outType res  = 0;
        outType norm = 0;
#pragma unroll
        for(int wj=0; wj<window_size; ++wj) {
            // First tile element / first table entry of window row wj.
            const int joff = (cy+wj-radius)*shrdLen + (cx-radius);
            const int goff = wj*window_size;
#pragma unroll
            for(int wi=0; wi<window_size; ++wi) {
                const outType tmp_color   = localMem[joff+wi];
                const outType gauss_range = gaussian1d(center_color - tmp_color, variance_range);
                const outType weight      = gauss2d[goff+wi] * gauss_range;
                norm += weight;
                res  += tmp_color * weight;
            }
        }
        // NOTE(review): the store indexes x with an implicit stride of 1
        // (out.strides[0] is not used) — confirm outputs are always
        // contiguous along the first dimension.
        optr[gy*out.strides[1]+gx] = res / norm;
    }
}