This article collects typical usage examples of the Python function pygpu.gpuarray.dtype_to_ctype. If you have been wondering what dtype_to_ctype does, how to call it, and what real code using it looks like, the curated examples below should help.
Fifteen code examples of the dtype_to_ctype function are shown below, sorted by popularity by default.
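Before the examples, a minimal sketch (assuming pygpu is installed) of what the function does: dtype_to_ctype maps a NumPy dtype or dtype name to the name of the matching C type, which the examples below splice into GPU kernel source. The values in the comments are typical mappings, not guarantees.
# Minimal sketch: dtype_to_ctype turns a dtype into a C type name suitable
# for insertion into kernel source code.
import numpy
from pygpu import gpuarray
print(gpuarray.dtype_to_ctype('float32'))             # typically 'float'
print(gpuarray.dtype_to_ctype('float64'))             # typically 'double'
print(gpuarray.dtype_to_ctype(numpy.dtype('int32')))  # an integer C type name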
Example 1: inline_softmax
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
"""
Generate code for a softmax.
On entry, `buf` and `buf2` must contain two identical copies of
the input to the softmax.
When the generated code finishes, `buf` contains the softmax and
`buf2` contains the un-normalized softmax.
Parameters
----------
N
Length of the buffer.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
dtype
Dtype of the softmax's output.
Notes
-----
`buf` and `buf2` should be in GPU shared memory, since they are
accessed many times.
The generated code uses `__i` as an int loop variable.
"""
ctype = gpuarray.dtype_to_ctype(dtype)
# get max of buf (trashing all but buf[0])
return [inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}',
'__syncthreads()',
]
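The function above returns a list of C statements rather than a finished string; the caller is expected to join them, adding the trailing semicolons, when assembling the kernel body. A hypothetical sketch of that caller-side pattern follows; it assumes inline_softmax and its reduce helpers (inline_reduce_max, inline_reduce_sum) are importable from the same module, and the string arguments name C-level variables in the surrounding kernel, not Python values.
# Hypothetical caller-side use of inline_softmax: each list entry is one C
# statement, so the caller joins them with ';' to build the kernel body.
stmts = inline_softmax("N", "buf", "buf2", "threadIdx.x", "blockDim.x",
                       dtype="float32")
kernel_body = ";\n".join(stmts) + ";\n"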
Example 2: gpu_kernels
def gpu_kernels(self, node, name):
dt = node.inputs[0].dtype
code = """
KERNEL void doublek(GLOBAL_MEM %(ctype)s *out,
GLOBAL_MEM const %(ctype)s *a,
ga_size n) {
for (ga_size i = LID_0; i < n; i += LDIM_0) {
out[i] = 2 * a[i];
}
}
""" % dict(ctype=gpuarray.dtype_to_ctype(dt))
return [Kernel(code=code, name="doublek",
params=[gpuarray.GpuArray,
gpuarray.GpuArray,
gpuarray.SIZE],
flags=Kernel.get_flags(dt))]
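The %(ctype)s placeholder is what lets one Python template serve several element types: substituting a different result of dtype_to_ctype yields a float or a double kernel from the same string. A standalone, hypothetical illustration of that specialization step (only the dtype_to_ctype call is a real pygpu API; the kernel text simply reuses the template above):
# Hypothetical specialization of one kernel template for two dtypes.
from pygpu import gpuarray
template = """
KERNEL void doublek(GLOBAL_MEM %(ctype)s *out,
                    GLOBAL_MEM const %(ctype)s *a,
                    ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    out[i] = 2 * a[i];
  }
}
"""
for dt in ('float32', 'float64'):
    print(template % dict(ctype=gpuarray.dtype_to_ctype(dt)))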
Example 3: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
work_x = work_dtype(dtype_x)
work_b = work_dtype(dtype_b)
load_x = load_w(dtype_x)
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(dtype_b)
work_x = gpuarray.dtype_to_ctype(work_x)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename
f = '' if dtype_x == 'float64' else 'f'
sio = StringIO()
print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
{
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){
const %(type_x)s* x = x_data + xs0 * row;
%(type_x)s* sm = sm_data + sms0 * row;
extern LOCAL_MEM %(work_x)s per_thread_values[];
LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
LOCAL_MEM int row_max_threadIdx;
%(work_x)s per_thread_row_max, per_thread_sum;
int per_thread_row_max_j;
// COMPUTE ROW MAX AND ARGMAX
// compute separate per-thread maximums and argmaxes
per_thread_row_max = NAN;
per_thread_row_max_j = 0;
for (int j = threadIdx.x; j < N; j += blockDim.x)
{
%(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
}
per_thread_values[threadIdx.x] = per_thread_row_max;
local_barrier();
if (threadIdx.x == 0) {
row_max = NAN;
row_max_threadIdx = 0;
for (int j = 0; j < blockDim.x; j++)
{
%(work_x)s per_thread_max = per_thread_values[j];
row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
row_max = fmax%(f)s(per_thread_max, row_max);
}
}
local_barrier();
// The thread with the highest max writes out which of its
// values was the winner.
if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
// COMPUTE SOFTMAX
per_thread_sum = 0.0;
for (int j = threadIdx.x; j < N; j += blockDim.x)
{
%(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
%(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
per_thread_sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
}
per_thread_values[threadIdx.x] = per_thread_sum;
local_barrier();
#......... part of the code omitted here .........
Example 4: c_code
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(numpy.intp)
dnll, sm, y_idx = inp
dx, = out
fail = sub['fail']
ctx = sub['params']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(dx)s->ga);
%(err_check)s
""" % locals()
return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element.
const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2)
|| (PyGpuArray_NDIM(%(y_idx)s) != 1))
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if (%(dnll)s_dims0 !=
PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
{
PyErr_Format(PyExc_ValueError,
"dnll.shape[0] == %%i, but sm.shape[0] == %%i",
%(dnll)s_dims0,
PyGpuArray_DIMS(%(sm)s)[0]);
%(fail)s;
}
if (%(dnll)s_dims0 !=
PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
{
PyErr_SetString(PyExc_ValueError,
"dnll.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if (PyGpuArray_DIMS(%(sm)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"sm.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if ((NULL == %(dx)s)
|| (PyGpuArray_DIMS(%(dx)s)[0] !=
PyGpuArray_DIMS(%(sm)s)[0])
|| (PyGpuArray_DIMS(%(dx)s)[1] !=
PyGpuArray_DIMS(%(sm)s)[1]))
{
Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode_dx)s, GA_C_ORDER,
%(ctx)s, Py_None);
if (!%(dx)s) {
%(fail)s
}
}
{
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(dx)s)[0],
(void *)&PyGpuArray_DIMS(%(dx)s)[1],
#......... part of the code omitted here .........
Example 5: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype)
load_b = load_w(node.inputs[1].dtype)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(dtype_b)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_acc = gpuarray.dtype_to_ctype(work_sm)
ctype = gpuarray.dtype_to_ctype(work_sm)
params = [
gpuarray.SIZE, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
]
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);
LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
buf[tx] += %(load_b)s(b[tx * sb0]);
buf2[tx] = buf[tx];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
%(ctype)s row_max = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
buf[__i] = exp(buf2[__i] - row_max);
buf2[__i] = buf[__i];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
%(ctype)s row_sum = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
buf[__i] = buf2[__i] / row_sum;
}
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
}
local_barrier();
}
}
""" % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
#......... part of the code omitted here .........
Example 6: gpu_kernels
def gpu_kernels(self, node, nodename):
# load kernel source
device_type = node.inputs[0].type.context.kind
kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type]
common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type]
# prepare "$" macros
if device_type == b'cuda':
ndim = node.inputs[0].ndim
dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i for i in range(ndim))
dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i for i in range(ndim))
src_strides_code = ''.join('ssize_t src_strides_%d, ' % i for i in range(ndim))
set_slice_code = '''
gidx = gid %% dims_%(i)d;
gid /= dims_%(i)d;
{dstv};
{dsti};
src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '',
dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
set_slice_code = ''.join(
set_slice_code % dict(i=j) for j in range(1, ndim))
flags = Kernel.get_flags(node.inputs[0].dtype)
subs = dict(
inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
out_t=ga.dtype_to_ctype(self.idx_dtype),
dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
dstv='INPUT_TYPE *dstv,' if self.return_values else '',
dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
dstv_strides=dstv_strides_code if self.return_values else '',
dsti_strides=dsti_strides_code if self.return_indices else '',
src_strides=src_strides_code,
set_slice=set_slice_code,
write_value=int(self.return_values),
write_index=int(self.return_indices),
ndim=str(ndim),
use_half=int(node.inputs[0].dtype == 'float16')
)
elif device_type == b'opencl':
raise NotImplementedError()
# setup parameters
param_types = [ga.SIZE] * (ndim - 1) # dims
for _ in range(self.return_values + self.return_indices):
param_types.append(ga.GpuArray) # dst*
param_types.extend([ga.SSIZE] * ndim) # dst*_strides
param_types.append(ga.SIZE) # k
param_types.append(ga.GpuArray) # src
param_types.extend([ga.SSIZE] * ndim) # src_strides
param_types.append(ga.SIZE) # size
# load and compile kernels
with open(os.path.join(
os.path.dirname(__file__), 'c_code', 'topk_common' + common_ext
)) as f:
common_src = f.read()
kernels = []
def build_kernel(fname, kname, subs):
with open(os.path.join(
os.path.dirname(__file__), 'c_code', fname)
) as f:
kernel_src = f.read()
ker = Kernel(
code=Template(common_src + kernel_src).substitute(**subs),
name=kname,
params=param_types,
flags=flags,
objvar=kname + nodename)
return ker
subs['count_t'] = 'int'
kernels.append(
build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs))
subs['kname'] = 'k_topk_dense_large'
kernels.append(
build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs))
subs['count_t'] = 'long long'
subs['kname'] = 'k_topk_dense_xlarge'
kernels.append(
build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs))
return kernels
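This example substitutes dtype_to_ctype results into kernel sources loaded from disk via string.Template rather than %-formatting. A minimal, self-contained sketch of that pattern follows; the kernel text and names are made up for illustration, only string.Template and gpuarray.dtype_to_ctype are real APIs.
# Hypothetical illustration of the Template + dtype_to_ctype pattern.
from string import Template
from pygpu import gpuarray as ga
kernel_src = Template("""
KERNEL void k_copy(GLOBAL_MEM ${inp_t} *dst,
                   GLOBAL_MEM const ${inp_t} *src,
                   ga_size n) {
    for (ga_size i = LID_0; i < n; i += LDIM_0)
        dst[i] = src[i];
}
""")
code = kernel_src.substitute(inp_t=ga.dtype_to_ctype('float32'))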
Example 7: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
work_x = work_dtype(dtype_x)
work_b = work_dtype(dtype_b)
load_x = load_w(dtype_x)
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
type_x = gpuarray.dtype_to_ctype(work_x)
type_b = gpuarray.dtype_to_ctype(work_b)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
{
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){
const %(type_x)s* x = x_data + xs0 * row;
const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
%(type_x)s* sm = sm_data + sms0 * row;
%(type_x)s sum = 0.0;
int row_max_j = 0;
%(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j)
{
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
}
//compute the exp
for (int j = 0; j < N; ++j)
{
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
%(type_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
}
%(type_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
%(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp);
}
if ((y_idx >= N) || (y_idx < 0))
{
//TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least...
}
else
{
nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
- %(load_b)s(b[y_idx*bs0])
+ row_max
+ log(sum));
}
am_data[row*ams0] = row_max_j;
}
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
Example 8: c_code
#......... part of the code omitted here .........
%(fail)s;
}
}
""" % locals()
emitted_inames[iname] = True
#check that all outputs have valid dimensions
for idx, oname in enumerate(outputs):
typecode = dtype_to_typecode(node.outputs[idx].dtype)
if idx not in self.inplace_pattern.keys():
code += """
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
{
Py_DECREF(%(oname)s);
%(oname)s = NULL;
}
}
if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
{
Py_XDECREF(%(oname)s);
%(oname)s = NULL;
}
if (NULL == %(oname)s)
{
%(oname)s = pygpu_empty(%(nd)d, dims,
%(typecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(oname)s) {
//TODO, this check doesn't seem good.
//TODO, set exception?
%(fail)s
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals()
else:
input_idx = self.inplace_pattern[idx]
iname = inputs[input_idx]
code += """
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_INCREF(%(oname)s);
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
{
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Output dimension mis-match. Output"
" %(idx)d (indices start at 0), working inplace"
" on input %(input_idx)s, has shape[%%i] == %%i"
", but the output's size on that axis is %%i.",
i,
PyGpuArray_DIMS(%(oname)s)[i],
dims[i]
);
Py_DECREF(%(oname)s);
%(oname)s = NULL;
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals()
z = outputs[0]
code += """numEls = PyGpuArray_SIZE(%(z)s);
//first use at least a full warp
threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE
//next start adding multiprocessors
// UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
n_blocks = std::min(numEls/threads_per_block +
(numEls %% threads_per_block?1:0),
(size_t)30);
// next start adding more warps per multiprocessor
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
//std::cerr << "calling callkernel returned\\n";
""" % locals()
code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
param = []
for i in range(nd):
param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals())
for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
code += ',\n'.join(param) + ");\n"
if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
return str(code)
Example 9: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
/*
* This is an atomicAdd that works for doubles since that is not provided
* natively by cuda before arch 6.0.
*/
#if __CUDA_ARCH__ < 600
__device__ ga_double atomicAdd(ga_double* address, ga_double val) {
ga_ulong *address_as_ull = (ga_ulong *)address;
ga_ulong old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
__device__ ga_double atomicExch(ga_double *address, ga_double val) {
return atomicExch((ga_ulong *)address,
__double_as_longlong(val));
}
/* GA_LONG */
__device__ ga_long atomicAdd(ga_long* address, ga_long val) {
ga_ulong *address_as_ull = (ga_ulong *)address;
ga_ulong old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
(ga_ulong)(val + (ga_long)assumed));
} while (assumed != old);
return (ga_long)old;
}
__device__ ga_long atomicExch(ga_long *address, ga_long val) {
return (ga_long)atomicExch((ga_ulong *)address, (ga_ulong)val);
}
/* GA_HALF */
/*
* This may read and write 2 bytes more than the size of the array
* if the array has an uneven number of elements. The actual value
* at that spot will not be modified.
*/
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, new_;
old = *base;
do {
assumed = old;
new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_ssize stridesX0,
const ga_ssize stridesX1,
%(type_x)s *X,
const ga_size offset_X,
const ga_size numRowsY,
const ga_size numColsY,
const ga_ssize stridesY0,
const ga_ssize stridesY1,
%(type_y)s *Y,
#......... part of the code omitted here .........
Example 10: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
// a version that uses less registers but doesn't work in all cases.
KERNEL void %(kname)s(
const ga_int nb_batch,
const ga_int nb_stack,
const ga_int height,
const ga_int width,
const ga_int c,
const ga_int d,
const ga_int step_x,
const ga_int step_y,
const ga_int grid_c,
const ga_int grid_d,
const ga_size stride0, const ga_size stride1,
const ga_size stride2, const ga_size stride3,
GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
const ga_size out_s0, const ga_size out_s1,
GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
)
{
const ga_int wrap_centered_idx_shift_x = c/2;
const ga_int wrap_centered_idx_shift_y = d/2;
global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);
for(ga_int tblock = GID_0*LDIM_2+LID_2;
tblock<nb_batch*nb_stack*grid_c*grid_d;
tblock+=GDIM_0*LDIM_2){
const ga_int b = tblock%%grid_d;
ga_int left = tblock/grid_d;
const ga_int a = left%%grid_c;
left = left/grid_c;
const ga_int s = left%%nb_stack;
left = left/nb_stack;
const ga_int n = left;
if(n>nb_batch)continue;
if(s>nb_stack)continue;
if(a>grid_c)continue;
if(b>grid_d)continue;
ga_int z_row = b + grid_d*(a + grid_c*
(s + nb_stack*n));
ga_int i = LID_1; // loop over c
{
ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
}
ga_int j = LID_0; // loop over d
{
ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
}
ga_int ten4_idx = stride3*ten4_3 +
stride2*ten4_2 +
stride1*s + stride0*n;
ga_int z_col = j + d * i;
ga_int z_idx = z_col * out_s1 +
z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
}
}""" % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
KERNEL void %(kname)s(
#......... part of the code omitted here .........
Example 11: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
work_x = work_dtype(dtype_x)
work_b = work_dtype(dtype_b)
load_x = load_w(dtype_x)
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(dtype_b)
work_x = gpuarray.dtype_to_ctype(work_x)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename
if node.inputs[0].type.context.kind != b'cuda':
f = ''
else:
f = '' if dtype_x == 'float64' else 'f'
params = [
gpuarray.SIZE, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
]
sio = StringIO()
print("""#include "cluda.h"
KERNEL void %(kname)s(const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
{
x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);
for (ga_int row = GID_0; row < M; row += GDIM_0){
GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
LOCAL_MEM ga_int row_max_threadIdx;
%(work_x)s per_thread_row_max, per_thread_sum;
ga_int per_thread_row_max_j;
// COMPUTE ROW MAX AND ARGMAX
// compute separate per-thread maximums and argmaxes
per_thread_row_max = NAN;
per_thread_row_max_j = 0;
for (ga_int j = LID_0; j < N; j += LDIM_0)
{
%(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
}
per_thread_values[LID_0] = per_thread_row_max;
local_barrier();
if (LID_0 == 0) {
row_max = NAN;
row_max_threadIdx = 0;
for (ga_int j = 0; j < LDIM_0; j++)
{
%(work_x)s per_thread_max = per_thread_values[j];
row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
row_max = fmax%(f)s(per_thread_max, row_max);
}
}
local_barrier();
// The thread with the highest max writes out which of its
// values was the winner.
if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
// COMPUTE SOFTMAX
per_thread_sum = 0.0;
for (ga_int j = LID_0; j < N; j += LDIM_0)
{
%(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
%(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
per_thread_sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
}
per_thread_values[LID_0] = per_thread_sum;
local_barrier();
if (LID_0 == 0) {
sum = 0.0;
for (ga_int j = 0; j < LDIM_0; j++) {
sum += per_thread_values[j];
}
sum_inv = 1.0 / sum;
}
local_barrier();
#......... part of the code omitted here .........
Example 12: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """#include "cluda.h"
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_ssize stridesX0,
const ga_ssize stridesX1,
GLOBAL_MEM %(type_x)s *X,
const ga_size offset_X,
const ga_size numRowsY,
const ga_size numColsY,
const ga_ssize stridesY0,
const ga_ssize stridesY1,
GLOBAL_MEM %(type_y)s *Y,
const ga_size offset_Y,
const ga_size numIndices,
const ga_ssize stridesIndices,
GLOBAL_MEM %(type_ind)s *indices_arr,
const ga_size offset_indices_arr,
const ga_int set_instead_of_inc,
GLOBAL_MEM ga_int *err)
{
X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);
for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
{
for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
{
ga_ssize x_row = indices_arr[i * stridesIndices];
if (x_row < 0)
x_row += numRowsX;
ga_ssize y_row = i;
if (x_row < numRowsX && x_row >= 0) {
if (set_instead_of_inc) {
atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
}
} else {
*err = 1;
}
}
}
return;
}
""" % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
tc=np.dtype(dtype_x).char)
from pygpu.gpuarray import SIZE, SSIZE
params = [
SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32',
gpuarray.GpuArray]
return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)]
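The %(tc)s substitution above comes from NumPy's one-character type codes (np.dtype(...).char), which select the right atom_add_*/atom_xchg_* variant in the kernel. A quick, hedged check of that mapping; the float codes are standard, while the integer codes can vary by platform.
# The one-character codes feeding the atom_add_%(tc)sg / atom_xchg_%(tc)sg names.
import numpy as np
for dt in ('float32', 'float64', 'int32', 'int64'):
    # float32 -> 'f', float64 -> 'd'; integer codes depend on the platform
    print(dt, np.dtype(dt).char)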
Example 13: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = (
"""
// a version that uses less registers but doesn't work in all cases.
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
const int height,
const int width,
const int c,
const int d,
const int step_x,
const int step_y,
const int grid_c,
const int grid_d,
const size_t stride0, const size_t stride1,
const size_t stride2, const size_t stride3,
const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const size_t out_s0, const size_t out_s1,
%(type_z)s * global_out, const size_t offset_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d;
int left = tblock/grid_d;
const int a = left%%grid_c;
left = left/grid_c;
const int s = left%%nb_stack;
left = left/nb_stack;
const int n = left;
if(n>nb_batch)continue;
if(s>nb_stack)continue;
if(a>grid_c)continue;
if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*
(s + nb_stack*n));
int i = threadIdx.y; // loop over c
{
int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
}
int j = threadIdx.x; // loop over d
{
int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
}
int ten4_idx = stride3*ten4_3 +
stride2*ten4_2 +
stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col * out_s1 +
z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
}
}"""
% locals()
)
params = [
"intc",
"intc",
"intc",
"intc",
"intc",
"intc",
"intc",
"intc",
"intc",
"intc",
"uintp",
#......... part of the code omitted here .........
Example 14: inline_reduce_fixed_shared
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init,
b='', stride_b='', load_b='', dtype='float32'):
"""
Return C++ code for a function that reduces a contiguous buffer.
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
Parameters
----------
N
Length of the buffer.
buf
Buffer pointer of size warpSize * sizeof(dtype).
x
Input data.
stride_x
Input data stride.
load_x
Wrapper to read from x.
pos
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
A function that accepts two string arguments, a and b, and returns
C code for their reduction; for a sum reduction it should
return "%(a)s + %(b)s".
manner_init
A function that accepts a string argument a and returns C code
for its initialization.
Notes
-----
`buf` should be in GPU shared memory, since it is accessed many times.
"""
if b:
init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
" %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals())
loop_line = manner_fn("red",
manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
"%(load_b)s(%(b)s[i * %(stride_b)s])" %
locals()))
else:
init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals())
loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s;
}
buf[%(pos)s] = red;
__syncthreads();
if (%(pos)s < warpSize)
{
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line2)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the reduction of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
#......... part of the code omitted here .........
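To make the manner_fn / manner_init contract from the docstring concrete, here is a hypothetical pair of callables for a sum reduction, together with a commented call matching the signature above; the names are illustrative and not part of the Theano API.
# Hypothetical manner_fn / manner_init for a sum reduction: both take and
# return C code fragments as Python strings.
def sum_fn(a, b):
    return "%s + %s" % (a, b)
def sum_init(a):
    # a sum reduction starts from the first loaded value unchanged
    return a
# Example call (the string arguments name C-level variables in the kernel;
# an empty load_x wrapper means the value is read without conversion):
# inline_reduce_fixed_shared('N', 'buf', 'x', 'xs0', '', 'threadIdx.x',
#                            'blockDim.x', sum_fn, sum_init, dtype='float32')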
Example 15: gpu_kernels
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
/*
* This is an atomicAdd that works for doubles since that is not provided
* natively by cuda.
*/
__device__ double atomicAdd(ga_double* address, ga_double val) {
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
/*
* This is a version of atomicAdd that works for half-floats. It may
* read and write 2 bytes more than the size of the array if the array
* has an uneven number of elements. The actual value at that spot
* will not be modified.
*/
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_ssize stridesX0,
const ga_ssize stridesX1,
%(type_x)s *X,
const ga_size offset_X,
const ga_size numRowsY,
const ga_size numColsY,
const ga_ssize stridesY0,
const ga_ssize stridesY1,
%(type_y)s *Y,
const ga_size offset_Y,
const ga_size numIndices,
const ga_ssize stridesIndices,
%(type_ind)s *indices_arr,
const ga_size offset_indices_arr,
ga_int *err)
{
X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y);
indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{
for (int j = (threadIdx.x); j < numColsX; j += blockDim.x)
{
ga_ssize x_row = indices_arr[i * stridesIndices];
if (x_row < 0)
x_row += numRowsX;
ga_ssize y_row = i;
if (x_row < numRowsX && x_row >= 0) {
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
*err = 1;
}
}
}
return;
}
""" % locals()
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
#......... part of the code omitted here .........