当前位置: 首页>>代码示例>>Python>>正文


Python gpuarray.dtype_to_ctype函数代码示例

本文整理汇总了Python中pygpu.gpuarray.dtype_to_ctype函数的典型用法代码示例。如果您正苦于以下问题:Python dtype_to_ctype函数的具体用法?Python dtype_to_ctype怎么用?Python dtype_to_ctype使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了dtype_to_ctype函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: inline_softmax

def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
    Generate code for a softmax.

    On entry, `buf` and `buf2` must contain two identical copies of
    the input to softmax.

    After the code returns `buf` contains the softmax, `buf2` contains
    un-normalized softmax.

    Parameters
    ----------
    N
        Length of the buffer.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    dtype
        Dtype of the softmax's output.

    Notes
    -----
    `buf` and `buf2` should be in gpu shared memory, we access it many
    times.

    We use __i as an int variable in a loop.

    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    # get max of buf (trashing all but buf[0])
    return [inline_reduce_max(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_max = ' + buf + '[0]') % ctype,
            '__syncthreads()',
            'for(int __i=' + threadPos + '; __i<' + N +
            '; __i+=' + threadCount + '){',
            buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
            buf2 + '[__i] = ' + buf + '[__i]',
            '}',
            '__syncthreads()',
            inline_reduce_sum(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_sum = ' + buf + '[0]') % ctype,
            '__syncthreads()',
            # divide each exp() result by the sum to complete the job.
            'for(int __i=' + threadPos + '; __i<' + N +
            '; __i+=' + threadCount + '){',
            buf + '[__i] = ' + buf2 + '[__i] / row_sum',
            '}',
            '__syncthreads()',
            ]
开发者ID:12190143,项目名称:Theano,代码行数:52,代码来源:kernel_codegen.py

示例2: gpu_kernels

    def gpu_kernels(self, node, name):
        dt = node.inputs[0].type
        code = """
KERNEL void doublek(GLOBAL_MEM %(ctype) *out,
                   GLOBAL_MEM const %(ctype)s *a,
                   ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    out[i] = 2 * a[i];
  }
}
""" % dict(ctype=gpuarray.dtype_to_ctype(dt))
        return [Kernel(code=code, name="doublek",
                       params=[gpuarray.GpuArray,
                               gpuarray.GpuArray,
                               gpuarray.SIZE],
                       flags=Kernel.get_flags(dt))]
开发者ID:abergeron,项目名称:ccw_tutorial_theano,代码行数:16,代码来源:doublegpu.py

示例3: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        f = '' if dtype_x == 'float64' else 'f'
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            const %(type_x)s* x_data, const ga_size offset_x,
            const ga_ssize xs0, const ga_ssize xs1,
            const %(type_b)s* b, const ga_size offset_b,
            const ga_ssize bs0,
            const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
            const ga_ssize y_idxs0,
            %(type_x)s* nll_data, const ga_size offset_nll,
            const ga_ssize nlls0,
            %(type_x)s* sm_data, const ga_size offset_sm,
            const ga_ssize sms0, const ga_ssize sms1,
            %(type_y_idx)s* am_data, const ga_size offset_am,
            const ga_ssize ams0)
        {
          x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
          b = (const %(type_b)s *)(((char *)b)+offset_b);
          y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
          nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
          sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
          am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);

          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const %(type_x)s* x = x_data + xs0 * row;
            %(type_x)s* sm = sm_data + sms0 * row;

            extern LOCAL_MEM %(work_x)s per_thread_values[];
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM int row_max_threadIdx;

            %(work_x)s per_thread_row_max, per_thread_sum;
            int per_thread_row_max_j;

            // COMPUTE ROW MAX AND ARGMAX

            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;

            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[threadIdx.x] = per_thread_row_max;

            local_barrier();

            if (threadIdx.x == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (int j = 0; j < blockDim.x; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }

            local_barrier();

            // The thread with the higest max writes out which of its
            // values was the winner.
            if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;

            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }

            per_thread_values[threadIdx.x] = per_thread_sum;

            local_barrier();

#.........这里部分代码省略.........
开发者ID:Abioy,项目名称:Theano,代码行数:101,代码来源:nnet.py

示例4: c_code

    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != 'cuda':
            raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
        itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        type_intp = gpuarray.dtype_to_ctype(numpy.intp)
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
        ctx = sub['params']
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()
        sync = ""
        if config.gpuarray.sync:
            sync = """
            err = GpuArray_sync(&%(z)s->ga);
            %(err_check)s
            """ % locals()
        return """
        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
        const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
                                        PyGpuArray_DIMS(%(dnll)s)[0] :
                                        (ssize_t) 0);

        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
        // or a vector with just one element.
        const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
                                           PyGpuArray_STRIDES(%(dnll)s)[0] :
                                           (ssize_t) 0);

        if ((PyGpuArray_NDIM(%(dnll)s) > 1)
            || (PyGpuArray_NDIM(%(sm)s) != 2)
            || (PyGpuArray_NDIM(%(y_idx)s) != 1))
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
                         %(dnll)s_dims0,
                         PyGpuArray_DIMS(%(sm)s)[0]);
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_SetString(PyExc_ValueError,
                            "dnll.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(%(sm)s)[0] !=
            PyGpuArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "sm.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if ((NULL == %(dx)s)
            || (PyGpuArray_DIMS(%(dx)s)[0] !=
                PyGpuArray_DIMS(%(sm)s)[0])
            || (PyGpuArray_DIMS(%(dx)s)[1] !=
                PyGpuArray_DIMS(%(sm)s)[1]))
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
                                 %(typecode_dx)s, GA_C_ORDER,
                                 %(ctx)s, Py_None);
            if (!%(dx)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
            ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
            ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
            ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
            ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
            ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
            ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(dx)s)[0],
                (void *)&PyGpuArray_DIMS(%(dx)s)[1],
#.........这里部分代码省略.........
开发者ID:Abioy,项目名称:Theano,代码行数:101,代码来源:nnet.py

示例5: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(node.inputs[0].dtype)
        load_b = load_w(node.inputs[1].dtype)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(node.outputs[0].dtype)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)

        ctype = gpuarray.dtype_to_ctype(work_sm)

        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
        code = """
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf[tx] += %(load_b)s(b[tx * sb0]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){;
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """ % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
#.........这里部分代码省略.........
开发者ID:rezaprimasatya,项目名称:Theano,代码行数:101,代码来源:nnet.py

示例6: gpu_kernels

    def gpu_kernels(self, node, nodename):
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type]
        common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type]

        # prepare "$" macros
        if device_type == b'cuda':
            ndim = node.inputs[0].ndim
            dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i for i in range(ndim))
            dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i for i in range(ndim))
            src_strides_code = ''.join('ssize_t src_strides_%d, ' % i for i in range(ndim))
            set_slice_code = '''
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
                dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '',
                dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
            set_slice_code = ''.join(
                set_slice_code % dict(i=j) for j in range(1, ndim))
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
                dstv='INPUT_TYPE *dstv,' if self.return_values else '',
                dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
                dstv_strides=dstv_strides_code if self.return_values else '',
                dsti_strides=dsti_strides_code if self.return_indices else '',
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim),
                use_half=int(node.inputs[0].dtype == 'float16')
                )
        elif device_type == b'opencl':
            raise NotImplementedError()

        # setup parameters
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        param_types.append(ga.SIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size

        # load and compile kernels
        with open(os.path.join(
            os.path.dirname(__file__), 'c_code', 'topk_common' + common_ext
        )) as f:
            common_src = f.read()

        kernels = []

        def build_kernel(fname, kname, subs):
            with open(os.path.join(
                os.path.dirname(__file__), 'c_code', fname)
            ) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=Template(common_src + kernel_src).substitute(**subs),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename)
            return ker

        subs['count_t'] = 'int'
        kernels.append(
            build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs))
        subs['kname'] = 'k_topk_dense_large'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs))
        subs['count_t'] = 'long long'
        subs['kname'] = 'k_topk_dense_xlarge'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs))
        return kernels
开发者ID:gvtulder,项目名称:Theano,代码行数:83,代码来源:sort.py

示例7: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(work_x)
        type_b = gpuarray.dtype_to_ctype(work_b)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            const %(type_x)s* x_data, const ga_size offset_x,
            const ga_ssize xs0, const ga_ssize xs1,
            const %(type_b)s* b, const ga_size offset_b,
            const ga_ssize bs0,
            const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
            const ga_ssize y_idxs0,
            %(type_x)s* nll_data, const ga_size offset_nll,
            const ga_ssize nlls0,
            %(type_x)s* sm_data, const ga_size offset_sm,
            const ga_ssize sms0, const ga_ssize sms1,
            %(type_y_idx)s* am_data, const ga_size offset_am,
            const ga_ssize ams0)
        {
          x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
          b = (const %(type_b)s *)(((char *)b)+offset_b);
          y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
          nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
          sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
          am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);

          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const %(type_x)s* x = x_data + xs0 * row;
            const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
            %(type_x)s* sm = sm_data + sms0 * row;

            %(type_x)s sum = 0.0;
            int row_max_j = 0;
            %(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
            for (int j = 1; j < N; ++j)
            {
                %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
                                    %(load_b)s(b[j*bs0]);
                //todo: store to shared memory
                row_max_j = (row_ij > row_max) ? j : row_max_j;
                row_max   = (row_ij > row_max) ? row_ij : row_max;
            }
            //compute the exp
            for (int j = 0; j < N; ++j)
            {
                %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
                                    %(load_b)s(b[j*bs0]);
                %(type_x)s sm_ij = exp(row_ij - row_max);
                sum += sm_ij;
                sm[j * sms1] = %(write_x)s(sm_ij);
            }
            %(type_x)s sum_inv = 1.0 / sum;
            for (int j = 0; j < N; ++j)
            {
                %(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
                __tmp *= sum_inv;
                sm[j * sms1] = %(write_x)s(__tmp);
            }
            if ((y_idx >= N) || (y_idx < 0))
            {
                //TODO: set raise an error bit in a global var?
                nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least...
            }
            else
            {
                nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
                           - %(load_b)s(b[y_idx*bs0])
                           + row_max
                           + log(sum));
            }
            am_data[row*ams0] = row_max_j;
          }
        }
        """ % locals(), file=sio)
        params = [
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp'
            ]
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
开发者ID:ballasn,项目名称:Theano,代码行数:99,代码来源:nnet.py

示例8: c_code


#.........这里部分代码省略.........
                %(fail)s;
            }
        }
            """ % locals()
            emitted_inames[iname] = True
        #check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            typecode = dtype_to_typecode(node.outputs[idx].dtype)
            if idx not in self.inplace_pattern.keys():
                code += """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                            %(typecode)s, GA_C_ORDER,
                            pygpu_default_context(), Py_None);
            if (!%(oname)s) {
                        //TODO, this check don't seam good.
                        //TODO, set exception?
                            %(fail)s
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        z = outputs[0]
        code += """numEls = PyGpuArray_SIZE(%(z)s);

        //first use at least a full warp
        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE

        //next start adding multiprocessors
        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
        n_blocks = std::min(numEls/threads_per_block +
                               (numEls %% threads_per_block?1:0),
                           (size_t)30);

        // next start adding more warps per multiprocessor
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);

                //std::cerr << "calling callkernel returned\\n";
        """ % locals()

        code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
        param = []
        for i in range(nd):
            param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
                                                              i=i))
        for n, (name, var) in enumerate(zip(inputs + outputs,
                                       node.inputs + node.outputs)):
            if (n - len(inputs)) in self.inplace_pattern:
                continue
            dtype = dtype_to_ctype(var.dtype)
            param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
            param.append("%(name)s->ga.offset" % locals())
            for i in range(nd):
                param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
        code += ',\n'.join(param) + ");\n"
        if config.gpuarray.sync:
            code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
        return str(code)
开发者ID:Donghuan,项目名称:Theano,代码行数:101,代码来源:elemwise.py

示例9: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """
/*
 * This is an atomicAdd that works for doubles since that is not provided
 * natively by cuda before arch 6.0.
 */
#if __CUDA_ARCH__ < 600
__device__ ga_double atomicAdd(ga_double* address, ga_double val) {
    ga_ulong *address_as_ull = (ga_ulong *)address;
    ga_ulong old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                        __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif

__device__ ga_double atomicExch(ga_double *address, ga_double val) {
    return atomicExch((ga_ulong *)address,
                      __double_as_longlong(val));
}

/* GA_LONG */

__device__ ga_long atomicAdd(ga_long* address, ga_long val) {
    ga_ulong *address_as_ull = (ga_ulong *)address;
    ga_ulong old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        (ga_ulong)(val + (ga_long)assumed));
    } while (assumed != old);
    return (ga_long)old;
}

__device__ ga_long atomicExch(ga_long *address, ga_long val) {
    return (ga_long)atomicExch((ga_ulong *)address, (ga_ulong)val);
}


/* GA_HALF */

/*
 * This may read and write 2 bytes more than the size of the array
 * if the array has an uneven number of elements.  The actual value
 * at that spot will not be modified.
 */
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, sum, new_;
  old = *base;
  do {
    assumed = old;
    sum = __float2half_rn(
      __half2float(val) +
      __half2float((ga_half)__byte_perm(old, 0,
                     ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
    new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, new_;
  old = *base;
  do {
    assumed = old;
    new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      %(type_y)s *Y,
#.........这里部分代码省略.........
开发者ID:juancamilog,项目名称:Theano,代码行数:101,代码来源:subtensor.py

示例10: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        mode = self.mode
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """
// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_idx_shift_x = c/2;
            const ga_int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            ga_int i = LID_1;     // loop over c
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                ga_int j = LID_0;  // loop over d
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    ga_int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }""" % locals()
        params = [
            'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
            'intc', 'intc', 'intc', 'intc',
            'uintp', 'uintp', 'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """
        KERNEL void %(kname)s(
#.........这里部分代码省略.........
开发者ID:ChinaQuants,项目名称:Theano,代码行数:101,代码来源:neighbours.py

示例11: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        if node.inputs[0].type.context.kind != b'cuda':
            f = ''
        else:
            f = '' if dtype_x == 'float64' else 'f'
        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
        ]
        sio = StringIO()
        print("""#include "cluda.h"

        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
            GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
            GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
            GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
            GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
        {
          x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
          b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
          y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
          nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
          sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
          am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);
          for (ga_int row = GID_0; row < M; row += GDIM_0){
            GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
            GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
            GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM ga_int row_max_threadIdx;
            %(work_x)s per_thread_row_max, per_thread_sum;
            ga_int per_thread_row_max_j;
            // COMPUTE ROW MAX AND ARGMAX
            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[LID_0] = per_thread_row_max;
            local_barrier();
            if (LID_0 == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (ga_int j = 0; j < LDIM_0; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }
            local_barrier();
            // The thread with the highest max writes out which of its
            // values was the winner.
            if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }
            per_thread_values[LID_0] = per_thread_sum;
            local_barrier();
            if (LID_0 == 0) {
              sum = 0.0;
              for (ga_int j = 0; j < LDIM_0; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }
            local_barrier();
#.........这里部分代码省略.........
开发者ID:DEVESHTARASIA,项目名称:Theano,代码行数:101,代码来源:nnet.py

示例12: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const ga_int set_instead_of_inc,
                                      GLOBAL_MEM ga_int *err)
        {
             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
                   tc=np.dtype(dtype_x).char)
        from pygpu.gpuarray import SIZE, SSIZE
        params = [
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32',
            gpuarray.GpuArray]
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]
开发者ID:EugenePY,项目名称:Theano,代码行数:67,代码来源:subtensor.py

示例13: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        mode = self.mode
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = (
            """
// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const size_t stride0, const size_t stride1,
            const size_t stride2, const size_t stride3,
            const %(type_ten4)s * global_ten4, const size_t offset_ten4,
            const size_t out_s0, const size_t out_s1,
            %(type_z)s * global_out, const size_t offset_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
            global_out = (%(type_z)s *)(((char *)global_out)+offset_out);

            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            int i = threadIdx.y;     // loop over c
                            {
                                int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                int j = threadIdx.x;  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    int z_col = j + d * i;
                                    int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }"""
            % locals()
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
#.........这里部分代码省略.........
开发者ID:kelvinxu,项目名称:Theano,代码行数:101,代码来源:neighbours.py

示例14: inline_reduce_fixed_shared

def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b='', load_b='', dtype='float32'):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer of size warpSize * sizeof(dtype).
    x
        Input data.
    stride_x
        Input data stride.
    load_x
        Wrapper to read from x.
    pos
        Index of executing thread.
    count
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the output.
    manner_fn
        A function that accepts strings of arguments a and b, and
        returns c code for their reduction.

          return "%(a)s + %(b)s"

        for a sum reduction.
    manner_init
        A function that accepts strings of arguments a and return c
        code for its initialization.

    Notes
    -----
    `buf` should be in gpu shared memory, we access it many times.

    """
    if b:
        init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
                           " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
                                          "%(load_b)s(%(b)s[i * %(stride_b)s])" %
                                          locals()))
    else:
        init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals())
        loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
                                                 locals()))
    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
                           "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))

    ctype = gpuarray.dtype_to_ctype(dtype)
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        %(ctype)s red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
          red = %(loop_line)s;
        }
        buf[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
#.........这里部分代码省略.........
开发者ID:12190143,项目名称:Theano,代码行数:101,代码来源:kernel_codegen.py

示例15: gpu_kernels

    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        dtype_out = node.outputs[0].dtype
        itemsize_x = numpy.dtype(dtype_x).itemsize
        itemsize_y = numpy.dtype(dtype_y).itemsize
        itemsize_ind = numpy.dtype(dtype_ind).itemsize
        itemsize_out = numpy.dtype(dtype_out).itemsize
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        type_out = gpuarray.dtype_to_ctype(dtype_out)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """
/*
 * This is an atomicAdd that works for doubles since that is not provided
 * natively by cuda.
 */
__device__ double atomicAdd(ga_double* address, ga_double val) {
    unsigned long long int* address_as_ull =
                                          (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                        __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}

/*
 * This is a version of atomicAdd that works for half-floats.  It may
 * read and write 2 bytes more than the size of the array if the array
 * has an uneven number of elements.  The actual value at that spot
 * will not be modified.
 */

__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, sum, new_;
  old = *base;
  do {
    assumed = old;
    sum = __float2half_rn(
      __half2float(val) +
      __half2float((ga_half)__byte_perm(old, 0,
                     ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
    new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      ga_int *err)
        {
             X = (%(type_x)s *)(((char *)X)+offset_X);
             Y = (%(type_y)s *)(((char *)Y)+offset_Y);
             indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
             for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
             {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % locals()
        params = [
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
#.........这里部分代码省略.........
开发者ID:aalmah,项目名称:Theano,代码行数:101,代码来源:subtensor.py


注:本文中的pygpu.gpuarray.dtype_to_ctype函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。