当前位置: 首页>>代码示例>>Python>>正文


Python cuda.syncthreads函数代码示例

本文整理汇总了Python中numba.cuda.syncthreads函数的典型用法代码示例。如果您正苦于以下问题:Python syncthreads函数的具体用法?Python syncthreads怎么用?Python syncthreads使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了syncthreads函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: jacobi_relax_core

def jacobi_relax_core(A, Anew, error):
    smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f8)
    n = A.shape[0]
    m = A.shape[1]

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    j = ty + cuda.blockIdx.y * cuda.blockDim.y
    i = tx + cuda.blockIdx.x * cuda.blockDim.x

    sy = ty + 1
    sx = tx + 1

    smem[sy, sx] = A[j, i]
    if tx == 0 and i >= 1:
        smem[sy, 0] = A[j, i - 1]

    if ty == 0 and j < m - 1:
        smem[0, sx] = A[j - 1, i]

    if tx == 31 and j >= 1:
        smem[sy, 33] = A[j, i + 1]

    if ty == 31 and j < n - 1:
        smem[33, sx] = A[j + 1, i]

    cuda.syncthreads() # ensure smem is visible by all threads in the block

    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * ( smem[sy, sx + 1] + smem[sy, sx - 1] \
                            + smem[sy - 1, sx] + smem[sy + 1, sx])
        error[j, i] = Anew[j, i] - A[j, i]
开发者ID:FedericoStra,项目名称:numba,代码行数:33,代码来源:laplace2d-numba-cuda-smem.py

示例2: _getOccupancyCUDAkernel

def _getOccupancyCUDAkernel(occus, coords, centers, channelsigmas, trunc):
    centeridx = cuda.blockIdx.x
    blockidx = cuda.blockIdx.y
    atomidx = (cuda.threadIdx.x + (cuda.blockDim.x * blockidx))

    if atomidx >= coords.shape[0] or centeridx >= centers.shape[0]:
        return

    # TODO: Can remove this. Barely any speedup
    centcoor = cuda.shared.array(shape=(3), dtype=numba.float32)
    centcoor[0] = centers[centeridx, 0]
    centcoor[1] = centers[centeridx, 1]
    centcoor[2] = centers[centeridx, 2]
    cuda.syncthreads()

    dx = coords[atomidx, 0] - centcoor[0]
    dy = coords[atomidx, 1] - centcoor[1]
    dz = coords[atomidx, 2] - centcoor[2]
    d2 = dx * dx + dy * dy + dz * dz
    if d2 >= trunc:
        return

    d1 = 1 / sqrt(d2)
    for h in range(channelsigmas.shape[1]):
        if channelsigmas[atomidx, h] == 0:
            continue
        x = channelsigmas[atomidx, h] * d1
        value = 1 - exp(-(x ** 12))
        cuda.atomic.max(occus, (centeridx, h), value)
开发者ID:alejandrovr,项目名称:htmd,代码行数:29,代码来源:voxeldescriptors.py

示例3: cu_square_matrix_mul

        def cu_square_matrix_mul(A, B, C):
            sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y
            bx = cuda.blockIdx.x
            by = cuda.blockIdx.y
            bw = cuda.blockDim.x
            bh = cuda.blockDim.y

            x = tx + bx * bw
            y = ty + by * bh

            acc = float32(0)  # forces all the math to be f32
            for i in range(bpg):
                if x < n and y < n:
                    sA[ty, tx] = A[y, tx + i * tpb]
                    sB[ty, tx] = B[ty + i * tpb, x]

                cuda.syncthreads()

                if x < n and y < n:
                    for j in range(tpb):
                        acc += sA[ty, j] * sB[j, tx]

                cuda.syncthreads()

            if x < n and y < n:
                C[y, x] = acc
开发者ID:stuartarchibald,项目名称:numba,代码行数:30,代码来源:test_matmul.py

示例4: simple_smem

def simple_smem(ary):
    sm = cuda.shared.array(N, int32)
    i = cuda.grid(1)
    if i == 0:
        for j in range(N):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]
开发者ID:ASPP,项目名称:numba,代码行数:8,代码来源:test_globals.py

示例5: atomic_add_double

def atomic_add_double(idx, ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float64)
    sm[tid] = 0.0
    cuda.syncthreads()
    bin = idx[tid] % 32
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
开发者ID:cpcloud,项目名称:numba,代码行数:9,代码来源:test_atomics.py

示例6: atomic_add3

def atomic_add3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), uint32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, uint64(ty)), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
开发者ID:MJJoyce,项目名称:numba,代码行数:9,代码来源:test_atomics.py

示例7: atomic_add

def atomic_add(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, uint32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = ary[tid] % 32
    cuda.atomic.add(sm, bin, 1)
    cuda.syncthreads()
    ary[tid] = sm[tid]
开发者ID:MJJoyce,项目名称:numba,代码行数:9,代码来源:test_atomics.py

示例8: atomic_add_float

def atomic_add_float(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = int(ary[tid] % 32)
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
开发者ID:cpcloud,项目名称:numba,代码行数:9,代码来源:test_atomics.py

示例9: atomic_add_float_2

def atomic_add_float_2(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), float32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, ty), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
开发者ID:cpcloud,项目名称:numba,代码行数:9,代码来源:test_atomics.py

示例10: idx_kernel

def idx_kernel(arr):
    s = cuda.shared.array(shape=maxThread, dtype=int32)

    idx = cuda.grid(1)
    if idx < arr.shape[0]:
        s[cuda.threadIdx.x] = 1

    cuda.syncthreads()

    if idx < arr.shape[0]:
        cuda.atomic.add(arr, s[cuda.threadIdx.x], 1)
开发者ID:CyberIntelMafia,项目名称:KaggleMalware,代码行数:11,代码来源:test_sync_cuda.py

示例11: atomic_max_double_shared

def atomic_max_double_shared(res, ary):
    tid = cuda.threadIdx.x
    smary = cuda.shared.array(32, float64)
    smary[tid] = ary[tid]
    smres = cuda.shared.array(1, float64)
    if tid == 0:
        smres[0] = res[0]
    cuda.syncthreads()
    cuda.atomic.max(smres, 0, smary[tid])
    cuda.syncthreads()
    if tid == 0:
        res[0] = smres[0]
开发者ID:cpcloud,项目名称:numba,代码行数:12,代码来源:test_atomics.py

示例12: problematic

        def problematic(x, y):
            tid = cuda.threadIdx.x
            ntid = cuda.blockDim.x

            if tid > 12:
                for i in range(ntid):
                    y[i] += x[i] // y[i]

            cuda.syncthreads()
            if tid < 17:
                for i in range(ntid):
                    x[i] += x[i] // y[i]
开发者ID:cpcloud,项目名称:numba,代码行数:12,代码来源:test_exception.py

示例13: device_reduce_full_block

    def device_reduce_full_block(arr, partials, sm_partials):
        """
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space.  The algorithm goes like:

            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                        block-0:  |   x |     |     |   x |     |
                        block-1:  |     |   x |     |     |   x |
                        block-2:  |     |     |   x |     |     |

        The array is divided into chunks of 128 (size of a threadblock).
        The threadblocks consumes the chunks in roundrobin scheduling.
        First, a threadblock loads a chunk into temp memory.  Then, all
        subsequent chunks are combined into the temp memory.

        Once all chunks are processed.  Inner-block reduction is performed
        on the temp memory.  So that, there will just be one scalar result
        per block.  The result from each block is stored to `partials` at
        the dedicated slot.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        gridsz = cuda.gridDim.x

        # block strided loop to compute the reduction
        start = tid + blksz * blkid
        stop = arr.size
        step = blksz * gridsz

        # load first value
        tmp = arr[start]
        # loop over all values in block-stride
        for i in range(start + step, stop, step):
            tmp = reduce_op(tmp, arr[i])

        cuda.syncthreads()
        # inner-warp reduction
        inner_warp_reduction(sm_partials, tmp)

        cuda.syncthreads()
        # at this point, only the first slot for each warp in tsm_partials
        # is valid.

        # finish up block reduction
        # warning: this is assuming 4 warps.
        # assert numwarps == 4
        if tid < 2:
            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                            sm_partials[tid + 2, 0])
        if tid == 0:
            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
开发者ID:cpcloud,项目名称:numba,代码行数:52,代码来源:reduction.py

示例14: oracle

        def oracle(x, y):
            tid = cuda.threadIdx.x
            ntid = cuda.blockDim.x

            if tid > 12:
                for i in range(ntid):
                    if y[i] != 0:
                        y[i] += x[i] // y[i]

            cuda.syncthreads()
            if tid < 17:
                for i in range(ntid):
                    if y[i] != 0:
                        x[i] += x[i] // y[i]
开发者ID:cpcloud,项目名称:numba,代码行数:14,代码来源:test_exception.py

示例15: gpu_unique_k

        def gpu_unique_k(arr, k, out, outsz_ptr):
            """
            Note: run with small blocks.
            """
            tid = cuda.threadIdx.x
            blksz = cuda.blockDim.x
            base = 0

            # shared memory
            vset_size = 0
            sm_mem_size = MAX_FAST_UNIQUE_K
            vset = cuda.shared.array(sm_mem_size, dtype=nbtype)
            share_vset_size = cuda.shared.array(1, dtype=int32)
            share_loaded = cuda.shared.array(sm_mem_size, dtype=nbtype)
            sm_mem_size = min(k, sm_mem_size)

            while vset_size < sm_mem_size and base < arr.size:
                pos = base + tid
                valid_load = min(blksz, arr.size - base)
                # load
                if tid < valid_load:
                    share_loaded[tid] = arr[pos]
                # wait for load to complete
                cuda.syncthreads()
                # thread-0 inserts
                if tid == 0:
                    for i in range(valid_load):
                        val = share_loaded[i]
                        new_size = gpu_unique_set_insert(vset, vset_size, val)
                        if new_size >= 0:
                            vset_size = new_size
                        else:
                            vset_size = sm_mem_size + 1
                    share_vset_size[0] = vset_size
                # wait until the insert is done
                cuda.syncthreads()
                vset_size = share_vset_size[0]
                # increment
                base += blksz

            # output
            if vset_size <= sm_mem_size:
                for i in range(tid, vset_size, blksz):
                    out[i] = vset[i]
                if tid == 0:
                    outsz_ptr[0] = vset_size
            else:
                outsz_ptr[0] = -1
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:48,代码来源:cudautils.py


注:本文中的numba.cuda.syncthreads函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。