本文整理汇总了Python中numba.cuda.syncthreads函数的典型用法代码示例。如果您正苦于以下问题：Python syncthreads函数的具体用法？Python syncthreads怎么用？Python syncthreads使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了syncthreads函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: jacobi_relax_core
def jacobi_relax_core(A, Anew, error):
    """Run one Jacobi relaxation step on ``A`` through a shared-memory tile.

    Every thread stages its own cell ``A[j, i]`` into a (32 + 2) x (32 + 2)
    shared tile; threads on the edge of the thread block additionally stage
    the neighbouring halo cell. After a block-wide barrier, each point that is
    not on the array boundary stores the mean of its four neighbours in
    ``Anew`` and the difference to the old value in ``error``.

    Parameters
    ----------
    A : 2D array
        Input grid; ``A.shape[0]`` rows (index ``j``) by ``A.shape[1]``
        columns (index ``i``).
    Anew : 2D array
        Receives the relaxed value at every interior point.
    error : 2D array
        Receives ``Anew[j, i] - A[j, i]`` at every interior point.

    Launch requirements: a 2D thread block of at most 32 x 32 threads (the
    shared tile has a fixed size); ``x`` maps to columns and ``y`` to rows.
    """
    smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f8)
    n = A.shape[0]
    m = A.shape[1]

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    j = ty + cuda.blockIdx.y * cuda.blockDim.y
    i = tx + cuda.blockIdx.x * cuda.blockDim.x

    # Tile coordinates are shifted by one to leave room for the halo ring.
    sy = ty + 1
    sx = tx + 1

    # Index of the last thread along each block axis; these threads own the
    # right / bottom halo (column / row 33 for a full 32 x 32 block).
    last_tx = cuda.blockDim.x - 1
    last_ty = cuda.blockDim.y - 1

    # Threads that fall outside the array (grid not an exact multiple of the
    # block) must not touch global memory, but still reach the barrier below.
    if j < n and i < m:
        smem[sy, sx] = A[j, i]

        # Each halo load is guarded by the existence of the neighbour that is
        # actually read, i.e. the guard uses the same axis as the offset.
        if tx == 0 and i >= 1:
            smem[sy, 0] = A[j, i - 1]

        if ty == 0 and j >= 1:
            smem[0, sx] = A[j - 1, i]

        if tx == last_tx and i < m - 1:
            smem[sy, sx + 1] = A[j, i + 1]

        if ty == last_ty and j < n - 1:
            smem[sy + 1, sx] = A[j + 1, i]

    cuda.syncthreads()  # ensure smem is visible by all threads in the block

    # Only interior points are updated; boundary rows/columns are left as-is.
    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * (smem[sy, sx + 1] + smem[sy, sx - 1]
                             + smem[sy - 1, sx] + smem[sy + 1, sx])
        error[j, i] = Anew[j, i] - A[j, i]
示例2: _getOccupancyCUDAkernel
def _getOccupancyCUDAkernel(occus, coords, centers, channelsigmas, trunc):
    """Accumulate, per centre and channel, the maximum occupancy over atoms.

    ``blockIdx.x`` selects the centre; ``blockIdx.y * blockDim.x + threadIdx.x``
    selects the atom. For every channel ``h`` with a non-zero sigma the thread
    computes ``1 - exp(-(sigma / distance) ** 12)`` and folds it into
    ``occus[centeridx, h]`` with an atomic max.

    Parameters
    ----------
    occus : 2D array, indexed ``[centre, channel]``; updated in place.
    coords : 2D array, indexed ``[atom, 0..2]``.
    centers : 2D array, indexed ``[centre, 0..2]``.
    channelsigmas : 2D array, indexed ``[atom, channel]``; a zero entry means
        the atom is skipped for that channel.
    trunc : cutoff compared against the *squared* atom-centre distance.
    """
    centeridx = cuda.blockIdx.x
    blockidx = cuda.blockIdx.y
    atomidx = cuda.threadIdx.x + (cuda.blockDim.x * blockidx)

    # centeridx depends on blockIdx only, so this exit is taken by the whole
    # block or by none of it and cannot leave threads waiting at the barrier.
    if centeridx >= centers.shape[0]:
        return

    # TODO: Can remove this. Barely any speedup
    centcoor = cuda.shared.array(shape=(3), dtype=numba.float32)
    # A single writer is enough; all threads of the block share this centre.
    if cuda.threadIdx.x == 0:
        centcoor[0] = centers[centeridx, 0]
        centcoor[1] = centers[centeridx, 1]
        centcoor[2] = centers[centeridx, 2]
    cuda.syncthreads()

    # The per-thread bounds check comes only after the barrier so that every
    # thread of the block has reached cuda.syncthreads() before any may exit.
    if atomidx >= coords.shape[0]:
        return

    dx = coords[atomidx, 0] - centcoor[0]
    dy = coords[atomidx, 1] - centcoor[1]
    dz = coords[atomidx, 2] - centcoor[2]
    d2 = dx * dx + dy * dy + dz * dz
    if d2 >= trunc:
        return

    d1 = 1 / sqrt(d2)
    for h in range(channelsigmas.shape[1]):
        if channelsigmas[atomidx, h] == 0:
            continue
        x = channelsigmas[atomidx, h] * d1
        value = 1 - exp(-(x ** 12))
        cuda.atomic.max(occus, (centeridx, h), value)
示例3: cu_square_matrix_mul
def cu_square_matrix_mul(A, B, C):
    """Tiled shared-memory product ``C = A * B`` for square matrices.

    The kernel walks ``bpg`` tiles along the shared dimension. In each step
    every thread stages one element of ``A`` and one of ``B`` into shared
    memory, the block synchronises, and each thread accumulates the partial
    dot product for its output cell ``C[y, x]``.

    Reads the names ``n``, ``tpb``, ``bpg`` and ``SM_SIZE`` from the
    enclosing scope (not visible in this block). NOTE(review): presumably
    ``n`` is the matrix size, ``tpb`` the tile edge (threads per block axis),
    ``bpg`` the number of tiles per axis and ``SM_SIZE == (tpb, tpb)`` --
    confirm against the definitions.

    Parameters
    ----------
    A, B : 2D input arrays, indexed ``[row, col]``.
    C : 2D output array; ``C[y, x]`` is written for ``x < n and y < n``.
    """
    sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    x = tx + bx * bw
    y = ty + by * bh

    acc = float32(0)  # forces all the math to be f32
    for i in range(bpg):
        a_col = tx + i * tpb
        b_row = ty + i * tpb

        # Guard each load by the indices of the element actually read, and
        # zero-fill cells outside the matrix. Every tile cell is therefore
        # written on every step, including by threads whose own output cell
        # is out of range, so partial edge tiles contribute exact zeros.
        if y < n and a_col < n:
            sA[ty, tx] = A[y, a_col]
        else:
            sA[ty, tx] = float32(0)

        if b_row < n and x < n:
            sB[ty, tx] = B[b_row, x]
        else:
            sB[ty, tx] = float32(0)

        cuda.syncthreads()  # tile fully staged before anyone reads it

        for j in range(tpb):
            acc += sA[ty, j] * sB[j, tx]

        cuda.syncthreads()  # everyone done reading before the next overwrite

    if x < n and y < n:
        C[y, x] = acc
示例4: simple_smem
def simple_smem(ary):
    """Fill a shared buffer with 0..N-1 and copy it out to ``ary``.

    The thread with global index 0 writes the whole shared buffer; after the
    barrier each thread copies the entry at its own global index into ``ary``.
    ``N`` is read from the enclosing scope.
    """
    gid = cuda.grid(1)
    shared_buf = cuda.shared.array(N, int32)

    if gid == 0:
        # Single writer populates every slot.
        k = 0
        while k < N:
            shared_buf[k] = k
            k += 1

    # Make the writes visible to the rest of the block.
    cuda.syncthreads()
    ary[gid] = shared_buf[gid]
示例5: atomic_add_double
def atomic_add_double(idx, ary):
    """Count, in float64 shared memory, how often each ``idx`` value mod 32 occurs.

    Each thread zeroes its own shared slot, atomically adds 1.0 to the slot
    selected by ``idx[threadIdx.x] % 32``, and finally copies its own slot
    into ``ary``.
    """
    lane = cuda.threadIdx.x
    counts = cuda.shared.array(32, float64)

    counts[lane] = 0.0
    cuda.syncthreads()  # all slots cleared before any increment

    slot = idx[lane] % 32
    cuda.atomic.add(counts, slot, 1.0)
    cuda.syncthreads()  # all increments done before read-back

    ary[lane] = counts[lane]
示例6: atomic_add3
def atomic_add3(ary):
    """Increment every element of ``ary`` by one via a (4, 8) uint32 shared tile.

    The element at ``[threadIdx.x, threadIdx.y]`` is staged into shared
    memory, incremented with an atomic add whose second index is cast to
    ``uint64``, and written back.
    """
    row = cuda.threadIdx.x
    col = cuda.threadIdx.y
    tile = cuda.shared.array((4, 8), uint32)

    tile[row, col] = ary[row, col]
    cuda.syncthreads()

    # The index tuple intentionally mixes the plain row with a uint64 column.
    col64 = uint64(col)
    cuda.atomic.add(tile, (row, col64), 1)
    cuda.syncthreads()

    ary[row, col] = tile[row, col]
示例7: atomic_add
def atomic_add(ary):
    """Replace ``ary`` with a 32-slot count of its values mod 32.

    Each thread clears its shared slot, atomically adds 1 to the slot chosen
    by ``ary[threadIdx.x] % 32``, then overwrites ``ary[threadIdx.x]`` with
    the count held in its own slot.
    """
    lane = cuda.threadIdx.x
    counts = cuda.shared.array(32, uint32)

    counts[lane] = 0
    cuda.syncthreads()  # counters zeroed before counting starts

    slot = ary[lane] % 32
    cuda.atomic.add(counts, slot, 1)
    cuda.syncthreads()  # counting finished before results are copied out

    ary[lane] = counts[lane]
示例8: atomic_add_float
def atomic_add_float(ary):
    """Float32 variant of the 32-slot counting kernel.

    Each thread clears its shared slot, atomically adds 1.0 to the slot given
    by ``int(ary[threadIdx.x] % 32)``, then stores its own slot's total back
    into ``ary[threadIdx.x]``.
    """
    lane = cuda.threadIdx.x
    totals = cuda.shared.array(32, float32)

    totals[lane] = 0
    cuda.syncthreads()  # every slot reset before accumulation

    # The modulo result is converted to an integer so it can be used as an index.
    slot = int(ary[lane] % 32)
    cuda.atomic.add(totals, slot, 1.0)
    cuda.syncthreads()  # accumulation complete before read-back

    ary[lane] = totals[lane]
示例9: atomic_add_float_2
def atomic_add_float_2(ary):
    """Increment every element of ``ary`` by one via a (4, 8) float32 shared tile.

    The element at ``[threadIdx.x, threadIdx.y]`` is copied into shared
    memory, bumped with an atomic add using a 2D index, and copied back.
    """
    row = cuda.threadIdx.x
    col = cuda.threadIdx.y
    tile = cuda.shared.array((4, 8), float32)

    tile[row, col] = ary[row, col]
    cuda.syncthreads()

    cuda.atomic.add(tile, (row, col), 1)
    cuda.syncthreads()

    ary[row, col] = tile[row, col]
示例10: idx_kernel
def idx_kernel(arr):
    """Atomically add 1 to ``arr`` at an index read back from shared memory.

    Every in-range thread stores the value 1 in its shared slot; after the
    barrier it uses that stored value as the index for an atomic add on
    ``arr``. ``maxThread`` (shared buffer length) comes from the enclosing
    scope.
    """
    lane = cuda.threadIdx.x
    slots = cuda.shared.array(shape=maxThread, dtype=int32)
    gid = cuda.grid(1)

    in_range = gid < arr.shape[0]
    if in_range:
        slots[lane] = 1

    # The barrier sits outside the conditional so the whole block reaches it.
    cuda.syncthreads()

    if in_range:
        cuda.atomic.add(arr, slots[lane], 1)
示例11: atomic_max_double_shared
def atomic_max_double_shared(res, ary):
    """Fold ``ary`` into ``res[0]`` with an atomic max held in shared memory.

    Each thread stages ``ary[threadIdx.x]`` into a 32-slot float64 shared
    buffer. Thread 0 seeds a one-element shared accumulator from ``res[0]``;
    all threads then atomically max their staged value into it, and thread 0
    writes the result back to ``res[0]``.
    """
    lane = cuda.threadIdx.x
    is_leader = lane == 0

    staged = cuda.shared.array(32, float64)
    staged[lane] = ary[lane]

    running = cuda.shared.array(1, float64)
    if is_leader:
        running[0] = res[0]
    cuda.syncthreads()  # seed and staged values visible to everyone

    cuda.atomic.max(running, 0, staged[lane])
    cuda.syncthreads()  # all contributions folded in

    if is_leader:
        res[0] = running[0]
示例12: problematic
def problematic(x, y):
    """Two-phase in-place update of ``y`` and ``x`` with unguarded floor division.

    Phase one: every thread with index above 12 walks the first
    ``blockDim.x`` entries and does ``y[k] += x[k] // y[k]``.
    Phase two (after a block barrier): every thread with index below 17 does
    ``x[k] += x[k] // y[k]`` over the same range.

    No check is made for ``y[k] == 0`` before dividing.
    """
    lane = cuda.threadIdx.x
    width = cuda.blockDim.x

    updates_y = lane > 12
    if updates_y:
        for k in range(width):
            y[k] += x[k] // y[k]

    cuda.syncthreads()

    updates_x = lane < 17
    if updates_x:
        for k in range(width):
            x[k] += x[k] // y[k]
示例13: device_reduce_full_block
def device_reduce_full_block(arr, partials, sm_partials):
    """
    Partially reduce `arr` into `partials` using `sm_partials` as working
    space.  The algorithm goes like:

        array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                    block-0:  |   x |     |     |   x |     |
                    block-1:  |     |   x |     |     |   x |
                    block-2:  |     |     |   x |     |     |

    The array is divided into chunks of 128 (size of a threadblock).
    The threadblocks consume the chunks in round-robin order.

    First, each thread loads one value of its block's first chunk.  Then
    the values at the same offset in all subsequent chunks of that block are
    combined into it.

    Once all chunks are processed, an in-block reduction is performed so
    that there is just one scalar result per block.  The result from each
    block is stored to `partials` at slot `blockIdx.x`.

    `reduce_op` and `inner_warp_reduction` are taken from the enclosing
    scope.  The first element (`arr[start]`) is read without a bounds check,
    so every thread's starting index must be inside `arr`.
    """
    tid = cuda.threadIdx.x
    blkid = cuda.blockIdx.x
    blksz = cuda.blockDim.x
    gridsz = cuda.gridDim.x

    # block strided loop to compute the reduction
    start = tid + blksz * blkid
    stop = arr.size
    step = blksz * gridsz

    # load first value
    tmp = arr[start]
    # loop over all values in block-stride
    for i in range(start + step, stop, step):
        tmp = reduce_op(tmp, arr[i])

    cuda.syncthreads()
    # inner-warp reduction
    inner_warp_reduction(sm_partials, tmp)

    cuda.syncthreads()
    # at this point, only the first slot for each warp in sm_partials
    # is valid.

    # finish up block reduction
    # warning: this is assuming 4 warps.
    # assert numwarps == 4
    if tid < 2:
        sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                        sm_partials[tid + 2, 0])

    # Thread 0 is about to read the slot thread 1 has just written.  Threads
    # of one warp are not guaranteed to run in lockstep, so an explicit
    # barrier (placed outside the branch so all threads reach it) is needed
    # to order the write before the read.
    cuda.syncthreads()

    if tid == 0:
        partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
示例14: oracle
def oracle(x, y):
    """Two-phase in-place update of ``y`` and ``x`` that skips zero divisors.

    Phase one: threads with index above 12 do ``y[k] += x[k] // y[k]`` for
    every ``k`` below ``blockDim.x`` where ``y[k]`` is non-zero.
    Phase two (after a block barrier): threads with index below 17 do
    ``x[k] += x[k] // y[k]`` under the same non-zero condition.
    """
    lane = cuda.threadIdx.x
    width = cuda.blockDim.x

    if lane > 12:
        for k in range(width):
            # Skip entries that would divide by zero.
            if y[k] == 0:
                continue
            y[k] += x[k] // y[k]

    cuda.syncthreads()

    if lane < 17:
        for k in range(width):
            if y[k] == 0:
                continue
            x[k] += x[k] // y[k]
示例15: gpu_unique_k
def gpu_unique_k(arr, k, out, outsz_ptr):
    """
    Collect up to ``k`` distinct values of ``arr`` into ``out``.

    Note: run with small blocks.

    The block reads ``arr`` in chunks of ``blockDim.x`` values into shared
    memory; thread 0 then inserts each loaded value into a shared set via
    ``gpu_unique_set_insert`` and publishes the set size to the other threads.
    Scanning stops once the set size reaches ``min(k, MAX_FAST_UNIQUE_K)`` or
    the array is exhausted.

    On exit, if the set size does not exceed that limit the set is copied to
    ``out`` and its size stored in ``outsz_ptr[0]``; otherwise ``outsz_ptr[0]``
    is set to -1.

    ``MAX_FAST_UNIQUE_K``, ``nbtype`` and ``gpu_unique_set_insert`` come from
    the enclosing scope.  NOTE(review): the helper presumably returns the new
    set size, or a negative value when the value cannot be inserted -- confirm
    against its definition.
    """
    lane = cuda.threadIdx.x
    block_width = cuda.blockDim.x
    offset = 0

    # shared memory
    set_size = 0
    vset = cuda.shared.array(MAX_FAST_UNIQUE_K, dtype=nbtype)
    shared_set_size = cuda.shared.array(1, dtype=int32)
    chunk = cuda.shared.array(MAX_FAST_UNIQUE_K, dtype=nbtype)
    limit = min(k, MAX_FAST_UNIQUE_K)

    while set_size < limit and offset < arr.size:
        src = offset + lane
        n_loaded = min(block_width, arr.size - offset)

        # stage this chunk of the input in shared memory
        if lane < n_loaded:
            chunk[lane] = arr[src]
        # wait for the whole chunk to be staged
        cuda.syncthreads()

        # thread 0 alone inserts the staged values into the set
        if lane == 0:
            for n in range(n_loaded):
                inserted_size = gpu_unique_set_insert(vset, set_size,
                                                      chunk[n])
                if inserted_size < 0:
                    # flag overflow with a size just past the limit
                    set_size = limit + 1
                else:
                    set_size = inserted_size
            shared_set_size[0] = set_size
        # wait until the inserts are done, then pick up the published size
        cuda.syncthreads()
        set_size = shared_set_size[0]

        # advance to the next chunk
        offset += block_width

    # output
    if set_size > limit:
        outsz_ptr[0] = -1
    else:
        for n in range(lane, set_size, block_width):
            out[n] = vset[n]
        if lane == 0:
            outsz_ptr[0] = set_size