本文整理汇总了Python中numba.cuda.syncthreads方法的典型用法代码示例。如果您正苦于以下问题:Python cuda.syncthreads方法的具体用法?Python cuda.syncthreads怎么用?Python cuda.syncthreads使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类numba.cuda
的用法示例。
在下文中一共展示了cuda.syncthreads方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: rotate_iou_kernel_eval
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    """CUDA kernel: rotated-box IoU for every (box, query-box) pair.

    Args:
        N: number of boxes in ``dev_boxes``.
        K: number of boxes in ``dev_query_boxes``.
        dev_boxes: flat device buffer, 5 floats per box (N * 5 values).
        dev_query_boxes: flat device buffer, 5 floats per box (K * 5 values).
        dev_iou: flat output buffer receiving N * K IoU values.
        criterion: forwarded unchanged to ``devRotateIoUEval`` (defined
            elsewhere in this file).

    Each 64-thread block fills one 64x64 tile of the N x K result matrix;
    blockIdx.x selects the row tile, blockIdx.y the column tile.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Tile sizes, clamped at the ragged right/bottom edges of the matrix.
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    # Shared-memory staging: 64 boxes x 5 floats each.
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Ensure the whole tile is staged before any thread reads it.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (row_start * threadsPerBlock * K
                      + col_start * threadsPerBlock + tx * K + i)
            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                               block_boxes[tx * 5:tx * 5 + 5],
                                               criterion)
示例2: rotate_iou_kernel_eval
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou,
                           criterion=-1):
    """CUDA kernel: rotated-box IoU between ``dev_boxes`` and ``dev_query_boxes``.

    Boxes are stored flat, 5 floats per box; ``dev_iou`` receives N * K
    values. ``criterion`` is passed through to ``devRotateIoUEval``
    (defined elsewhere in this file). One 64-thread block computes one
    64x64 tile of the result matrix.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Clamp tile extents at the matrix boundary.
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Barrier: all shared-memory writes must land before the reads below.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = row_start * threadsPerBlock * K + col_start * \
                threadsPerBlock + tx * K + i
            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                               block_boxes[tx * 5:tx * 5 + 5],
                                               criterion)
示例3: rotate_iou_kernel_eval
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    """CUDA kernel: rotated-box IoU matrix (snake_case variant).

    Same tiled scheme as the other variants: flat 5-float boxes, one
    64-thread block per 64x64 tile of the N x K output. The IoU itself is
    delegated to ``dev_rotate_iou_eval`` (defined elsewhere in this file).
    """
    threads_per_block = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Edge tiles may be smaller than a full 64x64 block.
    row_size = min(N - row_start * threads_per_block, threads_per_block)
    col_size = min(K - col_start * threads_per_block, threads_per_block)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    dev_query_box_idx = threads_per_block * col_start + tx
    dev_box_idx = threads_per_block * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Synchronize so every thread sees the fully-staged tile.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (row_start * threads_per_block * K
                      + col_start * threads_per_block + tx * K + i)
            dev_iou[offset] = dev_rotate_iou_eval(
                block_qboxes[i * 5:i * 5 + 5], block_boxes[tx * 5:tx * 5 + 5], criterion
            )
示例4: nms_kernel_v2
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel_v2(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """CUDA kernel: one NMS bitmask tile over a 2-D (n_boxes, 5) box array.

    Each block compares up to 64 "row" boxes against up to 64 "column"
    boxes via ``iou_device`` (defined elsewhere) and records, per row box,
    a 64-bit mask of suppressed column boxes in ``dev_mask``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    # NOTE(review): numba requires a compile-time-constant shape here;
    # passing the local ``threadsPerBlock`` relies on constant propagation —
    # verify this compiles on the targeted numba version.
    block_boxes = cuda.shared.array(
        shape=(threadsPerBlock, 5), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx, 0] = dev_boxes[dev_box_idx, 0]
        block_boxes[tx, 1] = dev_boxes[dev_box_idx, 1]
        block_boxes[tx, 2] = dev_boxes[dev_box_idx, 2]
        block_boxes[tx, 3] = dev_boxes[dev_box_idx, 3]
        block_boxes[tx, 4] = dev_boxes[dev_box_idx, 4]
    # All column boxes staged before any thread compares against them.
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        t = 0
        start = 0
        if row_start == col_start:
            # Diagonal tile: only compare against later boxes to avoid
            # self-comparison and double suppression.
            start = tx + 1
        for i in range(start, col_size):
            if (iou_device(dev_boxes[cur_box_idx], block_boxes[i]) >
                    nms_overlap_thresh):
                t |= 1 << i
        # Ceil division: number of 64-box column tiles.
        col_blocks = (n_boxes // threadsPerBlock
                      + (n_boxes % threadsPerBlock > 0))
        dev_mask[cur_box_idx * col_blocks + col_start] = t
示例5: nms_kernel
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """CUDA kernel: one NMS bitmask tile over a flat box buffer.

    ``dev_boxes`` holds 5 floats per box (presumably [x1, y1, x2, y2,
    score] — only the first 4 are handed to ``iou_device``; verify against
    the device helper). For each row box the kernel writes a 64-bit
    suppression mask covering one 64-box column tile into ``dev_mask``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Barrier: column tile fully staged in shared memory.
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        t = 0
        start = 0
        if row_start == col_start:
            # Diagonal tile: skip self and earlier boxes.
            start = tx + 1
        for i in range(start, col_size):
            iou = iou_device(dev_boxes[cur_box_idx * 5:cur_box_idx * 5 + 4],
                             block_boxes[i * 5:i * 5 + 4])
            if iou > nms_overlap_thresh:
                t |= 1 << i
        # Ceil division: number of 64-box column tiles per row.
        col_blocks = (n_boxes // threadsPerBlock
                      + (n_boxes % threadsPerBlock > 0))
        dev_mask[cur_box_idx * col_blocks + col_start] = t
示例6: rotate_nms_kernel
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """CUDA kernel: one rotated-NMS bitmask tile over a flat box buffer.

    ``dev_boxes`` holds 6 floats per box; only the first 5 are passed to
    ``devRotateIoU`` (presumably geometry, with the 6th a score — confirm
    against the caller). Per row box, a 64-bit suppression mask for one
    64-box column tile is written into ``dev_mask``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    # Shared staging: 64 boxes x 6 floats each.
    block_boxes = cuda.shared.array(shape=(64 * 6, ), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx * 6 + 0] = dev_boxes[dev_box_idx * 6 + 0]
        block_boxes[tx * 6 + 1] = dev_boxes[dev_box_idx * 6 + 1]
        block_boxes[tx * 6 + 2] = dev_boxes[dev_box_idx * 6 + 2]
        block_boxes[tx * 6 + 3] = dev_boxes[dev_box_idx * 6 + 3]
        block_boxes[tx * 6 + 4] = dev_boxes[dev_box_idx * 6 + 4]
        block_boxes[tx * 6 + 5] = dev_boxes[dev_box_idx * 6 + 5]
    # Barrier: column tile fully staged before comparisons start.
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        t = 0
        start = 0
        if row_start == col_start:
            # Diagonal tile: only compare against later boxes.
            start = tx + 1
        for i in range(start, col_size):
            iou = devRotateIoU(dev_boxes[cur_box_idx * 6:cur_box_idx * 6 + 5],
                               block_boxes[i * 6:i * 6 + 5])
            if iou > nms_overlap_thresh:
                t |= 1 << i
        # Ceil division: number of 64-box column tiles per row.
        col_blocks = (n_boxes // threadsPerBlock
                      + (n_boxes % threadsPerBlock > 0))
        dev_mask[cur_box_idx * col_blocks + col_start] = t
示例7: rotate_iou_kernel
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel(N, K, dev_boxes, dev_query_boxes, dev_iou):
    """CUDA kernel: rotated-box IoU between every box and query box.

    Flat buffers, 5 floats per box; ``dev_iou`` receives N * K values.
    Identical tiling to ``rotate_iou_kernel_eval`` but delegates to
    ``devRotateIoU`` (defined elsewhere) with no ``criterion`` argument.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Clamp tile extents at the matrix boundary.
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Barrier before any thread reads the staged tile.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (row_start * threadsPerBlock * K
                      + col_start * threadsPerBlock + tx * K + i)
            dev_iou[offset] = devRotateIoU(block_qboxes[i * 5:i * 5 + 5],
                                           block_boxes[tx * 5:tx * 5 + 5])
示例8: rotate_iou_kernel_eval
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N,
                           K,
                           dev_boxes,
                           dev_query_boxes,
                           dev_iou,
                           criterion=-1):
    """CUDA kernel: rotated-box IoU matrix with a selectable criterion.

    Flat buffers, 5 floats per box; each 64-thread block computes one
    64x64 tile of the N x K output. ``criterion`` is forwarded to
    ``devRotateIoUEval`` (defined elsewhere in this file).
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Edge tiles may be smaller than 64x64.
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Barrier: staging complete before comparisons.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (row_start * threadsPerBlock * K
                      + col_start * threadsPerBlock + tx * K + i)
            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                               block_boxes[tx * 5:tx * 5 + 5],
                                               criterion)
示例9: mat3_mul_kernel
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def mat3_mul_kernel(m0, m1, out):
    """CUDA kernel: per-(i, j) matrix product via the ``mat3_mul`` helper.

    For each 2-D grid point (i, j), multiplies m0[i, j] by m1[i, j] into
    out[i, j] using ``mat3_mul`` (device function defined elsewhere).
    Arrays are indexed as 4-D; the trailing dims are presumably 3x3 —
    only the helper's name suggests it, TODO confirm.
    """
    i, j = cuda.grid(2)
    # Guard against padding threads when the launch grid over-provisions.
    if i < out.shape[0] and j < out.shape[1]:
        m0_mat = m0[i, j, :, :]
        m1_mat = m1[i, j, :, :]
        out_mat = out[i, j, :, :]
        mat3_mul(m0_mat, m1_mat, out_mat)
示例10: nms_kernel_v2
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel_v2(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """CUDA kernel: NMS bitmask tile over a 2-D (n_boxes, 5) box array.

    Each block compares up to 64 row boxes against a 64-box column tile
    using ``iou_device`` (defined elsewhere); per row box, a 64-bit
    suppression mask is written into ``dev_mask``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    # NOTE(review): numba wants a compile-time-constant shared-array shape;
    # using the local ``threadsPerBlock`` relies on constant propagation —
    # verify this compiles on the targeted numba version.
    block_boxes = cuda.shared.array(shape=(threadsPerBlock, 5), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx, 0] = dev_boxes[dev_box_idx, 0]
        block_boxes[tx, 1] = dev_boxes[dev_box_idx, 1]
        block_boxes[tx, 2] = dev_boxes[dev_box_idx, 2]
        block_boxes[tx, 3] = dev_boxes[dev_box_idx, 3]
        block_boxes[tx, 4] = dev_boxes[dev_box_idx, 4]
    # Barrier: column tile fully staged before any comparison.
    cuda.syncthreads()
    if cuda.threadIdx.x < row_size:
        cur_box_idx = threadsPerBlock * row_start + cuda.threadIdx.x
        t = 0
        start = 0
        if row_start == col_start:
            # Diagonal tile: only compare against later boxes.
            start = tx + 1
        for i in range(start, col_size):
            if iou_device(dev_boxes[cur_box_idx], block_boxes[i]) > nms_overlap_thresh:
                t |= 1 << i
        # Ceil division: number of 64-box column tiles.
        col_blocks = n_boxes // threadsPerBlock + (
            n_boxes % threadsPerBlock > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t
示例11: nms_kernel
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """CUDA kernel: NMS bitmask tile over a flat box buffer.

    ``dev_boxes`` stores 5 floats per box; only the first 4 reach
    ``iou_device`` (presumably corner coordinates, with the 5th a score —
    verify against the device helper). Per row box, a 64-bit suppression
    mask covering one column tile is written into ``dev_mask``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Barrier: shared tile complete before comparisons begin.
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        t = 0
        start = 0
        if row_start == col_start:
            # Diagonal tile: skip self and earlier boxes.
            start = tx + 1
        for i in range(start, col_size):
            iou = iou_device(
                dev_boxes[cur_box_idx * 5 : cur_box_idx * 5 + 4],
                block_boxes[i * 5 : i * 5 + 4],
            )
            if iou > nms_overlap_thresh:
                t |= 1 << i
        # Ceil division: number of 64-box column tiles per row.
        col_blocks = n_boxes // threadsPerBlock + (
            n_boxes % threadsPerBlock > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t
示例12: rotate_nms_kernel
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """CUDA kernel: rotated-NMS bitmask tile over a flat box buffer.

    ``dev_boxes`` stores 6 floats per box; only the first 5 reach
    ``devRotateIoU`` (presumably geometry, the 6th a score — confirm
    against the caller). Per row box, a 64-bit suppression mask for one
    64-box column tile is written into ``dev_mask``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    # Shared staging: 64 boxes x 6 floats each.
    block_boxes = cuda.shared.array(shape=(64 * 6,), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx * 6 + 0] = dev_boxes[dev_box_idx * 6 + 0]
        block_boxes[tx * 6 + 1] = dev_boxes[dev_box_idx * 6 + 1]
        block_boxes[tx * 6 + 2] = dev_boxes[dev_box_idx * 6 + 2]
        block_boxes[tx * 6 + 3] = dev_boxes[dev_box_idx * 6 + 3]
        block_boxes[tx * 6 + 4] = dev_boxes[dev_box_idx * 6 + 4]
        block_boxes[tx * 6 + 5] = dev_boxes[dev_box_idx * 6 + 5]
    # Barrier: column tile fully staged before comparisons start.
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        t = 0
        start = 0
        if row_start == col_start:
            # Diagonal tile: only compare against later boxes.
            start = tx + 1
        for i in range(start, col_size):
            iou = devRotateIoU(
                dev_boxes[cur_box_idx * 6 : cur_box_idx * 6 + 5],
                block_boxes[i * 6 : i * 6 + 5],
            )
            if iou > nms_overlap_thresh:
                t |= 1 << i
        # Ceil division: number of 64-box column tiles per row.
        col_blocks = n_boxes // threadsPerBlock + (
            n_boxes % threadsPerBlock > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t
示例13: rotate_iou_kernel
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel(N, K, dev_boxes, dev_query_boxes, dev_iou):
    """CUDA kernel: rotated-box IoU between every box and query box.

    Flat buffers, 5 floats per box; ``dev_iou`` receives N * K values.
    One 64-thread block fills one 64x64 tile via ``devRotateIoU``
    (defined elsewhere in this file).
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Clamp tile extents at the matrix boundary.
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Barrier: staging complete before any thread reads the tile.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (
                row_start * threadsPerBlock * K
                + col_start * threadsPerBlock
                + tx * K
                + i
            )
            dev_iou[offset] = devRotateIoU(
                block_qboxes[i * 5 : i * 5 + 5], block_boxes[tx * 5 : tx * 5 + 5]
            )
示例14: rotate_iou_kernel_eval
# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    """CUDA kernel: rotated-box IoU matrix with a selectable criterion.

    Flat buffers, 5 floats per box; ``dev_iou`` receives N * K values.
    One 64-thread block fills one 64x64 tile; ``criterion`` is forwarded
    to ``devRotateIoUEval`` (defined elsewhere in this file).
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Edge tiles may be smaller than 64x64.
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # Barrier: the whole tile must be staged before any thread reads it.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (
                row_start * threadsPerBlock * K
                + col_start * threadsPerBlock
                + tx * K
                + i
            )
            dev_iou[offset] = devRotateIoUEval(
                block_qboxes[i * 5 : i * 5 + 5],
                block_boxes[tx * 5 : tx * 5 + 5],
                criterion,
            )