当前位置: 首页>>代码示例>>Python>>正文


Python cuda.syncthreads方法代码示例

本文整理汇总了Python中numba.cuda.syncthreads方法的典型用法代码示例。如果您正苦于以下问题:Python cuda.syncthreads方法的具体用法?Python cuda.syncthreads怎么用?Python cuda.syncthreads使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在numba.cuda的用法示例。


在下文中一共展示了cuda.syncthreads方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: rotate_iou_kernel_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    """Fill one 64 x 64 tile of the N x K score matrix via ``devRotateIoUEval``.

    The block at ``(blockIdx.x, blockIdx.y)`` covers up to 64 consecutive
    entries of ``dev_boxes`` (rows) and up to 64 consecutive entries of
    ``dev_query_boxes`` (columns). Both inputs are indexed as flat arrays with
    five consecutive values per box. The boxes of the tile are staged in shared
    memory, the block synchronises, and thread ``tx`` then scores its own box
    against every staged query box.

    Scores land in the flat ``dev_iou`` array at ``row * K + col``.
    ``criterion`` is forwarded unchanged to ``devRotateIoUEval``.
    """
    tile = 8 * 8
    tx = cuda.threadIdx.x
    tile_row = cuda.blockIdx.x
    tile_col = cuda.blockIdx.y
    # The last tile along each axis may be only partially filled.
    rows_here = min(N - tile_row * tile, tile)
    cols_here = min(K - tile_col * tile, tile)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    box_idx = tile * tile_row + tx
    qbox_idx = tile * tile_col + tx
    if tx < cols_here:
        for k in range(5):
            block_qboxes[tx * 5 + k] = dev_query_boxes[qbox_idx * 5 + k]
    if tx < rows_here:
        for k in range(5):
            block_boxes[tx * 5 + k] = dev_boxes[box_idx * 5 + k]
    # Every thread of the block must reach the barrier, so it stays outside
    # the conditionals.
    cuda.syncthreads()
    if tx < rows_here:
        own_box = block_boxes[tx * 5:tx * 5 + 5]
        # Flat index of the first output element this thread writes.
        row_base = tile_row * tile * K + tile_col * tile + tx * K
        for i in range(cols_here):
            dev_iou[row_base + i] = devRotateIoUEval(
                block_qboxes[i * 5:i * 5 + 5], own_box, criterion)
开发者ID:traveller59,项目名称:kitti-object-eval-python,代码行数:32,代码来源:rotate_iou.py

示例2: rotate_iou_kernel_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou,
                           criterion=-1):
    """Score a 64 x 64 tile of box / query-box pairs with ``devRotateIoUEval``.

    ``blockIdx.x`` selects the tile of rows (entries of ``dev_boxes``) and
    ``blockIdx.y`` the tile of columns (entries of ``dev_query_boxes``); both
    arrays are read as flat buffers holding five values per box. After the
    tile's boxes are copied to shared memory and the block has synchronised,
    each thread below ``row_size`` writes one row of the tile into the flat
    ``dev_iou`` buffer, which is addressed as ``row * K + col``.

    ``criterion`` is passed through to ``devRotateIoUEval`` untouched.
    """
    block_width = 8 * 8
    box_len = 5
    row_block = cuda.blockIdx.x
    col_block = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    n_rows = min(N - row_block * block_width, block_width)
    n_cols = min(K - col_block * block_width, block_width)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    shared_off = tx * box_len
    if tx < n_cols:
        q_src = (block_width * col_block + tx) * box_len
        for k in range(box_len):
            block_qboxes[shared_off + k] = dev_query_boxes[q_src + k]
    if tx < n_rows:
        b_src = (block_width * row_block + tx) * box_len
        for k in range(box_len):
            block_boxes[shared_off + k] = dev_boxes[b_src + k]
    cuda.syncthreads()
    # No barrier follows, so surplus threads can simply leave here.
    if tx >= n_rows:
        return
    for i in range(n_cols):
        q_off = i * box_len
        offset = (row_block * block_width * K + col_block * block_width
                  + tx * K + i)
        dev_iou[offset] = devRotateIoUEval(
            block_qboxes[q_off:q_off + 5],
            block_boxes[shared_off:shared_off + 5],
            criterion)
开发者ID:ucbdrive,项目名称:3d-vehicle-tracking,代码行数:35,代码来源:rotate_iou.py

示例3: rotate_iou_kernel_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    """Evaluate ``dev_rotate_iou_eval`` for one 64 x 64 tile of box pairs.

    Rows of the tile come from ``dev_boxes`` (tile index ``blockIdx.x``) and
    columns from ``dev_query_boxes`` (tile index ``blockIdx.y``). Each box
    occupies five consecutive values of its flat input array. The kernel
    stages both sets of boxes in shared memory, waits at a block-wide barrier,
    then lets thread ``tx`` compute the scores of row ``tx`` of the tile.

    The output ``dev_iou`` is written as a flat buffer at ``row * K + col``;
    ``criterion`` is handed to ``dev_rotate_iou_eval`` as its third argument.
    """
    span = 8 * 8
    tx = cuda.threadIdx.x
    by = cuda.blockIdx.y
    bx = cuda.blockIdx.x
    valid_rows = min(N - bx * span, span)
    valid_cols = min(K - by * span, span)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    global_q = span * by + tx
    global_b = span * bx + tx
    lo = tx * 5
    hi = lo + 5
    if tx < valid_cols:
        for k in range(5):
            block_qboxes[lo + k] = dev_query_boxes[global_q * 5 + k]
    if tx < valid_rows:
        for k in range(5):
            block_boxes[lo + k] = dev_boxes[global_b * 5 + k]
    cuda.syncthreads()
    if tx < valid_rows:
        for i in range(valid_cols):
            score = dev_rotate_iou_eval(
                block_qboxes[i * 5:i * 5 + 5], block_boxes[lo:hi], criterion
            )
            dev_iou[bx * span * K + by * span + tx * K + i] = score
开发者ID:mit-han-lab,项目名称:pvcnn,代码行数:33,代码来源:iou.py

示例4: nms_kernel_v2

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel_v2(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """Build the overlap bitmask for one 64 x 64 tile of box pairs.

    ``blockIdx.y`` selects the tile of "row" boxes and ``blockIdx.x`` the tile
    of "column" boxes. ``dev_boxes`` is indexed as a 2-D array whose rows hold
    at least five values. The column boxes are staged in a ``(64, 5)`` shared
    array; after the barrier, thread ``tx`` compares its row box with each
    staged column box and sets bit ``i`` of its mask word whenever
    ``iou_device`` returns more than ``nms_overlap_thresh``.

    On diagonal tiles (``row_start == col_start``) a box is only compared with
    the boxes that follow it in the tile. The mask word is stored in the flat
    ``dev_mask`` buffer at ``box_index * col_blocks + col_start``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    # The last tile along each axis may be only partially filled.
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(
        shape=(threadsPerBlock, 5), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        for k in range(5):
            block_boxes[tx, k] = dev_boxes[dev_box_idx, k]
    # Barrier is kept outside any branch so every thread of the block hits it.
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        # Loop-invariant: the row box is the same for every column.
        cur_box = dev_boxes[cur_box_idx]
        t = 0
        start = 0
        if row_start == col_start:
            start = tx + 1
        for i in range(start, col_size):
            if iou_device(cur_box, block_boxes[i]) > nms_overlap_thresh:
                t |= 1 << i
        # Number of column tiles, i.e. ceil(n_boxes / threadsPerBlock).
        col_blocks = ((n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0))
        dev_mask[cur_box_idx * col_blocks + col_start] = t
开发者ID:traveller59,项目名称:second.pytorch,代码行数:34,代码来源:nms_gpu.py

示例5: nms_kernel

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """Compute one 64 x 64 tile of the pairwise overlap bitmask.

    ``dev_boxes`` is read as a flat array with five values per box. The boxes
    of the column tile (``blockIdx.x``) are copied to shared memory; after the
    block-wide barrier, thread ``tx`` takes the box of the row tile
    (``blockIdx.y``) it is responsible for and passes the first four values of
    that box and of each staged box to ``iou_device``. Bit ``i`` of the
    thread's mask word is set when the result exceeds ``nms_overlap_thresh``.

    For diagonal tiles only boxes after ``tx`` are tested. The word is written
    to the flat ``dev_mask`` buffer at ``box_index * col_blocks + col_tile``.
    """
    tile = 8 * 8
    tx = cuda.threadIdx.x
    tile_row = cuda.blockIdx.y
    tile_col = cuda.blockIdx.x
    rows_here = min(n_boxes - tile_row * tile, tile)
    cols_here = min(n_boxes - tile_col * tile, tile)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    src = (tile * tile_col + tx) * 5
    if tx < cols_here:
        for k in range(5):
            block_boxes[tx * 5 + k] = dev_boxes[src + k]
    cuda.syncthreads()
    if tx < rows_here:
        cur_box_idx = tile * tile_row + tx
        first = tx + 1 if tile_row == tile_col else 0
        bits = 0
        for i in range(first, cols_here):
            overlap = iou_device(
                dev_boxes[cur_box_idx * 5:cur_box_idx * 5 + 4],
                block_boxes[i * 5:i * 5 + 4])
            if overlap > nms_overlap_thresh:
                bits |= 1 << i
        # Number of column tiles: n_boxes / tile rounded up.
        col_blocks = n_boxes // tile
        if n_boxes % tile > 0:
            col_blocks += 1
        dev_mask[cur_box_idx * col_blocks + tile_col] = bits
开发者ID:traveller59,项目名称:second.pytorch,代码行数:33,代码来源:nms_gpu.py

示例6: rotate_nms_kernel

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """Compute one 64 x 64 tile of the overlap bitmask using ``devRotateIoU``.

    ``dev_boxes`` is read as a flat array with six values per box. All six
    values of each column-tile box (``blockIdx.x``) are staged in shared
    memory, but only the first five of each box are passed to
    ``devRotateIoU``. After the barrier, thread ``tx`` compares its row-tile
    box (``blockIdx.y``) with the staged boxes and sets bit ``i`` of its mask
    word whenever the returned value exceeds ``nms_overlap_thresh``.

    On diagonal tiles the comparison starts at ``tx + 1``. The mask word goes
    to the flat ``dev_mask`` buffer at ``box_index * col_blocks + col_tile``.
    """
    tile = 8 * 8
    stride = 6
    tx = cuda.threadIdx.x
    tile_row = cuda.blockIdx.y
    tile_col = cuda.blockIdx.x
    rows_here = min(n_boxes - tile_row * tile, tile)
    cols_here = min(n_boxes - tile_col * tile, tile)
    block_boxes = cuda.shared.array(shape=(64 * 6, ), dtype=numba.float32)
    src_box = tile * tile_col + tx
    if tx < cols_here:
        for k in range(stride):
            block_boxes[tx * stride + k] = dev_boxes[src_box * stride + k]
    cuda.syncthreads()
    if tx < rows_here:
        cur_box_idx = tile * tile_row + tx
        cur_off = cur_box_idx * stride
        first = tx + 1 if tile_row == tile_col else 0
        bits = 0
        for i in range(first, cols_here):
            other_off = i * stride
            overlap = devRotateIoU(dev_boxes[cur_off:cur_off + 5],
                                   block_boxes[other_off:other_off + 5])
            if overlap > nms_overlap_thresh:
                bits |= 1 << i
        # Number of column tiles: n_boxes / tile rounded up.
        col_blocks = n_boxes // tile
        if n_boxes % tile > 0:
            col_blocks += 1
        dev_mask[cur_box_idx * col_blocks + tile_col] = bits
开发者ID:traveller59,项目名称:second.pytorch,代码行数:35,代码来源:nms_gpu.py

示例7: rotate_iou_kernel

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel(N, K, dev_boxes, dev_query_boxes, dev_iou):
    """Fill one 64 x 64 tile of the N x K score matrix via ``devRotateIoU``.

    ``blockIdx.x`` picks the tile of ``dev_boxes`` entries (rows) and
    ``blockIdx.y`` the tile of ``dev_query_boxes`` entries (columns). Both
    inputs are flat arrays with five values per box. The tile's boxes are
    copied into shared memory, the block synchronises, and thread ``tx`` then
    calls ``devRotateIoU(query_box, box)`` for every staged query box.

    Results are stored in the flat ``dev_iou`` buffer at ``row * K + col``.
    """
    tile = 8 * 8
    tx = cuda.threadIdx.x
    tile_row = cuda.blockIdx.x
    tile_col = cuda.blockIdx.y
    rows_here = min(N - tile_row * tile, tile)
    cols_here = min(K - tile_col * tile, tile)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    q_src = (tile * tile_col + tx) * 5
    b_src = (tile * tile_row + tx) * 5
    dst = tx * 5
    if tx < cols_here:
        for k in range(5):
            block_qboxes[dst + k] = dev_query_boxes[q_src + k]
    if tx < rows_here:
        for k in range(5):
            block_boxes[dst + k] = dev_boxes[b_src + k]
    # Shared-memory writes must be complete before any thread reads them.
    cuda.syncthreads()
    if tx < rows_here:
        row_base = tile_row * tile * K + tile_col * tile + tx * K
        for i in range(cols_here):
            dev_iou[row_base + i] = devRotateIoU(block_qboxes[i * 5:i * 5 + 5],
                                                 block_boxes[dst:dst + 5])
开发者ID:traveller59,项目名称:second.pytorch,代码行数:32,代码来源:nms_gpu.py

示例8: rotate_iou_kernel_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou,
                           criterion=-1):
    """Compute a 64 x 64 tile of ``devRotateIoUEval`` scores.

    The tile's rows are entries of ``dev_boxes`` chosen by ``blockIdx.x``; its
    columns are entries of ``dev_query_boxes`` chosen by ``blockIdx.y``. Both
    arrays are flat with five values per box. Boxes are cached in shared
    memory before a block-wide barrier; afterwards each thread with
    ``tx < row_size`` produces one row of the tile.

    Output goes to the flat ``dev_iou`` buffer at ``row * K + col``, and
    ``criterion`` is forwarded to ``devRotateIoUEval`` as given.
    """
    per_block = 8 * 8
    tx = cuda.threadIdx.x
    blk_r = cuda.blockIdx.x
    blk_c = cuda.blockIdx.y
    r_count = min(N - blk_r * per_block, per_block)
    c_count = min(K - blk_c * per_block, per_block)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    query_row = per_block * blk_c + tx
    box_row = per_block * blk_r + tx
    if tx < c_count:
        k = 0
        while k < 5:
            block_qboxes[tx * 5 + k] = dev_query_boxes[query_row * 5 + k]
            k += 1
    if tx < r_count:
        k = 0
        while k < 5:
            block_boxes[tx * 5 + k] = dev_boxes[box_row * 5 + k]
            k += 1
    cuda.syncthreads()
    # Safe to leave early: there is no further barrier below.
    if tx >= r_count:
        return
    mine = block_boxes[tx * 5:tx * 5 + 5]
    for i in range(c_count):
        offset = blk_r * per_block * K + blk_c * per_block + tx * K + i
        dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                           mine,
                                           criterion)
开发者ID:traveller59,项目名称:second.pytorch,代码行数:38,代码来源:nms_gpu.py

示例9: mat3_mul_kernel

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def mat3_mul_kernel(m0, m1, out):
    """Call ``mat3_mul`` on the ``(i, j)`` sub-matrices of ``m0``, ``m1``, ``out``.

    Each thread takes its 2-D grid position ``(i, j)``, slices the two trailing
    axes of the three 4-D arrays at that position and passes the three views
    to ``mat3_mul`` (``out``'s view is the third argument, presumably the
    destination — confirm against ``mat3_mul``).

    Threads whose position falls outside the first two axes of ``out`` return
    immediately, so the launch grid may be rounded up beyond the array extent
    without indexing out of range.
    NOTE(review): assumes m0 and m1 have the same leading two dimensions as
    out — confirm against the caller.
    """
    i, j = cuda.grid(2)
    # Guard against surplus threads when the grid overshoots the data.
    if i >= out.shape[0] or j >= out.shape[1]:
        return
    m0_mat = m0[i, j, :, :]
    m1_mat = m1[i, j, :, :]
    out_mat = out[i, j, :, :]
    mat3_mul(m0_mat, m1_mat, out_mat)
开发者ID:sfu-gruvi-3dv,项目名称:sanet_relocal_demo,代码行数:9,代码来源:numba_mat_opt.py

示例10: nms_kernel_v2

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel_v2(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """Build the overlap bitmask for one 64 x 64 tile of box pairs.

    ``blockIdx.y`` selects the tile of "row" boxes and ``blockIdx.x`` the tile
    of "column" boxes. ``dev_boxes`` is indexed as a 2-D array whose rows hold
    at least five values. The column boxes are staged in a ``(64, 5)`` shared
    array; after the barrier, thread ``tx`` compares its row box with each
    staged column box and sets bit ``i`` of its mask word whenever
    ``iou_device`` returns more than ``nms_overlap_thresh``.

    On diagonal tiles (``row_start == col_start``) a box is only compared with
    the boxes that follow it in the tile. The mask word is stored in the flat
    ``dev_mask`` buffer at ``box_index * col_blocks + col_start``.
    """
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    # The last tile along each axis may be only partially filled.
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(threadsPerBlock, 5), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        for k in range(5):
            block_boxes[tx, k] = dev_boxes[dev_box_idx, k]
    # Barrier is kept outside any branch so every thread of the block hits it.
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        # Loop-invariant: the row box is the same for every column.
        cur_box = dev_boxes[cur_box_idx]
        t = 0
        start = 0
        if row_start == col_start:
            start = tx + 1
        for i in range(start, col_size):
            if iou_device(cur_box, block_boxes[i]) > nms_overlap_thresh:
                t |= 1 << i
        # Number of column tiles, i.e. ceil(n_boxes / threadsPerBlock).
        col_blocks = (n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t
开发者ID:poodarchu,项目名称:Det3D,代码行数:33,代码来源:nms_gpu.py

示例11: nms_kernel

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """Produce the overlap bitmask words for one 64 x 64 tile of box pairs.

    ``dev_boxes`` is treated as a flat array with five values per box. Boxes
    of the column tile (``blockIdx.x``) are cached in shared memory. Once the
    block has synchronised, each thread with ``tx < row_size`` feeds the first
    four values of its row-tile box (``blockIdx.y``) and of every cached box
    to ``iou_device`` and sets bit ``i`` of its mask word when the result is
    above ``nms_overlap_thresh``.

    Diagonal tiles skip boxes at or before ``tx``. Each word is written to the
    flat ``dev_mask`` buffer at ``box_index * col_blocks + col_tile``.
    """
    width = 8 * 8
    blk_row = cuda.blockIdx.y
    blk_col = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    n_row = min(n_boxes - blk_row * width, width)
    n_col = min(n_boxes - blk_col * width, width)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    load_idx = width * blk_col + tx
    if tx < n_col:
        for k in range(5):
            block_boxes[tx * 5 + k] = dev_boxes[load_idx * 5 + k]
    cuda.syncthreads()
    # No barrier follows, so threads without a row box can stop here.
    if tx >= n_row:
        return
    cur_box_idx = width * blk_row + tx
    own_lo = cur_box_idx * 5
    begin = 0
    if blk_row == blk_col:
        begin = tx + 1
    word = 0
    for i in range(begin, n_col):
        other_lo = i * 5
        iou = iou_device(
            dev_boxes[own_lo : own_lo + 4],
            block_boxes[other_lo : other_lo + 4],
        )
        if iou > nms_overlap_thresh:
            word |= 1 << i
    # Number of column tiles: n_boxes / width rounded up.
    col_blocks = n_boxes // width
    if n_boxes % width > 0:
        col_blocks += 1
    dev_mask[cur_box_idx * col_blocks + blk_col] = word
开发者ID:poodarchu,项目名称:Det3D,代码行数:36,代码来源:nms_gpu.py

示例12: rotate_nms_kernel

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    """Produce ``devRotateIoU``-based bitmask words for one 64 x 64 tile.

    ``dev_boxes`` is treated as a flat array with six values per box. The six
    values of every column-tile box (``blockIdx.x``) are cached in shared
    memory, while only the first five values of a box are handed to
    ``devRotateIoU``. After the barrier, each thread with ``tx < row_size``
    tests its row-tile box (``blockIdx.y``) against the cached boxes and sets
    bit ``i`` when the result is above ``nms_overlap_thresh``.

    Diagonal tiles start the comparison at ``tx + 1``. Each word is written to
    the flat ``dev_mask`` buffer at ``box_index * col_blocks + col_tile``.
    """
    width = 8 * 8
    blk_row = cuda.blockIdx.y
    blk_col = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    n_row = min(n_boxes - blk_row * width, width)
    n_col = min(n_boxes - blk_col * width, width)
    block_boxes = cuda.shared.array(shape=(64 * 6,), dtype=numba.float32)
    load_idx = width * blk_col + tx
    if tx < n_col:
        for k in range(6):
            block_boxes[tx * 6 + k] = dev_boxes[load_idx * 6 + k]
    cuda.syncthreads()
    # No barrier follows, so threads without a row box can stop here.
    if tx >= n_row:
        return
    cur_box_idx = width * blk_row + tx
    begin = 0
    if blk_row == blk_col:
        begin = tx + 1
    word = 0
    for i in range(begin, n_col):
        iou = devRotateIoU(
            dev_boxes[cur_box_idx * 6 : cur_box_idx * 6 + 5],
            block_boxes[i * 6 : i * 6 + 5],
        )
        if iou > nms_overlap_thresh:
            word |= 1 << i
    # Number of column tiles: n_boxes / width rounded up.
    col_blocks = n_boxes // width
    if n_boxes % width > 0:
        col_blocks += 1
    dev_mask[cur_box_idx * col_blocks + blk_col] = word
开发者ID:poodarchu,项目名称:Det3D,代码行数:38,代码来源:nms_gpu.py

示例13: rotate_iou_kernel

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel(N, K, dev_boxes, dev_query_boxes, dev_iou):
    """Compute a 64 x 64 tile of ``devRotateIoU`` scores.

    Rows of the tile are entries of ``dev_boxes`` (tile index ``blockIdx.x``)
    and columns are entries of ``dev_query_boxes`` (tile index
    ``blockIdx.y``); both are flat arrays with five values per box. The
    kernel caches the tile's boxes in shared memory, synchronises the block,
    and has thread ``tx`` evaluate ``devRotateIoU(query_box, box)`` for each
    cached query box.

    Each score is written to the flat ``dev_iou`` buffer at ``row * K + col``.
    """
    width = 8 * 8
    blk_row = cuda.blockIdx.x
    blk_col = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    n_row = min(N - blk_row * width, width)
    n_col = min(K - blk_col * width, width)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    q_global = width * blk_col + tx
    b_global = width * blk_row + tx
    if tx < n_col:
        for k in range(5):
            block_qboxes[tx * 5 + k] = dev_query_boxes[q_global * 5 + k]
    if tx < n_row:
        for k in range(5):
            block_boxes[tx * 5 + k] = dev_boxes[b_global * 5 + k]
    cuda.syncthreads()
    # No barrier follows, so threads without a row box can stop here.
    if tx >= n_row:
        return
    own = block_boxes[tx * 5 : tx * 5 + 5]
    # Flat index of the first element of this thread's output row segment.
    first_out = blk_row * width * K + blk_col * width + tx * K
    for i in range(n_col):
        dev_iou[first_out + i] = devRotateIoU(block_qboxes[i * 5 : i * 5 + 5], own)
开发者ID:poodarchu,项目名称:Det3D,代码行数:38,代码来源:nms_gpu.py

示例14: rotate_iou_kernel_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import syncthreads [as 别名]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    """Compute a 64 x 64 tile of ``devRotateIoUEval`` scores.

    Rows of the tile are entries of ``dev_boxes`` (tile index ``blockIdx.x``)
    and columns are entries of ``dev_query_boxes`` (tile index
    ``blockIdx.y``); both are flat arrays with five values per box. The
    kernel caches the tile's boxes in shared memory, synchronises the block,
    and has thread ``tx`` evaluate
    ``devRotateIoUEval(query_box, box, criterion)`` for each cached query box.

    Each score is written to the flat ``dev_iou`` buffer at ``row * K + col``.
    ``criterion`` is passed along without modification.
    """
    width = 8 * 8
    box_len = 5
    blk_row = cuda.blockIdx.x
    blk_col = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    n_row = min(N - blk_row * width, width)
    n_col = min(K - blk_col * width, width)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    local = tx * box_len
    if tx < n_col:
        q_base = (width * blk_col + tx) * box_len
        for k in range(box_len):
            block_qboxes[local + k] = dev_query_boxes[q_base + k]
    if tx < n_row:
        b_base = (width * blk_row + tx) * box_len
        for k in range(box_len):
            block_boxes[local + k] = dev_boxes[b_base + k]
    # All shared-memory writes must land before any thread reads them.
    cuda.syncthreads()
    if tx < n_row:
        first_out = blk_row * width * K + blk_col * width + tx * K
        for i in range(n_col):
            q_lo = i * box_len
            dev_iou[first_out + i] = devRotateIoUEval(
                block_qboxes[q_lo : q_lo + 5],
                block_boxes[local : local + 5],
                criterion,
            )
开发者ID:poodarchu,项目名称:Det3D,代码行数:40,代码来源:nms_gpu.py


注:本文中的numba.cuda.syncthreads方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。