Python cuda.syncthreads Method Code Examples

This article collects typical usage examples of the numba.cuda.syncthreads method in Python. If you are wondering how to call cuda.syncthreads, what it is used for, or what working examples look like, the curated code samples below may help. You can also explore further usage examples from the numba.cuda module it belongs to.


The following presents 14 code examples of the cuda.syncthreads method, ordered by popularity.
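
Before the collected examples, here is a minimal, self-contained sketch of the pattern all of them rely on: each thread writes into a block-shared buffer, cuda.syncthreads() acts as a barrier so every write becomes visible block-wide, and only then do threads read data written by their neighbours. This sketch is not taken from any of the projects below; the kernel name, tile size, and launch configuration are illustrative assumptions.

import numpy as np
from numba import cuda, float32

TPB = 64  # threads per block (illustrative choice)

@cuda.jit
def block_reverse(src, dst):
    # Each block reverses its own 64-element tile of `src` via shared memory.
    tile = cuda.shared.array(shape=TPB, dtype=float32)
    tx = cuda.threadIdx.x
    i = cuda.blockIdx.x * TPB + tx
    if i < src.size:
        tile[tx] = src[i]
    # Barrier: all writes to `tile` must complete before any thread reads it.
    cuda.syncthreads()
    j = cuda.blockIdx.x * TPB + (TPB - 1 - tx)
    if i < src.size and j < src.size:
        dst[j] = tile[tx]

x = np.arange(256, dtype=np.float32)
out = np.zeros_like(x)
blocks = (x.size + TPB - 1) // TPB
block_reverse[blocks, TPB](x, out)  # kernel[griddim, blockdim](args)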

Example 1: rotate_iou_kernel_eval

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if (tx < col_size):
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if (tx < row_size):
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = row_start * threadsPerBlock * K + col_start * threadsPerBlock + tx * K + i
            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                               block_boxes[tx * 5:tx * 5 + 5], criterion)
Author: traveller59 | Project: kitti-object-eval-python | Lines: 32 | Source: rotate_iou.py
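
For context (not part of the excerpt above): a kernel with this structure is launched on a 2-D grid, with blockIdx.x tiling the N boxes, blockIdx.y tiling the K query boxes, and 64 threads per block to match threadsPerBlock; the kernel itself must be compiled with @cuda.jit, which the excerpt omits. The host-side wrapper below is a hedged sketch under assumed array layouts ((N, 5) / (K, 5) float32 boxes flattened to 1-D); the project's own wrapper in rotate_iou.py may differ in details.

import math
import numpy as np
from numba import cuda

def rotate_iou_gpu_eval_sketch(boxes, query_boxes, criterion=-1, device_id=0):
    # Illustrative host wrapper: boxes is (N, 5), query_boxes is (K, 5), float32.
    threads_per_block = 64
    N, K = boxes.shape[0], query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    cuda.select_device(device_id)
    blocks_per_grid = (math.ceil(N / threads_per_block),
                       math.ceil(K / threads_per_block))
    stream = cuda.stream()
    with stream.auto_synchronize():
        # Flatten to 1-D because the kernel indexes boxes as box_idx * 5 + k.
        boxes_dev = cuda.to_device(boxes.reshape(-1).astype(np.float32), stream)
        query_dev = cuda.to_device(query_boxes.reshape(-1).astype(np.float32), stream)
        iou_dev = cuda.to_device(iou.reshape(-1), stream)
        rotate_iou_kernel_eval[blocks_per_grid, threads_per_block, stream](
            N, K, boxes_dev, query_dev, iou_dev, criterion)
        iou_dev.copy_to_host(iou.reshape(-1), stream=stream)
    return iou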

Example 2: rotate_iou_kernel_eval

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou,
                           criterion=-1):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if (tx < col_size):
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if (tx < row_size):
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = row_start * threadsPerBlock * K + col_start * \
                     threadsPerBlock + tx * K + i
            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                               block_boxes[tx * 5:tx * 5 + 5],
                                               criterion) 
Author: ucbdrive | Project: 3d-vehicle-tracking | Lines: 35 | Source: rotate_iou.py

Example 3: rotate_iou_kernel_eval

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    threads_per_block = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threads_per_block, threads_per_block)
    col_size = min(K - col_start * threads_per_block, threads_per_block)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    dev_query_box_idx = threads_per_block * col_start + tx
    dev_box_idx = threads_per_block * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = row_start * threads_per_block * K + col_start * threads_per_block + tx * K + i
            dev_iou[offset] = dev_rotate_iou_eval(
                block_qboxes[i * 5:i * 5 + 5], block_boxes[tx * 5:tx * 5 + 5], criterion
            ) 
Author: mit-han-lab | Project: pvcnn | Lines: 33 | Source: iou.py

Example 4: nms_kernel_v2

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def nms_kernel_v2(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(
        shape=(threadsPerBlock, 5), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if (tx < col_size):
        block_boxes[tx, 0] = dev_boxes[dev_box_idx, 0]
        block_boxes[tx, 1] = dev_boxes[dev_box_idx, 1]
        block_boxes[tx, 2] = dev_boxes[dev_box_idx, 2]
        block_boxes[tx, 3] = dev_boxes[dev_box_idx, 3]
        block_boxes[tx, 4] = dev_boxes[dev_box_idx, 4]
    cuda.syncthreads()
    if (cuda.threadIdx.x < row_size):
        cur_box_idx = threadsPerBlock * row_start + cuda.threadIdx.x
        # cur_box = dev_boxes + cur_box_idx * 5;
        i = 0
        t = 0
        start = 0
        if (row_start == col_start):
            start = tx + 1
        for i in range(start, col_size):
            if (iou_device(dev_boxes[cur_box_idx], block_boxes[i]) >
                    nms_overlap_thresh):
                t |= 1 << i
        col_blocks = ((n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0))
        dev_mask[cur_box_idx * col_blocks + col_start] = t 
Author: traveller59 | Project: second.pytorch | Lines: 34 | Source: nms_gpu.py

Example 5: nms_kernel

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if (tx < col_size):
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if (tx < row_size):
        cur_box_idx = threadsPerBlock * row_start + tx
        # cur_box = dev_boxes + cur_box_idx * 5;
        t = 0
        start = 0
        if (row_start == col_start):
            start = tx + 1
        for i in range(start, col_size):
            iou = iou_device(dev_boxes[cur_box_idx * 5:cur_box_idx * 5 + 4],
                             block_boxes[i * 5:i * 5 + 4])
            if (iou > nms_overlap_thresh):
                t |= 1 << i
        col_blocks = ((n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0))
        dev_mask[cur_box_idx * col_blocks + col_start] = t 
Author: traveller59 | Project: second.pytorch | Lines: 33 | Source: nms_gpu.py
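
For context (not part of the excerpt): dev_mask stores one 64-bit word per (box, column-block) pair, where bit i means the box is suppressed by the i-th box of that column block; the host then walks the mask to pick the surviving boxes. The decoder below is a hedged sketch of that step; the function name and array layout are assumptions, and the project defines its own post-processing.

import numpy as np

def nms_mask_decode_sketch(mask_host, n_boxes, threads_per_block=64):
    # mask_host: flat uint64 array of shape (n_boxes * col_blocks,).
    col_blocks = (n_boxes + threads_per_block - 1) // threads_per_block
    remv = np.zeros(col_blocks, dtype=np.uint64)
    keep = []
    for i in range(n_boxes):
        nblock = i // threads_per_block
        inblock = i % threads_per_block
        if not (remv[nblock] & np.uint64(1 << inblock)):
            keep.append(i)
            # Box i survives: mark every box it suppresses.
            for j in range(nblock, col_blocks):
                remv[j] |= mask_host[i * col_blocks + j]
    return np.array(keep, dtype=np.int64)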

Example 6: rotate_nms_kernel

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 6, ), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if (tx < col_size):
        block_boxes[tx * 6 + 0] = dev_boxes[dev_box_idx * 6 + 0]
        block_boxes[tx * 6 + 1] = dev_boxes[dev_box_idx * 6 + 1]
        block_boxes[tx * 6 + 2] = dev_boxes[dev_box_idx * 6 + 2]
        block_boxes[tx * 6 + 3] = dev_boxes[dev_box_idx * 6 + 3]
        block_boxes[tx * 6 + 4] = dev_boxes[dev_box_idx * 6 + 4]
        block_boxes[tx * 6 + 5] = dev_boxes[dev_box_idx * 6 + 5]
    cuda.syncthreads()
    if (tx < row_size):
        cur_box_idx = threadsPerBlock * row_start + tx
        # cur_box = dev_boxes + cur_box_idx * 5;
        t = 0
        start = 0
        if (row_start == col_start):
            start = tx + 1
        for i in range(start, col_size):
            iou = devRotateIoU(dev_boxes[cur_box_idx * 6:cur_box_idx * 6 + 5],
                               block_boxes[i * 6:i * 6 + 5])
            # print('iou', iou, cur_box_idx, i)
            if (iou > nms_overlap_thresh):
                t |= 1 << i
        col_blocks = ((n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0))
        dev_mask[cur_box_idx * col_blocks + col_start] = t 
Author: traveller59 | Project: second.pytorch | Lines: 35 | Source: nms_gpu.py

Example 7: rotate_iou_kernel

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_iou_kernel(N, K, dev_boxes, dev_query_boxes, dev_iou):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if (tx < col_size):
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if (tx < row_size):
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = row_start * threadsPerBlock * K + col_start * threadsPerBlock + tx * K + i
            dev_iou[offset] = devRotateIoU(block_qboxes[i * 5:i * 5 + 5],
                                           block_boxes[tx * 5:tx * 5 + 5]) 
Author: traveller59 | Project: second.pytorch | Lines: 32 | Source: nms_gpu.py

Example 8: rotate_iou_kernel_eval

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_iou_kernel_eval(N,
                           K,
                           dev_boxes,
                           dev_query_boxes,
                           dev_iou,
                           criterion=-1):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if (tx < col_size):
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if (tx < row_size):
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = row_start * threadsPerBlock * K + col_start * threadsPerBlock + tx * K + i
            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                               block_boxes[tx * 5:tx * 5 + 5],
                                               criterion) 
Author: traveller59 | Project: second.pytorch | Lines: 38 | Source: nms_gpu.py

Example 9: mat3_mul_kernel

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def mat3_mul_kernel(m0, m1, out):
    i, j = cuda.grid(2)
    m0_mat = m0[i, j, :, :]
    m1_mat = m1[i, j, :, :]
    out_mat = out[i, j, :, :]
    mat3_mul(m0_mat, m1_mat, out_mat)
    # cuda.syncthreads() 
Author: sfu-gruvi-3dv | Project: sanet_relocal_demo | Lines: 9 | Source: numba_mat_opt.py

Example 10: nms_kernel_v2

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def nms_kernel_v2(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(threadsPerBlock, 5), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx, 0] = dev_boxes[dev_box_idx, 0]
        block_boxes[tx, 1] = dev_boxes[dev_box_idx, 1]
        block_boxes[tx, 2] = dev_boxes[dev_box_idx, 2]
        block_boxes[tx, 3] = dev_boxes[dev_box_idx, 3]
        block_boxes[tx, 4] = dev_boxes[dev_box_idx, 4]
    cuda.syncthreads()
    if cuda.threadIdx.x < row_size:
        cur_box_idx = threadsPerBlock * row_start + cuda.threadIdx.x
        # cur_box = dev_boxes + cur_box_idx * 5;
        i = 0
        t = 0
        start = 0
        if row_start == col_start:
            start = tx + 1
        for i in range(start, col_size):
            if iou_device(dev_boxes[cur_box_idx], block_boxes[i]) > nms_overlap_thresh:
                t |= 1 << i
        col_blocks = (n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t 
Author: poodarchu | Project: Det3D | Lines: 33 | Source: nms_gpu.py

Example 11: nms_kernel

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        # cur_box = dev_boxes + cur_box_idx * 5;
        t = 0
        start = 0
        if row_start == col_start:
            start = tx + 1
        for i in range(start, col_size):
            iou = iou_device(
                dev_boxes[cur_box_idx * 5 : cur_box_idx * 5 + 4],
                block_boxes[i * 5 : i * 5 + 4],
            )
            if iou > nms_overlap_thresh:
                t |= 1 << i
        col_blocks = (n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t 
Author: poodarchu | Project: Det3D | Lines: 36 | Source: nms_gpu.py

Example 12: rotate_nms_kernel

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 6,), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx * 6 + 0] = dev_boxes[dev_box_idx * 6 + 0]
        block_boxes[tx * 6 + 1] = dev_boxes[dev_box_idx * 6 + 1]
        block_boxes[tx * 6 + 2] = dev_boxes[dev_box_idx * 6 + 2]
        block_boxes[tx * 6 + 3] = dev_boxes[dev_box_idx * 6 + 3]
        block_boxes[tx * 6 + 4] = dev_boxes[dev_box_idx * 6 + 4]
        block_boxes[tx * 6 + 5] = dev_boxes[dev_box_idx * 6 + 5]
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        # cur_box = dev_boxes + cur_box_idx * 5;
        t = 0
        start = 0
        if row_start == col_start:
            start = tx + 1
        for i in range(start, col_size):
            iou = devRotateIoU(
                dev_boxes[cur_box_idx * 6 : cur_box_idx * 6 + 5],
                block_boxes[i * 6 : i * 6 + 5],
            )
            # print('iou', iou, cur_box_idx, i)
            if iou > nms_overlap_thresh:
                t |= 1 << i
        col_blocks = (n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t 
Author: poodarchu | Project: Det3D | Lines: 38 | Source: nms_gpu.py

Example 13: rotate_iou_kernel

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_iou_kernel(N, K, dev_boxes, dev_query_boxes, dev_iou):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (
                row_start * threadsPerBlock * K
                + col_start * threadsPerBlock
                + tx * K
                + i
            )
            dev_iou[offset] = devRotateIoU(
                block_qboxes[i * 5 : i * 5 + 5], block_boxes[tx * 5 : tx * 5 + 5]
            ) 
Author: poodarchu | Project: Det3D | Lines: 38 | Source: nms_gpu.py

Example 14: rotate_iou_kernel_eval

# Required import: from numba import cuda [as alias]
# Or: from numba.cuda import syncthreads [as alias]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (
                row_start * threadsPerBlock * K
                + col_start * threadsPerBlock
                + tx * K
                + i
            )
            dev_iou[offset] = devRotateIoUEval(
                block_qboxes[i * 5 : i * 5 + 5],
                block_boxes[tx * 5 : tx * 5 + 5],
                criterion,
            ) 
Author: poodarchu | Project: Det3D | Lines: 40 | Source: nms_gpu.py


Note: The numba.cuda.syncthreads examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Refer to each project's license before distributing or using the code; do not reproduce this article without permission.