Python cuda.stream方法代码示例

本文整理汇总了Python中numba.cuda.stream方法的典型用法代码示例。如果您正苦于以下问题：Python cuda.stream方法的具体用法？Python cuda.stream怎么用？Python cuda.stream使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类numba.cuda的用法示例。

在下文中一共展示了cuda.stream方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: setUp

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def setUp(self):
        """Create the host/device buffers and the cuRAND generator for a test.

        Two zero-filled host arrays of length ``self.N`` (float32 and float64)
        are copied to the device on a freshly created stream, and a
        ``CURAND_RNG_PSEUDO_DEFAULT`` generator is bound to that same stream
        with offset 123 and seed 1234.
        """
        from pyculib.rand.binding import CURAND_RNG_PSEUDO_DEFAULT, Generator

        self.N = 10
        host32 = np.zeros(self.N, dtype=np.float32)
        host64 = np.zeros(self.N, dtype=np.float64)
        self.ary32, self.ary64 = host32, host64

        # Every transfer and the generator share this one stream.
        work_stream = cuda.stream()
        self.stream = work_stream
        self.devary32 = cuda.to_device(host32, stream=work_stream)
        self.devary64 = cuda.to_device(host64, stream=work_stream)

        generator = self.rndgen = Generator(CURAND_RNG_PSEUDO_DEFAULT)
        generator.set_stream(work_stream)
        generator.set_offset(123)
        generator.set_pseudo_random_generator_seed(1234)

开发者ID:numba，项目名称:pyculib，代码行数:18，代码来源:test_rand.py

示例2: rotate_iou_gpu_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU with numba.cuda.

    Converted from [this project](
        https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
    The upstream note reports about 5 ms for one example.

    Args:
        boxes (float array: [N, 5]): rbboxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes compared against ``boxes``.
        criterion (int, optional): Defaults to -1. Forwarded unchanged to
            ``rotate_iou_kernel_eval``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        np.ndarray: ``[N, K]`` overlap matrix. The computation runs in float32;
        the result is cast back to the dtype of ``boxes`` when that dtype is
        floating point, and stays float32 otherwise. It is all zeros (and no
        GPU work is done) when ``N`` or ``K`` is 0.
    """
    # Remember the caller's dtype *before* `boxes` is rebound to float32.
    box_dtype = boxes.dtype
    # Casting IoU values to an integer dtype would truncate them, so only
    # floating dtypes are restored on return.
    out_dtype = box_dtype if np.issubdtype(box_dtype, np.floating) else np.float32
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou.astype(out_dtype)
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))

    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so `iou`
    # is fully written before it is returned.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
        # The flattened view shares memory with `iou`, so the copy lands in it.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou.astype(out_dtype)

开发者ID:traveller59，项目名称:kitti-object-eval-python，代码行数:38，代码来源:rotate_iou.py

示例3: rotate_iou_gpu_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Pairwise rotated-box IoU evaluated on the GPU through numba.cuda.

    Ported from
    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation.
    The upstream note reports about 5 ms for one example.

    Args:
        boxes (float array: [N, 5]): rotated boxes given as centers, dims and
            angle (clockwise when positive).
        query_boxes (float array: [K, 5]): boxes compared against ``boxes``.
        criterion (int, optional): Defaults to -1. Passed through to
            ``rotate_iou_kernel_eval``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        np.ndarray: float32 ``[N, K]`` overlap matrix; all zeros, with no GPU
        work done, when ``N`` or ``K`` is 0.
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    num_boxes, num_queries = boxes.shape[0], query_boxes.shape[0]
    overlaps = np.zeros((num_boxes, num_queries), dtype=np.float32)
    if not (num_boxes and num_queries):
        return overlaps

    threads = 8 * 8
    cuda.select_device(device_id)
    grid = (div_up(num_boxes, threads), div_up(num_queries, threads))

    # Flat view sharing memory with `overlaps`; the device result is copied
    # back through it.
    overlaps_flat = overlaps.reshape(-1)
    stream = cuda.stream()
    with stream.auto_synchronize():
        d_boxes = cuda.to_device(boxes.reshape(-1), stream)
        d_queries = cuda.to_device(query_boxes.reshape(-1), stream)
        d_overlaps = cuda.to_device(overlaps_flat, stream)
        launch = rotate_iou_kernel_eval[grid, threads, stream]
        launch(num_boxes, num_queries, d_boxes, d_queries, d_overlaps, criterion)
        d_overlaps.copy_to_host(overlaps_flat, stream=stream)
    return overlaps.astype(boxes.dtype)

开发者ID:ucbdrive，项目名称:3d-vehicle-tracking，代码行数:37，代码来源:rotate_iou.py

示例4: rotate_iou_gpu_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """
    Pairwise rotated-box IoU evaluated on the GPU through numba.cuda.
    Ported from https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation; the upstream
    note claims roughly 8x the CPU speed (about 5 ms in one example).
    :param boxes: rbboxes, format: centers, dims, angles(clockwise when positive), array [N, 5]
    :param query_boxes: boxes compared against ``boxes``, array [K, 5]
    :param criterion: optional, default: -1; passed through to ``rotate_iou_kernel_eval``
    :param device_id: int, optional, default: 0; CUDA device to select
    :return: float32 array [N, K] of overlaps; all zeros (no GPU work) when N or K is 0
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    n_boxes, n_queries = boxes.shape[0], query_boxes.shape[0]
    result = np.zeros((n_boxes, n_queries), dtype=np.float32)
    if not (n_boxes and n_queries):
        return result

    block_size = 8 * 8
    cuda.select_device(device_id)
    grid_size = (div_up(n_boxes, block_size), div_up(n_queries, block_size))

    # Flat view sharing memory with `result`; the device output is copied back through it.
    result_flat = result.reshape(-1)
    stream = cuda.stream()
    with stream.auto_synchronize():
        dev_boxes = cuda.to_device(boxes.reshape(-1), stream)
        dev_queries = cuda.to_device(query_boxes.reshape(-1), stream)
        dev_result = cuda.to_device(result_flat, stream)
        kernel = rotate_iou_kernel_eval[grid_size, block_size, stream]
        kernel(n_boxes, n_queries, dev_boxes, dev_queries, dev_result, criterion)
        dev_result.copy_to_host(result_flat, stream=stream)
    return result.astype(boxes.dtype)

开发者ID:mit-han-lab，项目名称:pvcnn，代码行数:32，代码来源:iou.py

示例5: nms_gpu

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run non-maximum suppression on the GPU.

    Args:
        dets (np.ndarray): 2-D detections array. Column 4 is read as the score;
            rows are handed to the kernel in descending score order.
            NOTE(review): columns 0-3 presumably hold the box -- confirm
            against ``nms_kernel``.
        nms_overlap_thresh: overlap threshold forwarded to ``nms_kernel``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        list: row indices into ``dets`` (np.int32 scalars) of the detections
        selected by ``nms_postprocess``; ``[]`` when ``dets`` has no rows.
    """
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # Nothing to suppress; also avoids launching a kernel on an empty grid.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 4]
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]

    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # One uint64 word per (box, column block) pair; presumably one bit per box
    # inside a 64-box block -- confirm against ``nms_kernel``.
    mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64)
    blockspergrid = (div_up(boxes_num, threadsPerBlock),
                     div_up(boxes_num, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so
    # `mask_host` is complete before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev)
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # `keep` indexes the score-sorted rows; map back to rows of `dets`.
    return list(order[keep])

开发者ID:traveller59，项目名称:second.pytorch，代码行数:37，代码来源:nms_gpu.py

示例6: rotate_nms_gpu

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run rotated-box non-maximum suppression on the GPU.

    WARNING (from the original author): this function gives the right result
    but its performance has not been tested.

    Args:
        dets (np.ndarray): 2-D detections array, cast to float32 internally.
            Column 5 is read as the score; rows are handed to the kernel in
            descending score order. NOTE(review): columns 0-4 presumably hold
            the rotated box -- confirm against ``rotate_nms_kernel``.
        nms_overlap_thresh: overlap threshold forwarded to
            ``rotate_nms_kernel``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        list: row indices into ``dets`` (np.int32 scalars) of the detections
        selected by ``nms_postprocess``; ``[]`` when ``dets`` has no rows.
    """
    dets = dets.astype(np.float32)
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # Nothing to suppress; also avoids launching a kernel on an empty grid.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 5]
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]

    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # mask_host holds boxes_num * col_blocks uint64 words.
    mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64)
    blockspergrid = (div_up(boxes_num, threadsPerBlock),
                     div_up(boxes_num, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so
    # `mask_host` is complete before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        rotate_nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev)
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # `keep` indexes the score-sorted rows; map back to rows of `dets`.
    return list(order[keep])

开发者ID:traveller59，项目名称:second.pytorch，代码行数:38，代码来源:nms_gpu.py

示例7: rotate_iou_gpu

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_iou_gpu(boxes, query_boxes, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU with numba.cuda.

    Converted from [this project](
        https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
    The upstream note reports about 5 ms for one example.

    Args:
        boxes (float array: [N, 5]): rbboxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes compared against ``boxes``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        np.ndarray: ``[N, K]`` overlap matrix. The computation runs in float32;
        the result is cast back to the dtype of ``boxes`` when that dtype is
        floating point, and stays float32 otherwise. It is all zeros (and no
        GPU work is done) when ``N`` or ``K`` is 0.
    """
    # Remember the caller's dtype *before* `boxes` is rebound to float32.
    box_dtype = boxes.dtype
    # Casting IoU values to an integer dtype would truncate them, so only
    # floating dtypes are restored on return.
    out_dtype = box_dtype if np.issubdtype(box_dtype, np.floating) else np.float32
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou.astype(out_dtype)
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))

    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so `iou`
    # is fully written before it is returned.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev)
        # The flattened view shares memory with `iou`, so the copy lands in it.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou.astype(out_dtype)

开发者ID:traveller59，项目名称:second.pytorch，代码行数:38，代码来源:nms_gpu.py

示例8: rotate_iou_gpu_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU with numba.cuda.

    Converted from [this project](
        https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
    The upstream note claims roughly 8x the CPU speed (about 5 ms in one
    example).

    Args:
        boxes (float array: [N, 5]): rbboxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes compared against ``boxes``.
        criterion (int, optional): Defaults to -1. Forwarded unchanged to
            ``rotate_iou_kernel_eval``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        np.ndarray: ``[N, K]`` overlap matrix. The computation runs in float32;
        the result is cast back to the dtype of ``boxes`` when that dtype is
        floating point, and stays float32 otherwise. It is all zeros (and no
        GPU work is done) when ``N`` or ``K`` is 0.
    """
    # Remember the caller's dtype *before* `boxes` is rebound to float32.
    box_dtype = boxes.dtype
    # Casting IoU values to an integer dtype would truncate them, so only
    # floating dtypes are restored on return.
    out_dtype = box_dtype if np.issubdtype(box_dtype, np.floating) else np.float32
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou.astype(out_dtype)
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))

    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so `iou`
    # is fully written before it is returned.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
        # The flattened view shares memory with `iou`, so the copy lands in it.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou.astype(out_dtype)

开发者ID:traveller59，项目名称:second.pytorch，代码行数:38，代码来源:nms_gpu.py

示例9: test_lib

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def test_lib(self):
        """Check that cuBlas settings read back exactly what was assigned.

        Covers the ``stream`` property (by identity) and the ``pointer_mode``
        and ``atomics_mode`` properties (by equality).
        """
        from pyculib.blas.binding import (
            CUBLAS_ATOMICS_NOT_ALLOWED,
            CUBLAS_POINTER_MODE_HOST,
            cuBlas,
        )

        new_stream = cuda.stream()
        handle = cuBlas()

        handle.stream = new_stream
        self.assertTrue(handle.stream is new_stream)

        # Same set-then-read-back check for each mode property, in order.
        mode_settings = (
            ("pointer_mode", CUBLAS_POINTER_MODE_HOST),
            ("atomics_mode", CUBLAS_ATOMICS_NOT_ALLOWED),
        )
        for prop, expected in mode_settings:
            setattr(handle, prop, expected)
            self.assertTrue(getattr(handle, prop) == expected)

开发者ID:numba，项目名称:pyculib，代码行数:15，代码来源:test_blas_low_level.py

示例10: tearDown

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def tearDown(self):
        """Copy the device buffers back, verify them, and drop the fixtures.

        Both device arrays are copied into their host arrays on
        ``self.stream``; after synchronizing, each host array must contain at
        least one non-zero element. The fixture attributes are then deleted.
        """
        transfers = (
            (self.devary32, self.ary32),
            (self.devary64, self.ary64),
        )
        for device_ary, host_ary in transfers:
            device_ary.copy_to_host(host_ary, stream=self.stream)

        # The copies were queued on the stream; wait before inspecting them.
        self.stream.synchronize()

        for _, host_ary in transfers:
            self.assertTrue(any(host_ary != 0))

        for attr in ("N", "ary32", "ary64", "stream", "devary32", "devary64"):
            delattr(self, attr)

开发者ID:numba，项目名称:pyculib，代码行数:17，代码来源:test_rand.py

示例11: rotate_iou_gpu_eval

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU with numba.cuda.

    Converted from [this project](
        https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
    The upstream note reports about 5 ms for one example.

    Args:
        boxes (float array: [N, 5]): rbboxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes compared against ``boxes``.
        criterion (int, optional): Defaults to -1. Forwarded unchanged to
            ``rotate_iou_kernel_eval``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        np.ndarray: ``[N, K]`` overlap matrix. The computation runs in float32;
        the result is cast back to the dtype of ``boxes`` when that dtype is
        floating point, and stays float32 otherwise. It is all zeros (and no
        GPU work is done) when ``N`` or ``K`` is 0.
    """
    # Remember the caller's dtype *before* `boxes` is rebound to float32.
    box_dtype = boxes.dtype
    # Casting IoU values to an integer dtype would truncate them, so only
    # floating dtypes are restored on return.
    out_dtype = box_dtype if np.issubdtype(box_dtype, np.floating) else np.float32
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou.astype(out_dtype)
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))

    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so `iou`
    # is fully written before it is returned.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
        # The flattened view shares memory with `iou`, so the copy lands in it.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou.astype(out_dtype)

开发者ID:SmallMunich，项目名称:nutonomy_pointpillars，代码行数:38，代码来源:nms_gpu.py

示例12: test_grid_multiply

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def test_grid_multiply(A, B):
    """Run ``mat3_mul_kernel`` over ``A`` and ``B`` and return the host result.

    Args:
        A: array whose first two dimensions (H, W) size the launch grid; the
            output has the same shape as ``A``.
        B: second kernel operand. NOTE(review): its required shape is set by
            ``mat3_mul_kernel`` -- confirm there.

    Returns:
        np.ndarray: float32 array shaped like ``A``, filled by the kernel.
    """
    H, W = A.shape[:2]
    blocks_2d, threads_2d = cuda_grid_block_2d(H, W)
    out = np.zeros_like(A, dtype=np.float32)
    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so `out`
    # is fully written before it is returned.
    with stream.auto_synchronize():
        d_A = cuda.to_device(A, stream)
        d_B = cuda.to_device(B, stream)
        d_out = cuda.to_device(out, stream)
        mat3_mul_kernel[blocks_2d, threads_2d, stream](d_A, d_B, d_out)
        # Explicit copy into `out` instead of the legacy implicit
        # `to_host()` writeback.
        d_out.copy_to_host(out, stream=stream)
    return out

开发者ID:sfu-gruvi-3dv，项目名称:sanet_relocal_demo，代码行数:14，代码来源:numba_mat_opt.py

示例13: test_grid_mat_mul_vec

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def test_grid_mat_mul_vec(A, B):
    """Run ``mat3_mul_vec_kernel`` over ``A`` and ``B`` and return the result.

    Args:
        A: array whose first two dimensions (H, W) size the launch grid and
            the output.
        B: second kernel operand. NOTE(review): its required shape is set by
            ``mat3_mul_vec_kernel`` -- confirm there.

    Returns:
        np.ndarray: float32 array of shape ``(H, W, 3)`` filled by the kernel.
    """
    H, W = A.shape[:2]
    blocks_2d, threads_2d = cuda_grid_block_2d(H, W)
    out = np.zeros((H, W, 3), dtype=np.float32)
    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so `out`
    # is fully written before it is returned.
    with stream.auto_synchronize():
        d_A = cuda.to_device(A, stream)
        d_B = cuda.to_device(B, stream)
        d_out = cuda.to_device(out, stream)
        mat3_mul_vec_kernel[blocks_2d, threads_2d, stream](d_A, d_B, d_out)
        # Explicit copy into `out` instead of the legacy implicit
        # `to_host()` writeback.
        d_out.copy_to_host(out, stream=stream)
    return out

开发者ID:sfu-gruvi-3dv，项目名称:sanet_relocal_demo，代码行数:14，代码来源:numba_mat_opt.py

示例14: nms_gpu

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run non-maximum suppression on the GPU.

    Args:
        dets (np.ndarray): 2-D detections array. Column 4 is read as the score;
            rows are handed to the kernel in descending score order.
            NOTE(review): columns 0-3 presumably hold the box -- confirm
            against ``nms_kernel``.
        nms_overlap_thresh: overlap threshold forwarded to ``nms_kernel``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        list: row indices into ``dets`` (np.int32 scalars) of the detections
        selected by ``nms_postprocess``; ``[]`` when ``dets`` has no rows.
    """
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # Nothing to suppress; also avoids launching a kernel on an empty grid.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 4]
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]

    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # One uint64 word per (box, column block) pair; presumably one bit per box
    # inside a 64-box block -- confirm against ``nms_kernel``.
    mask_host = np.zeros((boxes_num * col_blocks,), dtype=np.uint64)
    blockspergrid = (
        div_up(boxes_num, threadsPerBlock),
        div_up(boxes_num, threadsPerBlock),
    )
    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so
    # `mask_host` is complete before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev
        )
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # `keep` indexes the score-sorted rows; map back to rows of `dets`.
    return list(order[keep])

开发者ID:poodarchu，项目名称:Det3D，代码行数:40，代码来源:nms_gpu.py

示例15: rotate_nms_gpu

# 需要导入模块: from numba import cuda [as 别名]
# 或者: from numba.cuda import stream [as 别名]
def rotate_nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run rotated-box non-maximum suppression on the GPU.

    WARNING (from the original author): this function gives the right result
    but its performance has not been tested.

    Args:
        dets (np.ndarray): 2-D detections array, cast to float32 internally.
            Column 5 is read as the score; rows are handed to the kernel in
            descending score order. NOTE(review): columns 0-4 presumably hold
            the rotated box -- confirm against ``rotate_nms_kernel``.
        nms_overlap_thresh: overlap threshold forwarded to
            ``rotate_nms_kernel``.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        list: row indices into ``dets`` (np.int32 scalars) of the detections
        selected by ``nms_postprocess``; ``[]`` when ``dets`` has no rows.
    """
    dets = dets.astype(np.float32)
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # Nothing to suppress; also avoids launching a kernel on an empty grid.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 5]
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]

    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # mask_host holds boxes_num * col_blocks uint64 words.
    mask_host = np.zeros((boxes_num * col_blocks,), dtype=np.uint64)
    blockspergrid = (
        div_up(boxes_num, threadsPerBlock),
        div_up(boxes_num, threadsPerBlock),
    )
    stream = cuda.stream()
    # Leaving the context waits for everything queued on the stream, so
    # `mask_host` is complete before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        rotate_nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev
        )
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # `keep` indexes the score-sorted rows; map back to rows of `dets`.
    return list(order[keep])

开发者ID:poodarchu，项目名称:Det3D，代码行数:41，代码来源:nms_gpu.py

注：本文中的numba.cuda.stream方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。