本文整理匯總了Python中numba.cuda.stream方法的典型用法代碼示例。如果您正苦於以下問題:Python cuda.stream方法的具體用法?Python cuda.stream怎麼用?Python cuda.stream使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊numba.cuda的用法示例。
在下文中一共展示了cuda.stream方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: setUp
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def setUp(self):
    """Prepare host/device buffers and a cuRAND generator for a test.

    Creates zero-filled float32 and float64 host arrays of length ``N``,
    uploads both to the device on one CUDA stream, and configures a
    pseudo-random generator on that same stream with a fixed offset and
    a fixed seed.
    """
    from pyculib.rand.binding import CURAND_RNG_PSEUDO_DEFAULT, Generator

    self.N = 10
    self.ary32 = np.zeros(self.N, dtype=np.float32)
    self.ary64 = np.zeros(self.N, dtype=np.float64)

    # A single stream is shared by both uploads and by the generator.
    self.stream = cuda.stream()
    self.devary32 = cuda.to_device(self.ary32, stream=self.stream)
    self.devary64 = cuda.to_device(self.ary64, stream=self.stream)

    rng = self.rndgen = Generator(CURAND_RNG_PSEUDO_DEFAULT)
    rng.set_stream(self.stream)
    rng.set_offset(123)
    rng.set_pseudo_random_generator_seed(1234)
示例2: rotate_iou_gpu_eval
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU.

    Ported from
    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation.

    Args:
        boxes (float array: [N, 5]): rotated boxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes to compare against;
            presumably the same layout as ``boxes`` (confirm against the
            kernel).
        criterion (int, optional): Defaults to -1. Forwarded unchanged to
            ``rotate_iou_kernel_eval``, which defines its meaning.
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        numpy.ndarray: ``float32`` array of shape ``[N, K]``. The result is
        always ``float32``, whatever the dtype of ``boxes``, because the
        inputs are cast to ``float32`` before any computation.
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so the copy back
    # into ``iou`` has finished before the function returns.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
        # ``iou`` is freshly allocated and contiguous, so the flat reshape
        # is a view and the copy lands in ``iou`` itself.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou
示例3: rotate_iou_gpu_eval
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Pairwise IoU of rotated boxes, evaluated by a CUDA kernel.

    Adapted from
    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation.

    Args:
        boxes (float array: [N, 5]): rotated boxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes to compare against.
        criterion (int, optional): Defaults to -1. Passed through to
            ``rotate_iou_kernel_eval`` untouched.
        device_id (int, optional): Defaults to 0. CUDA device to select.

    Returns:
        numpy.ndarray: ``float32`` array of shape ``[N, K]``.
    """
    boxes_f32 = boxes.astype(np.float32)
    queries_f32 = query_boxes.astype(np.float32)
    num_boxes, num_queries = boxes_f32.shape[0], queries_f32.shape[0]
    overlaps = np.zeros((num_boxes, num_queries), dtype=np.float32)
    if not (num_boxes and num_queries):
        # Either side is empty: return the (empty) matrix without using CUDA.
        return overlaps

    block_size = 8 * 8
    cuda.select_device(device_id)
    grid = (div_up(num_boxes, block_size), div_up(num_queries, block_size))
    stream = cuda.stream()
    with stream.auto_synchronize():
        d_boxes = cuda.to_device(boxes_f32.reshape([-1]), stream)
        d_queries = cuda.to_device(queries_f32.reshape([-1]), stream)
        d_overlaps = cuda.to_device(overlaps.reshape([-1]), stream)
        launch = rotate_iou_kernel_eval[grid, block_size, stream]
        launch(num_boxes, num_queries, d_boxes, d_queries, d_overlaps,
               criterion)
        # Copy the flat device result back through a view of ``overlaps``.
        d_overlaps.copy_to_host(overlaps.reshape([-1]), stream=stream)
    return overlaps.astype(np.float32)
示例4: rotate_iou_gpu_eval
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """
    Evaluate pairwise IoU between two sets of rotated boxes with a CUDA kernel.

    Adapted from https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation.

    :param boxes: array of shape [N, 5]; rotated boxes given as centers, dims,
        angles (clockwise when positive)
    :param query_boxes: array of shape [K, 5]
    :param criterion: optional, default: -1; handed to ``rotate_iou_kernel_eval`` as-is
    :param device_id: int, optional, default: 0; CUDA device to select
    :return: float32 array of shape [N, K]
    """
    host_boxes = boxes.astype(np.float32)
    host_queries = query_boxes.astype(np.float32)
    n_rows = host_boxes.shape[0]
    n_cols = host_queries.shape[0]
    result = np.zeros((n_rows, n_cols), dtype=np.float32)
    if 0 in (n_rows, n_cols):
        # No pairs to evaluate, so no device work is needed.
        return result

    block_threads = 8 * 8
    cuda.select_device(device_id)
    grid_blocks = (
        div_up(n_rows, block_threads),
        div_up(n_cols, block_threads),
    )
    stream = cuda.stream()
    with stream.auto_synchronize():
        dev_boxes = cuda.to_device(host_boxes.reshape([-1]), stream)
        dev_queries = cuda.to_device(host_queries.reshape([-1]), stream)
        dev_result = cuda.to_device(result.reshape([-1]), stream)
        kernel = rotate_iou_kernel_eval[grid_blocks, block_threads, stream]
        kernel(n_rows, n_cols, dev_boxes, dev_queries, dev_result, criterion)
        # The flat view shares memory with ``result``.
        dev_result.copy_to_host(result.reshape([-1]), stream=stream)
    return result.astype(np.float32)
示例5: nms_gpu
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run non-maximum suppression on the GPU.

    Args:
        dets (numpy.ndarray): 2-D array with one detection per row. Column 4
            is used as the score; the other columns are consumed by
            ``nms_kernel`` (presumably box coordinates -- confirm against
            the kernel).
        nms_overlap_thresh: overlap threshold, forwarded unchanged to
            ``nms_kernel``.
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        list: row indices into ``dets`` of the detections kept by
        ``nms_postprocess``. Empty when ``dets`` has no rows.
    """
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # A grid with a zero dimension is not a valid launch configuration,
        # so answer empty input up front without touching the device.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 4]
    # Process detections from highest to lowest score.
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]
    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # One uint64 word per (box, column block) pair.
    mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64)
    blockspergrid = (div_up(boxes_num, threadsPerBlock),
                     div_up(boxes_num, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so ``mask_host``
    # is fully populated before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev)
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # Map positions in the sorted order back to rows of ``dets``.
    return list(order[keep])
示例6: rotate_nms_gpu
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run rotated-box non-maximum suppression on the GPU.

    Note (carried over from the original author): the function is believed
    to give correct results, but its performance has not been measured.

    Args:
        dets (numpy.ndarray): 2-D array with one detection per row. Column 5
            is used as the score; the other columns are consumed by
            ``rotate_nms_kernel`` (presumably rotated-box parameters --
            confirm against the kernel). Cast to ``float32`` internally.
        nms_overlap_thresh: overlap threshold, forwarded unchanged to
            ``rotate_nms_kernel``.
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        list: row indices into ``dets`` of the detections kept by
        ``nms_postprocess``. Empty when ``dets`` has no rows.
    """
    dets = dets.astype(np.float32)
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # A grid with a zero dimension is not a valid launch configuration,
        # so answer empty input up front without touching the device.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 5]
    # Process detections from highest to lowest score.
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]
    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # mask_host shape: boxes_num * col_blocks * sizeof(np.uint64)
    mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64)
    blockspergrid = (div_up(boxes_num, threadsPerBlock),
                     div_up(boxes_num, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so ``mask_host``
    # is fully populated before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        rotate_nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev)
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # Map positions in the sorted order back to rows of ``dets``.
    return list(order[keep])
示例7: rotate_iou_gpu
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_iou_gpu(boxes, query_boxes, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU.

    Ported from
    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation.

    Args:
        boxes (float array: [N, 5]): rotated boxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes to compare against;
            presumably the same layout as ``boxes`` (confirm against the
            kernel).
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        numpy.ndarray: ``float32`` array of shape ``[N, K]``. The result is
        always ``float32``, whatever the dtype of ``boxes``, because the
        inputs are cast to ``float32`` before any computation.
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so the copy back
    # into ``iou`` has finished before the function returns.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev)
        # ``iou`` is freshly allocated and contiguous, so the flat reshape
        # is a view and the copy lands in ``iou`` itself.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou
示例8: rotate_iou_gpu_eval
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU.

    Ported from
    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation.

    Args:
        boxes (float array: [N, 5]): rotated boxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes to compare against;
            presumably the same layout as ``boxes`` (confirm against the
            kernel).
        criterion (int, optional): Defaults to -1. Forwarded unchanged to
            ``rotate_iou_kernel_eval``, which defines its meaning.
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        numpy.ndarray: ``float32`` array of shape ``[N, K]``. The result is
        always ``float32``, whatever the dtype of ``boxes``, because the
        inputs are cast to ``float32`` before any computation.
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so the copy back
    # into ``iou`` has finished before the function returns.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
        # ``iou`` is freshly allocated and contiguous, so the flat reshape
        # is a view and the copy lands in ``iou`` itself.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou
示例9: test_lib
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def test_lib(self):
    """Check that a cuBlas handle round-trips its stream and mode settings.

    Assigns a stream, a pointer mode and an atomics mode to a fresh
    ``cuBlas`` object and verifies that each property reads back what was
    written.
    """
    from pyculib.blas.binding import (cuBlas,
                                      CUBLAS_POINTER_MODE_HOST,
                                      CUBLAS_ATOMICS_NOT_ALLOWED)
    stream = cuda.stream()
    blas = cuBlas()
    blas.stream = stream
    # The handle must hand back the very same stream object, not a copy.
    self.assertIs(blas.stream, stream)
    blas.pointer_mode = CUBLAS_POINTER_MODE_HOST
    self.assertEqual(blas.pointer_mode, CUBLAS_POINTER_MODE_HOST)
    blas.atomics_mode = CUBLAS_ATOMICS_NOT_ALLOWED
    self.assertEqual(blas.atomics_mode, CUBLAS_ATOMICS_NOT_ALLOWED)
示例10: tearDown
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def tearDown(self):
    """Copy the device arrays back, check they were written, then clean up.

    Both device buffers are copied into their host arrays on the fixture's
    stream. After synchronizing, each host array must contain at least one
    non-zero element. Finally the fixture attributes are removed from the
    test instance.
    """
    buffers = ((self.devary32, self.ary32), (self.devary64, self.ary64))
    for device_array, host_array in buffers:
        device_array.copy_to_host(host_array, stream=self.stream)
    # The copies above are queued on the stream; wait for them to finish.
    self.stream.synchronize()
    for _, host_array in buffers:
        self.assertTrue(any(host_array != 0))
    for attr in ("N", "ary32", "ary64", "stream", "devary32", "devary64"):
        delattr(self, attr)
示例11: rotate_iou_gpu_eval
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Compute the pairwise rotated-box IoU matrix on the GPU.

    Ported from
    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation.

    Args:
        boxes (float array: [N, 5]): rotated boxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): boxes to compare against;
            presumably the same layout as ``boxes`` (confirm against the
            kernel).
        criterion (int, optional): Defaults to -1. Forwarded unchanged to
            ``rotate_iou_kernel_eval``, which defines its meaning.
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        numpy.ndarray: ``float32`` array of shape ``[N, K]``. The result is
        always ``float32``, whatever the dtype of ``boxes``, because the
        inputs are cast to ``float32`` before any computation.
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        # Nothing to compare: skip device selection and the kernel launch.
        return iou
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so the copy back
    # into ``iou`` has finished before the function returns.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
        # ``iou`` is freshly allocated and contiguous, so the flat reshape
        # is a view and the copy lands in ``iou`` itself.
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou
示例12: test_grid_multiply
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def test_grid_multiply(A, B):
    """Run ``mat3_mul_kernel`` on ``A`` and ``B`` and return the host result.

    Args:
        A: host array. Its first two dimensions ``(H, W)`` size the launch
            grid, and its shape is reused for the output buffer.
        B: host array passed to the kernel as the second operand.

    Returns:
        numpy.ndarray: ``float32`` array shaped like ``A``, holding whatever
        the kernel wrote to its third argument (zero where it wrote nothing).
    """
    H, W = A.shape[:2]
    blocks_2d, threads_2d = cuda_grid_block_2d(H, W)
    out = np.zeros_like(A, dtype=np.float32)
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so ``out`` is
    # complete before it is returned.
    with stream.auto_synchronize():
        d_A = cuda.to_device(A, stream)
        d_B = cuda.to_device(B, stream)
        d_out = cuda.to_device(out, stream)
        mat3_mul_kernel[blocks_2d, threads_2d, stream](d_A, d_B, d_out)
        # Name the destination explicitly instead of relying on the
        # deprecated ``to_host()`` implicit write-back.
        d_out.copy_to_host(out, stream=stream)
    return out
示例13: test_grid_mat_mul_vec
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def test_grid_mat_mul_vec(A, B):
    """Run ``mat3_mul_vec_kernel`` on ``A`` and ``B`` and return the host result.

    Args:
        A: host array. Its first two dimensions ``(H, W)`` size the launch
            grid and the output buffer.
        B: host array passed to the kernel as the second operand.

    Returns:
        numpy.ndarray: ``float32`` array of shape ``(H, W, 3)``, holding
        whatever the kernel wrote to its third argument (zero where it wrote
        nothing).
    """
    H, W = A.shape[:2]
    blocks_2d, threads_2d = cuda_grid_block_2d(H, W)
    out = np.zeros((H, W, 3), dtype=np.float32)
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so ``out`` is
    # complete before it is returned.
    with stream.auto_synchronize():
        d_A = cuda.to_device(A, stream)
        d_B = cuda.to_device(B, stream)
        d_out = cuda.to_device(out, stream)
        mat3_mul_vec_kernel[blocks_2d, threads_2d, stream](d_A, d_B, d_out)
        # Name the destination explicitly instead of relying on the
        # deprecated ``to_host()`` implicit write-back.
        d_out.copy_to_host(out, stream=stream)
    return out
示例14: nms_gpu
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run non-maximum suppression on the GPU.

    Args:
        dets (numpy.ndarray): 2-D array with one detection per row. Column 4
            is used as the score; the other columns are consumed by
            ``nms_kernel`` (presumably box coordinates -- confirm against
            the kernel).
        nms_overlap_thresh: overlap threshold, forwarded unchanged to
            ``nms_kernel``.
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        list: row indices into ``dets`` of the detections kept by
        ``nms_postprocess``. Empty when ``dets`` has no rows.
    """
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # A grid with a zero dimension is not a valid launch configuration,
        # so answer empty input up front without touching the device.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 4]
    # Process detections from highest to lowest score.
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]
    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # One uint64 word per (box, column block) pair.
    mask_host = np.zeros((boxes_num * col_blocks,), dtype=np.uint64)
    blockspergrid = (
        div_up(boxes_num, threadsPerBlock),
        div_up(boxes_num, threadsPerBlock),
    )
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so ``mask_host``
    # is fully populated before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev
        )
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # Map positions in the sorted order back to rows of ``dets``.
    return list(order[keep])
示例15: rotate_nms_gpu
# 需要導入模塊: from numba import cuda [as 別名]
# 或者: from numba.cuda import stream [as 別名]
def rotate_nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """Run rotated-box non-maximum suppression on the GPU.

    Note (carried over from the original author): the function is believed
    to give correct results, but its performance has not been measured.

    Args:
        dets (numpy.ndarray): 2-D array with one detection per row. Column 5
            is used as the score; the other columns are consumed by
            ``rotate_nms_kernel`` (presumably rotated-box parameters --
            confirm against the kernel). Cast to ``float32`` internally.
        nms_overlap_thresh: overlap threshold, forwarded unchanged to
            ``rotate_nms_kernel``.
        device_id (int, optional): Defaults to 0. Index handed to
            ``cuda.select_device``.

    Returns:
        list: row indices into ``dets`` of the detections kept by
        ``nms_postprocess``. Empty when ``dets`` has no rows.
    """
    dets = dets.astype(np.float32)
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # A grid with a zero dimension is not a valid launch configuration,
        # so answer empty input up front without touching the device.
        return []
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 5]
    # Process detections from highest to lowest score.
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]
    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # mask_host shape: boxes_num * col_blocks * sizeof(np.uint64)
    mask_host = np.zeros((boxes_num * col_blocks,), dtype=np.uint64)
    blockspergrid = (
        div_up(boxes_num, threadsPerBlock),
        div_up(boxes_num, threadsPerBlock),
    )
    stream = cuda.stream()
    # Leaving the ``with`` block synchronizes the stream, so ``mask_host``
    # is fully populated before post-processing reads it.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        rotate_nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev
        )
        mask_dev.copy_to_host(mask_host, stream=stream)
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    # Map positions in the sorted order back to rows of ``dets``.
    return list(order[keep])