本文整理匯總了Python中cupy.RawKernel方法的典型用法代碼示例。如果您正苦於以下問題:Python cupy.RawKernel方法的具體用法?Python cupy.RawKernel怎麼用?Python cupy.RawKernel使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類cupy的用法示例。
在下文中一共展示了cupy.RawKernel方法的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: _call_nms_kernel
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def _call_nms_kernel(bbox, thresh):
assert False, "Not supported."
n_bbox = bbox.shape[0]
threads_per_block = 64
col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
blocks = (col_blocks, col_blocks, 1)
threads = (threads_per_block, 1, 1)
mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
bbox = cp.ascontiguousarray(bbox, dtype=np.float32)
kern = cp.RawKernel(_nms_gpu_code, 'nms_kernel')
kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),
bbox, mask_dev))
mask_host = mask_dev.get()
selection, n_selec = _nms_gpu_post(
mask_host, n_bbox, threads_per_block, col_blocks)
return selection, n_selec
示例2: raw
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def raw(code, name, *args, **kwargs):
    """Build a :class:`cupy.RawKernel` from source code and a kernel name.

    ``check_cuda_available()`` is called first; all arguments are then
    forwarded unchanged to :class:`cupy.RawKernel`.

    NOTE(review): the previous documentation stated that the resulting
    kernel object is cached per argument combination and CUDA device via
    :func:`~chainer.backends.cuda.memoize`. No caching decorator is visible
    on this definition here -- confirm against the full file.

    Args:
        code: Kernel source, passed through as the first positional argument.
        name: Kernel name, passed through as the second positional argument.
        *args: Extra positional arguments for :class:`cupy.RawKernel`.
        **kwargs: Extra keyword arguments for :class:`cupy.RawKernel`.

    Returns:
        The newly constructed :class:`cupy.RawKernel` instance.
    """
    check_cuda_available()
    kernel = cupy.RawKernel(code, name, *args, **kwargs)
    return kernel
# ------------------------------------------------------------------------------
# numpy/cupy compatible coding
# ------------------------------------------------------------------------------
示例3: setUp
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def setUp(self):
    """Create the kernel/module fixtures with a fresh CuPy cache directory.

    Records the current device in ``self.dev``, points ``CUPY_CACHE_DIR``
    at a new temporary directory (also stored in the module-level
    ``_test_cache_dir``), and builds one ``RawKernel`` and two ``RawModule``
    objects using ``self.backend``.
    """
    global _test_cache_dir

    self.dev = cupy.cuda.runtime.getDevice()
    # NOTE(review): presumably device 1 is reserved for tests that switch
    # devices -- confirm against the rest of the test class.
    assert self.dev != 1

    # Use a throwaway cache directory for this test.
    _test_cache_dir = tempfile.mkdtemp()
    os.environ['CUPY_CACHE_DIR'] = _test_cache_dir

    backend = self.backend
    self.kern = cupy.RawKernel(_test_source1, 'test_sum', backend=backend)
    self.mod2 = cupy.RawModule(code=_test_source2, backend=backend)
    self.mod3 = cupy.RawModule(
        code=_test_source3,
        options=('-DPRECISION=2',),
        backend=backend,
    )
示例4: _call_nms_kernel
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def _call_nms_kernel(bbox, thresh):
    """Launch the raw ``nms_kernel`` and post-process its mask on the host.

    Args:
        bbox: Array of boxes; ``bbox.shape[0]`` is taken as the box count.
            It is converted to a contiguous float32 CuPy array before the
            launch.
        thresh: Threshold handed to the kernel as a float32 scalar.

    Returns:
        The ``(selection, n_selec)`` pair produced by ``_nms_gpu_post``.
    """
    box_count = bbox.shape[0]
    block_width = 64
    # Number of blocks needed to cover every box, as an int32 scalar.
    n_col_blocks = np.ceil(box_count / block_width).astype(np.int32)

    grid_dim = (n_col_blocks, n_col_blocks, 1)
    block_dim = (block_width, 1, 1)

    # One uint64 word per (box, column-block) pair.
    # NOTE(review): presumably each word is a 64-bit mask matching the 64
    # threads per block -- confirm against the kernel source.
    mask_dev = cp.zeros((box_count * n_col_blocks,), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)

    kernel_args = (cp.int32(box_count), cp.float32(thresh), bbox, mask_dev)
    nms_kernel = cp.RawKernel(_nms_gpu_code, 'nms_kernel')
    nms_kernel(grid_dim, block_dim, args=kernel_args)

    # Copy the mask back to host memory for post-processing.
    mask_host = mask_dev.get()
    kept, n_kept = _nms_gpu_post(
        mask_host, box_count, block_width, n_col_blocks)
    return kept, n_kept
示例5: sgemm
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def sgemm(A, B,
          dim_x=16, dim_y=16, blk_m=64, blk_n=64, blk_k=4,
          dim_xa=64, dim_ya=4, dim_xb=4, dim_yb=64):
    """Multiply two float32 matrices with a parameterised raw CUDA kernel.

    Args:
        A: 2-D float32 CuPy array of shape ``(m, k)``.
        B: 2-D float32 CuPy array of shape ``(k, n)``.
        dim_x, dim_y: Thread-block dimensions used for the launch.
        blk_m, blk_n, blk_k: Tile sizes substituted into the kernel source
            and used to size the grid and the shared memory.
        dim_xa, dim_ya, dim_xb, dim_yb: Further layout parameters
            substituted into the kernel source. Each pair must multiply to
            the same value as ``dim_x * dim_y``.

    Returns:
        A new Fortran-ordered float32 array ``C`` of shape ``(m, n)``
        filled in by the kernel.

    Raises:
        AssertionError: If a dtype is not float32 or the thread-layout
            products disagree.
        ValueError: If the inner dimensions of ``A`` and ``B`` differ.
    """
    assert A.dtype == cp.float32
    assert B.dtype == cp.float32
    assert(dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb)

    m, k = A.shape
    k_b, n = B.shape
    # The original silently overwrote ``k`` with B's row count, so
    # mismatched operands reached the kernel with inconsistent sizes.
    if k != k_b:
        raise ValueError(
            'shapes {} and {} not aligned: inner dimensions {} and {} '
            'differ'.format(A.shape, B.shape, k, k_b))

    # Inputs matrices need to be in Fortran order.
    A = cp.asfortranarray(A)
    B = cp.asfortranarray(B)

    C = cp.empty((m, n), dtype=cp.float32, order='F')

    # Compile-time parameters substituted into the kernel source.
    config = {'DIM_X': dim_x, 'DIM_Y': dim_y,
              'BLK_M': blk_m, 'BLK_N': blk_n, 'BLK_K': blk_k,
              'DIM_XA': dim_xa, 'DIM_YA': dim_ya,
              'DIM_XB': dim_xb, 'DIM_YB': dim_yb,
              'THR_M': blk_m // dim_x, 'THR_N': blk_n // dim_y}
    code = read_code(sgemm_file, params=config)
    kern = cp.RawKernel(code, 'sgemm')

    # One block per (blk_m x blk_n) tile of the output, rounded up.
    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
    block = (dim_x, dim_y, 1)
    args = (m, n, k, A, B, C)
    # NOTE(review): the factor 4 is presumably sizeof(float) and the +1 a
    # padding column -- confirm against the kernel source.
    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
    kern(grid, block, args=args, shared_mem=shared_mem)
    return C
示例6: test_dynamical_parallelism
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def test_dynamical_parallelism(self):
    """Check that ``test_kernel`` compiled with ``-dc`` sets every element to 1.

    The kernel is launched on one block of ``N // inner_chunk`` threads over
    a zero-initialised float32 buffer of ``N`` elements.
    """
    total = 169
    chunk = 13

    kernel = cupy.RawKernel(
        _test_source4, 'test_kernel',
        options=('-dc',),
        backend=self.backend)

    buf = cupy.zeros((total,), dtype=cupy.float32)
    grid = (1,)
    block = (total // chunk,)
    kernel(grid, block, (buf, total, chunk))

    assert (buf == 1.0).all()
示例7: test_dynamical_parallelism_compile_failure
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def test_dynamical_parallelism_compile_failure(self):
    """Check that launching ``test_kernel`` without ``-dc`` raises.

    With no option for separate compilation the launch should fail; the
    expected exception type depends on ``self.backend``.
    """
    kernel = cupy.RawKernel(_test_source4, 'test_kernel',
                            backend=self.backend)
    total = 10
    chunk = 2
    buf = cupy.zeros((total,), dtype=cupy.float32)

    if self.backend == 'nvrtc':
        # raised when calling ls.complete()
        expected_error = cupy.cuda.driver.CUDADriverError
    else:  # nvcc
        expected_error = cupy.cuda.compiler.CompileException

    with pytest.raises(expected_error):
        kernel((1,), (total // chunk,), (buf, total, chunk))
示例8: test_context_switch_RawKernel
# 需要導入模塊: import cupy [as 別名]
# 或者: from cupy import RawKernel [as 別名]
def test_context_switch_RawKernel(self):
    """Run the kernel helper on the current device, then again on device 1.

    The result of the second run is checked against ``x1 + x2``.
    """
    dtype = cupy.float32

    # For RawKernel, we need to launch it once to force compiling;
    # the outputs of this first launch are not checked.
    _lhs, _rhs, _out = self._helper(self.kern, dtype)

    # Run the same helper on another device and verify the sum.
    with cupy.cuda.Device(1):
        lhs, rhs, out = self._helper(self.kern, dtype)
        assert cupy.allclose(out, lhs + rhs)