This article collects typical usage examples of the Python function numba.cuda.stream. If you are unsure what the stream function does or how to call it, the curated code samples below should help.
The following shows 15 code examples of the stream function, ordered by popularity by default.
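Before the examples, here is a minimal sketch of the pattern they all share: create a stream, queue the host-to-device copy, the kernel launch, and the device-to-host copy on that stream, then synchronize. The kernel double_elements and the helper run_on_stream are illustrative names, not part of any example below.

import numpy as np
from numba import cuda

@cuda.jit
def double_elements(arr):
    # toy kernel: multiply every element by 2 in place
    i = cuda.grid(1)
    if i < arr.shape[0]:
        arr[i] *= 2.0

def run_on_stream(host_data):
    stream = cuda.stream()
    d_data = cuda.to_device(host_data, stream=stream)    # H2D copy queued on the stream
    nthreads = 128
    nblocks = (host_data.size + nthreads - 1) // nthreads
    double_elements[nblocks, nthreads, stream](d_data)    # launch on the same stream
    d_data.copy_to_host(host_data, stream=stream)         # D2H copy queued on the stream
    stream.synchronize()                                  # block until the queue has drained
    return host_data

# usage: run_on_stream(np.arange(1024, dtype=np.float64))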
Example 1: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example 2: test_gufunc_stream
def test_gufunc_stream(self):
    #cuda.driver.flush_pending_free()
    matrix_ct = 1001  # an odd number to test thread/block division in CUDA
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5)

    ts = time()
    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)

    dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
    dC = gufunc(dA, dB, out=dC, stream=stream)
    C = dC.copy_to_host(stream=stream)
    stream.synchronize()
    tcuda = time() - ts

    ts = time()
    Gold = ut.matrix_multiply(A, B)
    tcpu = time() - ts

    stream_speedups.append(tcpu / tcuda)
    self.assertTrue(np.allclose(C, Gold))
Example 3: setup
def setup(self):
    self.stream = cuda.stream()
    self.f32 = np.zeros(self.n, dtype=np.float32)
    self.d_f32 = cuda.to_device(self.f32, self.stream)
    self.f64 = np.zeros(self.n, dtype=np.float64)
    self.d_f64 = cuda.to_device(self.f64, self.stream)
    self.stream.synchronize()
Example 4: test_func
def test_func(self):
    np.random.seed(42)
    A = np.array(np.random.random((n, n)), dtype=np.float32)
    B = np.array(np.random.random((n, n)), dtype=np.float32)
    C = np.empty_like(A)

    s = time()
    stream = cuda.stream()
    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
        dC.copy_to_host(C, stream)
    e = time()
    tcuda = e - s

    # Host compute
    s = time()
    Cans = np.dot(A, B)
    e = time()
    tcpu = e - s

    # Check result
    np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example 5: test_laplace_small
def test_laplace_small(self):
    NN = 256
    NM = 256

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM

    iter_max = 1000
    tol = 1.0e-6
    error = 1.0

    for j in range(n):
        A[j, 0] = 1.0
        Anew[j, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    timer = time.time()
    iter = 0

    blockdim = (tpb, tpb)
    griddim = (NN // blockdim[0], NM // blockdim[1])

    error_grid = np.zeros(griddim)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)        # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)  # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iter < iter_max:
        self.assertTrue(error_grid.dtype == np.float64)

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
        derror_grid.copy_to_host(error_grid, stream=stream)

        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
        tmp = dA
        dA = dAnew
        dAnew = tmp

        if iter % 100 == 0:
            print("%5d, %0.6f (elapsed: %f s)" %
                  (iter, error, time.time() - timer))

        iter += 1

    runtime = time.time() - timer
    print(" total: %f s" % runtime)
Example 6: test
def test(ty):
    print("Test %s" % ty)
    # size must be an integer; int() keeps this working on current NumPy
    data = np.array(np.random.random(int(1e6 + 1)), dtype=ty)

    ts = time()
    stream = cuda.stream()
    device_data = cuda.to_device(data, stream)
    dresult = cuda_ufunc(device_data, device_data, stream=stream)
    result = dresult.copy_to_host()
    stream.synchronize()
    tnumba = time() - ts

    ts = time()
    gold = np_ufunc(data, data)
    tnumpy = time() - ts

    print("Numpy time: %fs" % tnumpy)
    print("Numba time: %fs" % tnumba)

    if tnumba < tnumpy:
        print("Numba is FASTER by %fx" % (tnumpy / tnumba))
    else:
        print("Numba is SLOWER by %fx" % (tnumba / tnumpy))

    self.assertTrue(np.allclose(gold, result), (gold, result))
Example 7: reduce
def reduce(self, arg, stream=0):
    assert len(list(self.functions.keys())[0]) == 2, "must be a binary ufunc"
    assert arg.ndim == 1, "must use 1d array"

    n = arg.shape[0]
    gpu_mems = []

    if n == 0:
        raise TypeError("Reduction on an empty array.")
    elif n == 1:  # nothing to do
        return arg[0]

    # always use a stream
    stream = stream or cuda.stream()
    with stream.auto_synchronize():
        # transfer memory to device if necessary
        if devicearray.is_cuda_ndarray(arg):
            mem = arg
        else:
            mem = cuda.to_device(arg, stream)
        # do reduction
        out = self.__reduce(mem, gpu_mems, stream)
        # use a small buffer to store the result element
        buf = np.array((1,), dtype=arg.dtype)
        out.copy_to_host(buf, stream=stream)

    return buf[0]
Example 8: test_func
def test_func(self):
    A = np.array(np.random.random((n, n)), dtype=np.float32)
    B = np.array(np.random.random((n, n)), dtype=np.float32)
    C = np.empty_like(A)

    print("N = %d x %d" % (n, n))

    s = time()
    stream = cuda.stream()
    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
        dC.copy_to_host(C, stream)
    e = time()
    tcuda = e - s

    # Host compute
    Amat = np.matrix(A)
    Bmat = np.matrix(B)

    s = time()
    Cans = Amat * Bmat
    e = time()
    tcpu = e - s

    print('cpu: %f' % tcpu)
    print('cuda: %f' % tcuda)
    print('cuda speedup: %.2fx' % (tcpu / tcuda))

    # Check result
    self.assertTrue(np.allclose(C, Cans))
Example 9: test_func
def test_func(self):
    @cuda.jit(argtypes=[float32[:, ::1], float32[:, ::1], float32[:, ::1]])
    def cu_square_matrix_mul(A, B, C):
        sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
        sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x
        by = cuda.blockIdx.y
        bw = cuda.blockDim.x
        bh = cuda.blockDim.y

        x = tx + bx * bw
        y = ty + by * bh

        acc = float32(0)  # forces all the math to be f32

        for i in range(bpg):
            if x < n and y < n:
                sA[ty, tx] = A[y, tx + i * tpb]
                sB[ty, tx] = B[ty + i * tpb, x]

            cuda.syncthreads()

            if x < n and y < n:
                for j in range(tpb):
                    acc += sA[ty, j] * sB[j, tx]

            cuda.syncthreads()

        if x < n and y < n:
            C[y, x] = acc

    np.random.seed(42)
    A = np.array(np.random.random((n, n)), dtype=np.float32)
    B = np.array(np.random.random((n, n)), dtype=np.float32)
    C = np.empty_like(A)

    s = time()
    stream = cuda.stream()
    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
        dC.copy_to_host(C, stream)
    e = time()
    tcuda = e - s

    # Host compute
    s = time()
    Cans = np.dot(A, B)
    e = time()
    tcpu = e - s

    # Check result
    np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example 10: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)

        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example 11: newthread
def newthread():
    cuda.select_device(0)
    stream = cuda.stream()
    A = np.arange(100)
    dA = cuda.to_device(A, stream=stream)
    stream.synchronize()
    del dA
    del stream
    cuda.close()
Example 12: test_stream_bind
def test_stream_bind(self):
    stream = cuda.stream()
    with stream.auto_synchronize():
        arr = cuda.device_array(
            (3, 3),
            dtype=np.float64,
            stream=stream)

    self.assertEqual(arr.bind(stream).stream, stream)
    self.assertEqual(arr.stream, stream)
Example 13: getGraphFromEdges_gpu
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges=None,
                          MAX_TPB=512, stream=None):
    """
    All input (except MAX_TPB and stream) are device arrays.
    edges   : array with the IDs of the edges that will be part of the new graph
    n_edges : array of 1 element with the number of valid edges in the edges
              array; if n_edges < size of edges, the last elements of the
              edges array are not considered
    """

    # check if number of valid edges was received
    if n_edges is None:
        edges_size = edges.size
        n_edges = cuda.to_device(np.array([edges_size], dtype=np.int32))
    else:
        edges_size = int(n_edges.getitem(0))

    # check if a stream was received, if not create one
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    new_n_edges = edges_size * 2

    # allocate memory for new graph
    ndest = cuda.device_array(new_n_edges, dtype=dest.dtype, stream=myStream)
    nweight = cuda.device_array(new_n_edges, dtype=weight.dtype, stream=myStream)
    nfe = cuda.device_array_like(fe, stream=myStream)
    nod = cuda.device_array_like(od, stream=myStream)

    # fill new outdegree with zeros
    vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB)
    memSet[vertexGrid, MAX_TPB, myStream](nod, 0)

    # count all edges of new array and who they belong to
    edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB)
    countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod)

    # get new first_edge array from new outdegree
    nfe.copy_to_device(nod, stream=myStream)
    ex_prefix_sum_gpu(nfe, MAX_TPB=MAX_TPB, stream=myStream)

    # copy new first_edge to top_edge to serve as pointer in adding edges
    top_edge = cuda.device_array_like(nfe, stream=myStream)
    top_edge.copy_to_device(nfe, stream=myStream)

    addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od,
                                          top_edge, ndest, nweight)

    del top_edge
    #del dest, weight, fe, od

    return ndest, nweight, nfe, nod
Example 14: test_laplace_small
def test_laplace_small(self):
    if config.ENABLE_CUDASIM:
        NN, NM = 4, 4
        iter_max = 20
    else:
        NN, NM = 256, 256
        iter_max = 1000

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM

    tol = 1.0e-6
    error = 1.0

    for j in range(n):
        A[j, 0] = 1.0
        Anew[j, 0] = 1.0

    timer = time.time()
    iter = 0

    blockdim = (tpb, tpb)
    griddim = (NN // blockdim[0], NM // blockdim[1])

    error_grid = np.zeros(griddim)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)        # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)  # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iter < iter_max:
        self.assertTrue(error_grid.dtype == np.float64)

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
        derror_grid.copy_to_host(error_grid, stream=stream)

        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
        tmp = dA
        dA = dAnew
        dAnew = tmp

        iter += 1

    runtime = time.time() - timer
Example 15: _run_copies
def _run_copies(self, A):
    A0 = np.copy(A)

    stream = cuda.stream()
    ptr = cuda.to_device(A, copy=False, stream=stream)
    ptr.copy_to_device(A, stream=stream)
    ptr.copy_to_host(A, stream=stream)
    stream.synchronize()

    self.assertTrue(np.allclose(A, A0))