This article collects typical usage examples of the numbapro.cuda.grid function in Python. If you have been wondering what exactly the grid function does, how to call it, or what real-world uses look like, the hand-picked code samples below may help.
The following section presents 15 code examples of the grid function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
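Before the examples: cuda.grid(ndim) returns the absolute position of the calling thread within the whole grid (for ndim=1 it is equivalent to cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x; for ndim=2 it returns an (i, j) pair). The snippets below are kernel bodies only, with their decorators stripped. A minimal, self-contained sketch of the surrounding boilerplate might look like the following; it is written against numba.cuda, the open-source successor of the now-deprecated numbapro package, and the sizes are purely illustrative.

import numpy as np
from numba import cuda

@cuda.jit
def scale(out, factor):
    i = cuda.grid(1)          # global thread index: threadIdx.x + blockIdx.x * blockDim.x
    if i < out.shape[0]:      # guard: the grid may be larger than the array
        out[i] *= factor

data = np.arange(1000, dtype=np.float64)
d_data = cuda.to_device(data)                        # explicit host -> device copy
threads_per_block = 128
blocks_per_grid = (data.size + threads_per_block - 1) // threads_per_block
scale[blocks_per_grid, threads_per_block](d_data, 2.0)
result = d_data.copy_to_host()                       # device -> host copy of the result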
Example 1: compute_sample_kernel
def compute_sample_kernel(factors, longest_wavelet, offsets_per_wavelength, output, num_rows):
    num_wavelengths = longest_wavelet - 2
    # One thread per output sample: the global thread index selects the sample
    i = cuda.grid(1)
    output[i] = 0.0
    for row_index in range(num_rows):
        output[i] += get_value_gpu(factors, row_index, i, longest_wavelet,
                                   num_wavelengths, offsets_per_wavelength)
    output[i] += factors[-1]
Example 2: c_distribute
def c_distribute(rands, low, high):
    i = cuda.grid(1)
    if i >= rands.shape[0]:
        return
    # Rescale a uniform sample in [0, 1) onto the interval [low, high)
    rands[i] = (1.0 - rands[i]) * low + rands[i] * high
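A possible host-side launch for c_distribute (a sketch; it assumes the function above is decorated with @cuda.jit, which the excerpt does not show). The kernel rewrites the uniform samples in place.

import numpy as np
from numba import cuda

rands = np.random.rand(10000)
d_rands = cuda.to_device(rands)
threads = 256
blocks = (rands.size + threads - 1) // threads       # enough threads to cover every element
c_distribute[blocks, threads](d_rands, -5.0, 5.0)
samples = d_rands.copy_to_host()                      # values now lie in [-5.0, 5.0)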
Example 3: const_m
def const_m(out, const):
    n = out.shape[0]
    m = out.shape[1]
    i, j = cuda.grid(2)
    # Fill every element of the n x m matrix with the constant
    if i < n and j < m:
        out[i, j] = const
Example 4: tanh_m
def tanh_m(a, out):
    n = out.shape[0]
    m = out.shape[1]
    i, j = cuda.grid(2)
    # Element-wise hyperbolic tangent
    if i < n and j < m:
        out[i, j] = tanh(a[i, j])
Example 5: abs_m
def abs_m(a, out):
    n = out.shape[0]
    m = out.shape[1]
    i, j = cuda.grid(2)
    # Element-wise absolute value
    if i < n and j < m:
        out[i, j] = fabs(a[i, j])
Example 6: log_m
def log_m(a, out):
    n = out.shape[0]
    m = out.shape[1]
    i, j = cuda.grid(2)
    # Element-wise natural logarithm
    if i < n and j < m:
        out[i, j] = log(a[i, j])
Example 7: exp_m
def exp_m(a, out):
    n = out.shape[0]
    m = out.shape[1]
    i, j = cuda.grid(2)
    # Element-wise exponential
    if i < n and j < m:
        out[i, j] = exp(a[i, j])
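Examples 3 through 7 all follow the same element-wise 2D pattern: cuda.grid(2) yields an (i, j) index pair, so both the block and the grid are two-dimensional. A hedged launch sketch follows; it assumes the kernels are decorated with @cuda.jit and that exp, log, tanh and fabs are imported from math in the kernels' module, none of which the excerpts show.

import math
import numpy as np
from numba import cuda

a = np.random.rand(64, 48)
out = np.zeros_like(a)

threads = (16, 16)                                    # 2D block of 256 threads
blocks = (math.ceil(a.shape[0] / threads[0]),         # 2D grid covering every (i, j)
          math.ceil(a.shape[1] / threads[1]))

d_a = cuda.to_device(a)
d_out = cuda.to_device(out)
exp_m[blocks, threads](d_a, d_out)                    # out[i, j] = exp(a[i, j])
out = d_out.copy_to_host()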
Example 8: kernel
def kernel(dst, src):
    '''A simple kernel that adds 1 to every item
    '''
    i = cuda.grid(1)
    if i >= dst.shape[0]:
        return
    dst[i] = src[i] + 1
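A possible launch for this kernel (a sketch, again assuming the missing @cuda.jit decorator):

import numpy as np
from numba import cuda

src = np.arange(16, dtype=np.int32)
dst = np.zeros_like(src)
d_src = cuda.to_device(src)
d_dst = cuda.to_device(dst)

threads = 8
blocks = (src.size + threads - 1) // threads
kernel[blocks, threads](d_dst, d_src)
print(d_dst.copy_to_host())                           # [ 1  2  3 ... 16]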
Example 9: vec_add_ilp_x4
def vec_add_ilp_x4(a, b, c):
    # read: each thread loads four elements, spaced one grid-stride apart
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]
    bw = cuda.blockDim.x
    gw = cuda.gridDim.x
    stride = gw * bw
    j = i + stride
    aj = a[j]
    bj = b[j]
    k = j + stride
    ak = a[k]
    bk = b[k]
    l = k + stride
    al = a[l]
    bl = b[l]
    # compute
    ci = core(ai, bi)
    cj = core(aj, bj)
    ck = core(ak, bk)
    cl = core(al, bl)
    # write
    c[i] = ci
    c[j] = cj
    c[k] = ck
    c[l] = cl
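This kernel uses instruction-level parallelism: every thread processes four elements one full grid-stride apart, so the launch only needs a quarter as many threads as there are elements, and the array length should be an exact multiple of 4 * blockDim.x * gridDim.x. A hedged sketch follows; it assumes vec_add_ilp_x4 is decorated with @cuda.jit, and it supplies a hypothetical core device function (the real one is not shown in the excerpt; a plain element-wise add is assumed).

import numpy as np
from numba import cuda

@cuda.jit(device=True)
def core(x, y):
    # hypothetical device helper standing in for the excerpt's core()
    return x + y

n = 1 << 20
a = np.random.rand(n)
b = np.random.rand(n)
c = np.zeros_like(a)

threads = 256
blocks = n // (4 * threads)                           # each thread covers 4 elements
vec_add_ilp_x4[blocks, threads](a, b, c)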
Example 10: cu_matmul_sm
def cu_matmul_sm(A, B, C, n, tpb, bpg):
    # declare shared memory
    sA = cuda.shared.array(shape=block_dim, dtype=float32)
    sB = cuda.shared.array(shape=block_dim, dtype=float32)
    # we now need the thread ID within a block as well as the global thread ID
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x, y = cuda.grid(2)
    # perform partial operations in block-sized tiles,
    # saving intermediate values in an accumulator variable
    acc = 0.0
    for i in range(bpg):
        # Stage 1: prefill shared memory with the current tile of matrix A and matrix B
        sA[tx, ty] = A[x, ty + i * tpb]
        sB[tx, ty] = B[tx + i * tpb, y]
        # Block until shared memory is filled by every thread in the block
        cuda.syncthreads()
        # Stage 2: compute the partial dot product and add it to the accumulator
        if x < n and y < n:
            for j in range(tpb):
                acc += sA[tx, j] * sB[j, ty]
        # Block until all threads have finished the calculation before the next iteration
        cuda.syncthreads()
    # Put the accumulated dot product into the output matrix
    if x < n and y < n:
        C[x, y] = acc
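Note that block_dim must exist as a compile-time constant in the kernel's module when it is jitted (along with float32 imported from numba), and the tile loop implicitly assumes n is an exact multiple of tpb. A possible setup, sketched under those assumptions with illustrative sizes:

import numpy as np
from numba import cuda, float32

tpb = 16
block_dim = (tpb, tpb)                                # shape of the shared-memory tiles

n = 256                                               # matrix size, a multiple of tpb
bpg = n // tpb                                        # blocks per grid in each dimension

A = np.random.rand(n, n).astype(np.float32)
B = np.random.rand(n, n).astype(np.float32)
C = np.zeros((n, n), dtype=np.float32)

cu_matmul_sm[(bpg, bpg), (tpb, tpb)](A, B, C, n, tpb, bpg)
np.testing.assert_allclose(C, A @ B, rtol=1e-3)       # tolerance loosened for float32 accumulation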
Example 11: pruneGPU
def pruneGPU(input_d, num_elements, min_sup):
    tx = cuda.threadIdx.x
    index = cuda.grid(1)
    # Zero out elements whose count falls below the minimum support threshold
    if index < num_elements:
        if input_d[index] < min_sup:
            input_d[index] = 0
Example 12: _gaussian_cuda32
def _gaussian_cuda32(fac, n_rep, t, n_t, a_facGo, b_facGo, c_facGo):
    i, j = cuda.grid(2)
    if i >= n_rep or j >= n_t:
        return
    # Fill in 2D fac data structure
    fac[i, j] = a_facGo[i] * exp(-(t[j] - b_facGo[i])**2 / (2 * c_facGo[i]**2))
Example 13: produce_chId_lit_gpu
def produce_chId_lit_gpu(rid, literal, chunk_id, length):
    i = cuda.grid(1)
    if i < length:
        chunk_id[i] = rid[i] // 31                    # 31-bit chunk that this row id falls into
        literal[i] = literal[i] | 1 << 31             # set the leftmost (flag) bit to 1
        off_set = 30 - rid[i] % 31
        literal[i] = literal[i] | 1 << off_set        # set the bit for this row within its chunk
Example 14: get_reduced
def get_reduced(literal, start_pos, reduced_length, reduced_literal, input_data, chunk_id, reduced_input_data, reduced_chunk_id):
    i = cuda.grid(1)
    if i < reduced_length:
        # OR together every literal that belongs to the i-th group
        for lit in literal[start_pos[i]:start_pos[i+1]]:
            reduced_literal[i] |= lit
        reduced_input_data[i] = input_data[start_pos[i]]
        reduced_chunk_id[i] = chunk_id[start_pos[i]]
Example 15: maxPoly
def maxPoly(x0, coef, tol, nParam, argMax):
    # Thread IDs
    i = cuda.grid(1)
    # The kernel should only execute if i < nParam
    if i >= nParam:
        return
    else:
        # Iterate to convergence
        x = x0
        diff = tol + 1
        while diff > tol:
            # Compute the first derivative
            firstDeriv = 2 * coef[i] * x + 2.3
            # Compute the second derivative
            secondDeriv = 2 * coef[i]
            # Newton step
            xNew = x - firstDeriv / secondDeriv
            # Compute difference for convergence check and update
            diff = abs(xNew - x)
            x = xNew
        # Function output
        argMax[i] = x
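Each thread here runs Newton's method on its own quadratic coef[i]*x**2 + 2.3*x, whose stationary point is -2.3 / (2 * coef[i]); that point is a maximum only when coef[i] is negative. A hedged usage sketch (the decorator and the data are assumptions, not shown in the excerpt):

import numpy as np
from numba import cuda

nParam = 1024
coef = -np.random.rand(nParam) - 0.5                  # strictly negative -> concave parabolas
argMax = np.zeros(nParam)

threads = 128
blocks = (nParam + threads - 1) // threads
maxPoly[blocks, threads](0.0, coef, 1e-8, nParam, argMax)

np.testing.assert_allclose(argMax, -2.3 / (2 * coef))  # matches the analytic maximiser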