当前位置: 首页>>代码示例>>Python>>正文

Python cuda.grid函数代码示例

本文整理汇总了 Python 中 numbapro.cuda.grid 函数的典型用法代码示例。如果您正苦于以下问题：Python grid 函数的具体用法？Python grid 怎么用？Python grid 使用的例子？那么，这里精选的函数代码示例或许可以为您提供帮助。


示例1: compute_sample_kernel

def compute_sample_kernel(factors, longest_wavelet, offsets_per_wavelength, output, num_rows):
    """Accumulate one sample per thread into ``output``.

    Each thread resets its own slot ``output[cuda.grid(1)]``, adds the value
    returned by ``get_value_gpu`` for every row index in ``range(num_rows)``,
    and finally adds ``factors[-1]``.

    Args:
        factors: Indexable of factors; forwarded to ``get_value_gpu`` and its
            last element is added to the result.
        longest_wavelet: Forwarded to ``get_value_gpu``; also used to derive
            ``num_wavelengths = longest_wavelet - 2``.
        offsets_per_wavelength: Forwarded to ``get_value_gpu``.
        output: Array written in place, one slot per thread.
        num_rows: Number of row indices to accumulate over.
    """
    num_wavelengths = longest_wavelet - 2
    slot = cuda.grid(1)

    # Reset the slot that is accumulated into below. Resetting
    # output[cuda.gridDim.x] instead would clear one shared element and leave
    # every thread's own slot holding stale data.
    output[slot] = 0.0
    for row_index in range(num_rows):
        # NOTE(review): cuda.gridDim.x is passed through unchanged; confirm
        # whether get_value_gpu expects the grid width or the thread's index.
        output[slot] += get_value_gpu(factors, row_index, cuda.gridDim.x, longest_wavelet,
                                      num_wavelengths, offsets_per_wavelength)
    output[slot] += factors[-1]

示例2: c_distribute

def c_distribute(rands, low, high):
    """Rescale ``rands`` in place by interpolating between ``low`` and ``high``.

    Each element ``r`` is replaced by ``(1 - r) * low + r * high``, so a value
    of 0 maps to ``low`` and a value of 1 maps to ``high``.

    Args:
        rands: 1-D array updated in place, one element per thread.
        low: Value produced for an input of 0.
        high: Value produced for an input of 1.
    """
    i = cuda.grid(1)

    # Threads past the end of the array have nothing to do.
    if i >= rands.shape[0]:
        return

    rands[i] = (1.0 - rands[i]) * low + rands[i] * high

示例3: const_m

def const_m(out, const):
    """Fill the 2-D array ``out`` with the value ``const``.

    Each thread takes its 2-D global index from ``cuda.grid(2)`` and writes a
    single element; threads whose index lies outside ``out`` do nothing.
    """
    row, col = cuda.grid(2)
    n_rows = out.shape[0]
    n_cols = out.shape[1]

    # Guard clause: skip threads that fall outside the array.
    if row >= n_rows or col >= n_cols:
        return
    out[row, col] = const

示例4: tanh_m

def tanh_m(a, out):
    """Write ``tanh(a[i, j])`` into ``out[i, j]`` for every element of ``out``.

    Each thread handles the element at its ``cuda.grid(2)`` index; threads
    outside the bounds of ``out`` do nothing.
    """
    row, col = cuda.grid(2)
    n_rows = out.shape[0]
    n_cols = out.shape[1]

    # Guard clause: skip threads that fall outside the output array.
    if row >= n_rows or col >= n_cols:
        return
    out[row, col] = tanh(a[row, col])

示例5: abs_m

def abs_m(a, out):
    """Write ``fabs(a[i, j])`` into ``out[i, j]`` for every element of ``out``.

    Each thread handles the element at its ``cuda.grid(2)`` index; threads
    outside the bounds of ``out`` do nothing.
    """
    row, col = cuda.grid(2)
    n_rows = out.shape[0]
    n_cols = out.shape[1]

    # Guard clause: skip threads that fall outside the output array.
    if row >= n_rows or col >= n_cols:
        return
    out[row, col] = fabs(a[row, col])

示例6: log_m

def log_m(a, out):
    """Write ``log(a[i, j])`` into ``out[i, j]`` for every element of ``out``.

    Each thread handles the element at its ``cuda.grid(2)`` index; threads
    outside the bounds of ``out`` do nothing.
    """
    row, col = cuda.grid(2)
    n_rows = out.shape[0]
    n_cols = out.shape[1]

    # Guard clause: skip threads that fall outside the output array.
    if row >= n_rows or col >= n_cols:
        return
    out[row, col] = log(a[row, col])

示例7: exp_m

def exp_m(a, out):
    """Write ``exp(a[i, j])`` into ``out[i, j]`` for every element of ``out``.

    Each thread handles the element at its ``cuda.grid(2)`` index; threads
    outside the bounds of ``out`` do nothing.
    """
    row, col = cuda.grid(2)
    n_rows = out.shape[0]
    n_cols = out.shape[1]

    # Guard clause: skip threads that fall outside the output array.
    if row >= n_rows or col >= n_cols:
        return
    out[row, col] = exp(a[row, col])

示例8: kernel

def kernel(dst, src):
    """A simple kernel that adds 1 to every item.

    Each thread reads ``src[i]`` at its ``cuda.grid(1)`` index and stores
    ``src[i] + 1`` in ``dst[i]``.

    Args:
        dst: 1-D output array; its length bounds the active threads.
        src: 1-D input array, indexed at the same positions as ``dst``.
    """
    i = cuda.grid(1)
    # Threads past the end of dst have nothing to do.
    if i >= dst.shape[0]:
        return
    dst[i] = src[i] + 1

示例9: vec_add_ilp_x4

def vec_add_ilp_x4(a, b, c):
    """Apply ``core`` element-wise to ``a`` and ``b``, four elements per thread.

    A thread with global index ``i`` processes the elements at ``i``,
    ``i + s``, ``i + 2*s`` and ``i + 3*s``, where
    ``s = cuda.gridDim.x * cuda.blockDim.x``. Results go to the same positions
    in ``c``.

    No bounds checks are performed, so all four indices must be valid for
    ``a``, ``b`` and ``c`` for every launched thread.
    """
    # Distance between consecutive elements handled by one thread.
    step = cuda.gridDim.x * cuda.blockDim.x

    # The four positions owned by this thread.
    i0 = cuda.grid(1)
    i1 = i0 + step
    i2 = i1 + step
    i3 = i2 + step

    # Read phase: all loads are issued before any computation (presumably to
    # expose instruction-level parallelism, as the function name suggests).
    a0, b0 = a[i0], b[i0]
    a1, b1 = a[i1], b[i1]
    a2, b2 = a[i2], b[i2]
    a3, b3 = a[i3], b[i3]

    # Compute phase.
    r0 = core(a0, b0)
    r1 = core(a1, b1)
    r2 = core(a2, b2)
    r3 = core(a3, b3)

    # Write phase.
    c[i0] = r0
    c[i1] = r1
    c[i2] = r2
    c[i3] = r3

示例10: cu_matmul_sm

def cu_matmul_sm(A, B, C, n, tpb, bpg):
    """Compute ``C = A @ B`` tile by tile using shared-memory staging.

    Each thread accumulates one element ``C[x, y]``. For each of the ``bpg``
    tiles, the block copies one tile of ``A`` and one of ``B`` into shared
    arrays, synchronizes, multiplies the tiles, and synchronizes again before
    the shared arrays are overwritten.

    Args:
        A: Left input matrix, indexed as ``A[row, col]``.
        B: Right input matrix, indexed as ``B[row, col]``.
        C: Output matrix; only elements with ``x < n`` and ``y < n`` are written.
        n: Bound applied to both output indices.
        tpb: Tile width, used as the inner loop length and the tile offset
            multiplier (presumably threads per block -- confirm with caller).
        bpg: Number of tiles to iterate over (presumably blocks per grid --
            confirm with caller).

    The shared-array shape ``block_dim`` and dtype ``float32`` are taken from
    the enclosing module.
    """
    # Declare shared memory
    sA = cuda.shared.array(shape=block_dim, dtype=float32)
    sB = cuda.shared.array(shape=block_dim, dtype=float32)

    # We need the thread ID within a block as well as the global thread ID
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x, y = cuda.grid(2)

    # Perform partial operations in block-sized tiles,
    # saving intermediate values in an accumulator variable
    acc = 0.0
    for i in range(bpg):
        # Stage 1: Prefill shared memory with the current tile of A and B
        sA[tx, ty] = A[x, ty + i * tpb]
        sB[tx, ty] = B[tx + i * tpb, y]

        # Block calculations until shared memory is filled; without this
        # barrier a thread could read tile entries not yet written.
        cuda.syncthreads()

        # Stage 2: Compute partial dot product and add to accumulator
        if x < n and y < n:
            for j in range(tpb):
                acc += sA[tx, j] * sB[j, ty]

        # Block until all threads have finished computing before the next
        # iteration overwrites the shared tiles. The barrier stays outside the
        # bounds check so every thread in the block reaches it.
        cuda.syncthreads()

    # Put accumulated dot product into output matrix
    if x < n and y < n:
        C[x, y] = acc

示例11: pruneGPU

def pruneGPU(input_d, num_elements, min_sup):
    """Zero every entry of ``input_d`` that is smaller than ``min_sup``.

    Each thread inspects the element at its ``cuda.grid(1)`` index; threads
    with an index of ``num_elements`` or more do nothing.

    Args:
        input_d: 1-D array modified in place.
        num_elements: Number of leading elements of ``input_d`` to examine.
        min_sup: Threshold; values strictly below it are replaced by 0.
    """
    index = cuda.grid(1)

    # Short-circuit `and` keeps the element read inside the valid range.
    if index < num_elements and input_d[index] < min_sup:
        input_d[index] = 0

示例12: _gaussian_cuda32

def _gaussian_cuda32(fac, n_rep, t, n_t, a_facGo, b_facGo, c_facGo):
    """Fill ``fac`` with Gaussian curves evaluated at the points in ``t``.

    For each ``i < n_rep`` and ``j < n_t``::

        fac[i, j] = a_facGo[i] * exp(-(t[j] - b_facGo[i])**2 / (2 * c_facGo[i]**2))

    Args:
        fac: 2-D output array indexed as ``fac[i, j]``.
        n_rep: Number of rows (first index) to fill.
        t: 1-D array of evaluation points, indexed by ``j``.
        n_t: Number of columns (second index) to fill.
        a_facGo: Per-row multiplier of the exponential.
        b_facGo: Per-row value subtracted from ``t[j]``.
        c_facGo: Per-row width term; appears squared in the denominator.
    """
    i, j = cuda.grid(2)
    # Threads outside the (n_rep, n_t) range have nothing to do.
    if i >= n_rep or j >= n_t:
        return

    # Fill in 2D fac data structure
    fac[i, j] = a_facGo[i] * exp(-(t[j] - b_facGo[i])**2 / (2 * c_facGo[i]**2))

示例13: produce_chId_lit_gpu

def produce_chId_lit_gpu(rid, literal, chunk_id, length):
    """Derive a chunk id and set two bits in ``literal`` for each ``rid`` entry.

    For every ``i < length``:

    * ``chunk_id[i]`` becomes ``rid[i] // 31``;
    * bit 31 of ``literal[i]`` is set;
    * bit ``30 - rid[i] % 31`` of ``literal[i]`` is set.

    Args:
        rid: 1-D array of row ids (read only).
        literal: 1-D array updated in place with bitwise OR.
        chunk_id: 1-D output array of chunk ids.
        length: Number of leading elements to process.
    """
    i = cuda.grid(1)
    if i < length:
        # Floor division keeps the chunk id integral on Python 3, where `/`
        # is true division, and matches the `% 31` used for the offset below.
        chunk_id[i] = rid[i] // 31
        # Set bit 31 (the leftmost bit if literal holds 32-bit words --
        # TODO confirm the element width).
        literal[i] = literal[i] | (1 << 31)
        off_set = 30 - rid[i] % 31
        literal[i] = literal[i] | (1 << off_set)

示例14: get_reduced

def get_reduced(literal, start_pos, reduced_length, reduced_literal, input_data, chunk_id, reduced_input_data, reduced_chunk_id):
    """Collapse each segment of ``literal`` into one OR-combined entry.

    Thread ``i`` (for ``i < reduced_length``) ORs every value in
    ``literal[start_pos[i]:start_pos[i + 1]]`` into ``reduced_literal[i]`` and
    copies the segment's first ``input_data`` and ``chunk_id`` entries into
    ``reduced_input_data[i]`` and ``reduced_chunk_id[i]``.

    ``reduced_literal[i]`` is OR-ed into rather than overwritten, so its
    initial contents are part of the result. ``start_pos[i + 1]`` is read for
    every active thread.
    """
    idx = cuda.grid(1)
    # Guard clause: threads beyond the reduced length do nothing.
    if idx >= reduced_length:
        return

    seg_begin = start_pos[idx]
    seg_end = start_pos[idx + 1]
    for word in literal[seg_begin:seg_end]:
        reduced_literal[idx] |= word

    reduced_input_data[idx] = input_data[seg_begin]
    reduced_chunk_id[idx] = chunk_id[seg_begin]

示例15: maxPoly

def maxPoly(x0, coef, tol, nParam, argMax):
    """Run Newton's method per coefficient and store the stationary point.

    For each ``i < nParam`` the iteration starts at ``x0`` and repeats the
    Newton step ``x - f'(x) / f''(x)`` with ``f'(x) = 2*coef[i]*x + 2.3`` and
    ``f''(x) = 2*coef[i]`` until successive iterates differ by at most ``tol``.
    The converged value is written to ``argMax[i]``.

    Args:
        x0: Starting point shared by all threads.
        coef: 1-D array of coefficients, one per thread.
        tol: Convergence tolerance on ``abs(xNew - x)``.
        nParam: Number of coefficients to process.
        argMax: 1-D output array receiving the converged value per coefficient.
    """
    # Thread IDs
    i = cuda.grid(1)
    # The kernel should only execute if i < nParam; out-of-range threads must
    # not index coef or argMax.
    if i < nParam:
        # Iterate to convergence
        x = x0
        diff = tol + 1  # guarantees at least one iteration
        while diff > tol:
            # Compute the first derivative
            firstDeriv = 2*coef[i]*x + 2.3

            # Compute the second derivative
            secondDeriv = 2*coef[i]

            # Newton step
            xNew = x - firstDeriv/secondDeriv

            # Compute difference for convergence check and update
            diff = abs(xNew-x)
            x = xNew

        # Function output
        argMax[i] = x
