This article collects typical usage examples of the numbapro.cuda.to_device function in Python. If you are unsure what to_device does, how to call it, or what real-world uses look like, the curated examples below should help.
The following shows 15 code examples of to_device, sorted by popularity.
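Before the numbered examples, here is a minimal round-trip sketch of how to_device is typically used: copy a host NumPy array to the device, run kernels against the device array, then copy the result back. This is only an illustration under the assumption of a working NumbaPro installation; the array name and sizes are arbitrary, and only calls that also appear in the examples below are used.

import numpy as np
from numbapro import cuda

host_arr = np.arange(16, dtype=np.float32)

d_arr = cuda.to_device(host_arr)       # copy host -> device
# ... launch kernels that read or write d_arr here ...
result = d_arr.copy_to_host()          # copy device -> host into a new array

# to_device also accepts a stream for asynchronous copies:
stream = cuda.stream()
d_async = cuda.to_device(host_arr, stream)
stream.synchronize()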
Example 1: evaluation_function
def evaluation_function(factors, opts):
start = timer()
longest_wavelet, target_samples = opts['longest_wavelet'], opts['target_samples']
window_width = len(target_samples)
full_width = window_width + longest_wavelet
num_wavelengths = longest_wavelet-2
    offsets_per_wavelet = full_width // num_wavelengths
num_rows = offsets_per_wavelet * num_wavelengths
result = np.zeros(window_width, dtype=np.float32)
d_factors = cuda.to_device(factors)
d_result = cuda.to_device(result)
griddim = full_width, 1
blockdim = 4, 1, 1
compute_samples_configured = compute_sample_kernel.configure(griddim, blockdim)
compute_samples_configured(d_factors, longest_wavelet, offsets_per_wavelet, d_result, num_rows)
d_result.to_host()
generated_samples_sum = sum(result)
factors_sum = sum(factors)
difference_from_target = math.fabs(sum(target_samples - result))
    non_zero_factors = list(filter(lambda x: x != 0.0, result))
    fun_value = difference_from_target + 10 * len(non_zero_factors)
print("Value "+str(fun_value)+" generated in " + str((timer() - start)) + " seconds. Sample sum: " +
str(generated_samples_sum)+". Factors sum: "+str(factors_sum))
return fun_value
Example 2: run_GPU
def run_GPU(grid, adjGrid, steps, delay, initDelay, printInd, indSteps):
""" Runs the Command-Line interface for a specified number of steps,
or forever if the number of steps is specified to be -1.
Note that here, grid and adjGrid must be explicitly specified as
opposed to passed in as a Game, to enable everything to be run on the
GPU. Returns the final grid state. """
step = 0
dim = grid.shape
# move arrays to GPU
d_grid = cuda.to_device(grid)
d_adjGrid = cuda.to_device(adjGrid)
blockDim = (32,16)
gridDim = (32,8)
while step < steps or steps == -1:
# print grid
        if printInd != -1 and step % printInd == 0:
# in order to print grid, first need memory back in CPU
d_grid.to_host()
printGrid(grid, step, dim)
# print index
        if indSteps != -1 and step % indSteps == 0:
print("Step = " + str(step))
newGrid = np.zeros_like(grid)
d_newGrid = cuda.to_device(newGrid)
evolve2D_kernel[gridDim, blockDim](d_grid, d_adjGrid, d_newGrid)
d_grid = d_newGrid
grid = newGrid
sleep(delay)
if step == 0:
# allow initial position to be more easily visible
sleep(initDelay)
step += 1
d_grid.to_host()
return grid
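A short, hypothetical driver for run_GPU, shown only to illustrate the call shape. evolve2D_kernel and printGrid are assumed to come from the surrounding module, and build_adjacency is a placeholder helper (not part of the example) for whatever neighbour table that kernel expects.

import numpy as np

grid = np.random.randint(0, 2, size=(64, 64)).astype(np.int32)
adjGrid = build_adjacency(grid.shape)  # hypothetical helper; layout depends on evolve2D_kernel
final = run_GPU(grid, adjGrid, steps=200, delay=0.0, initDelay=0.0,
                printInd=50, indSteps=50)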
Example 3: train
def train(self,ds,epochs,batch_size=10):
for epoch in range(epochs):
start = timer()
count = 0.
correct = 0.
        for i in range(len(ds) // batch_size):
count += 1.
x = encode(ds[i*batch_size][0],gpu=False)
t = encode(ds[i*batch_size][1],gpu=False)
for b in range(batch_size-1):
x = np.concatenate((x,encode(ds[i*batch_size + b+1][0],gpu=False)))
t = np.concatenate((t,encode(ds[i*batch_size + b+1][1],gpu=False)))
x = cuda.to_device(x)
t = cuda.to_device(t)
assert x.shape[1] == self.layers[0]
assert t.shape[1] == self.layers[2]
print(x.shape)
self.forward(x)
print('output',decode(self.output))
if decode(self.output) == decode(t):
correct += 1.
self.backward(t)
print("Epoch",epoch,"Time:",timer()-start,'output',decode(self.output), 'Accuracy:',correct/count)
if correct/count > 0.99:
break
Example 4: fit
def fit(self,X,Budget=None,W=None):
self.X = cuda.to_device(X.astype(np.float64,order='F'))
self.Budget = cuda.device_array((self.budgetSize,self.X.shape[1]),dtype=np.float64,order='F')
self.kx = cuda.device_array((self.budgetSize,self.X.shape[0]),dtype=np.float64,order='F')
self.Wkx = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
self.H = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
if Budget is None:
permutation = np.random.permutation(self.X.shape[0])
self.permutation = cuda.to_device(permutation)
initBudget(self.X,self.permutation,self.Budget)
else:
self.Budget = cuda.to_device(Budget.astype(np.float64,order='F'))
self.calculateKB()
self.calculateKX()
if W is None:
self.initW()
else:
self.W = cuda.to_device(W.astype(np.float64,order='F'))
self.t = 0
    for i in range(self.epochs):
        print("Epoch " + str(i))
samples,features = self.X.shape
permutation = getPermutation(samples,self.miniBatchSize)
self.permutation = cuda.to_device(permutation)
        for j in range((samples + self.miniBatchSize) // self.miniBatchSize):
loadBatch(self.kx,self.permutation,j,self.kxi)
self.nextW()
self.t += 1
self.predictH()
Example 5: getIdx
def getIdx(fill_word,reduced_literal, reduced_length):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word)
bin_length = max(len(bin(reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length
thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block
compact_flag = numpy.ones(thread_num, dtype='int64')
    print(thread_num)
index = numpy.ones(2*reduced_length, dtype='uint32')
d_index = cuda.to_device(index)
d_fill_word = cuda.to_device(fill_word)
d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
d_compact_flag = cuda.to_device(compact_flag)
#print fill_word
getIdx_gpu[1, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
compact_flag = d_compact_flag.copy_to_host()
#print compact_flag[0:28]
useless_array = numpy.zeros(thread_num, dtype='int64')
radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
    scanned_flag = d_compact_flag.copy_to_host()  # copy the scan result back once
    out_index_length = scanned_flag[2*reduced_length-1] + 1
    print(scanned_flag[0:2*reduced_length])
    print(out_index_length)
out_index = numpy.zeros(out_index_length, dtype='uint32')
scatter_index[1,tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
#for i in out_index:
# print bin(i)
return out_index
Example 6: radix_sort
def radix_sort(arr, rid):
length = numpy.int64(len(arr))
bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length
thread_num = numpy.int64(math.pow(2,bin_length))
    block_num = max(thread_num // TPB_MAX, 1)
stream = cuda.stream()
one_list = numpy.zeros(shape=(thread_num), dtype='int64')
zero_list = numpy.zeros(shape=(thread_num), dtype='int64')
iter_num = len(bin(ATTR_CARD_MAX))
for i in range(iter_num):
d_arr = cuda.to_device(arr, stream)
d_rid = cuda.to_device(rid, stream)
d_zero_list = cuda.to_device(zero_list,stream)
d_one_list = cuda.to_device(one_list,stream)
get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list
d_one_list.to_host(stream)
d_zero_list.to_host(stream)
stream.synchronize()
base_reduction_block_num = block_num
base_reduction_block_size = TPB_MAX
tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
d_tmp_out = cuda.to_device(tmp_out, stream)
sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out)
d_tmp_out.to_host(stream)
stream.synchronize()
base = 0 #base for the scan of one_list
        for j in range(base_reduction_block_num):
base += tmp_out[j]
Blelloch_scan_caller(d_zero_list, d_one_list, base)
array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
Example 7: tests
def tests():
a = np.random.rand(300,500)
b = np.random.rand(500,300)
start = timer()
c = np.dot(a,b)
nptime = timer()-start
print('nptime',nptime)
    x = np.array(np.random.rand(1000,1500),dtype='float32',order='F')
    y = np.array(np.random.rand(1500,1000),dtype='float32',order='F')
z = np.zeros((1000,1000),order='F',dtype='float32')
stream = cuda.stream()
dx = cuda.to_device(x)
dy = cuda.to_device(y)
dz = cuda.to_device(z)
start = timer()
    blas.gemm('N','N',1000,1000,1500,1.0,dx,dy,0.0,dz)  # (1000x1500) x (1500x1000) -> (1000x1000)
cutime = timer()-start
print('cutime',cutime)
#dz.copy_to_host(z)
print(dz[0])
c = np.ones((1000,1000),order='F',dtype='float32')
print(c.shape)
dc = cuda.to_device(c)
# blockDim = (256,256)
#gridDim = (((1000 + blockDim[0]-1)/blockDim[0]),((1000 + blockDim[1]-1)/blockDim[1]))
blockDim = (30,30)
    gridDim = ((c.shape[0] + blockDim[0] - 1) // blockDim[0], (c.shape[1] + blockDim[1] - 1) // blockDim[1])
start = timer()
mtanh[gridDim,blockDim,stream](dc)
tantime = timer() - start
print('tantime',tantime)
dc.copy_to_host(c,stream=stream)
stream.synchronize()
print(c)
y = cm.CUDAMatrix(np.ones((1000,1000)))
start = timer()
cm.tanh(y)
cmtan = timer()-start
print('cmtan',cmtan)
x = cm.CUDAMatrix(np.random.rand(1000,1500))
y = cm.CUDAMatrix(np.random.rand(1500,1000))
start = timer()
cm.dot(x,y)
cmtime = timer()-start
print('cmtime',cmtime)
Example 8: test_scan
def test_scan():
in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32)
for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1  # randint(0, 100)
tac1 = time()
in_d = cuda.to_device(in_h)
out_d = cuda.to_device(out_h)
cuda.synchronize()
tac2 = time()
tk1 = time()
for i in range(0, 32):
tk1 = time()
preScan(out_d, in_d, NUM_ELEMENTS)
cuda.synchronize()
tk2 = time()
        print(i, tk2 - tk1)
tk2 = time()
th1 = time()
out_d.copy_to_host(out_h)
cuda.synchronize()
#print "Last = ", out_h[-1] + in_h[-1]
th2 = time()
Example 9: reduce_by_key
def reduce_by_key(input_data, chunk_id, literal, length):#step 3
flag = numpy.ones(length, dtype='int32')
stream = cuda.stream()
d_flag = cuda.to_device(flag, stream)
d_chunk_id = cuda.to_device(chunk_id, stream)
d_literal = cuda.to_device(literal, stream)
produce_flag[1,tpb](input_data, d_chunk_id, length, d_flag)
d_flag.to_host(stream)
    print('flag:')
    print(flag)
stream.synchronize()
is_finish = numpy.zeros(length, dtype='int32')
hop = 1
while hop<32:#only 32 because the length of a word in binary form is 32
reduce_by_key_gpu[1,tpb](d_literal, d_flag, is_finish, hop, length)
hop *= 2
d_literal.to_host(stream)
d_chunk_id.to_host(stream)
stream.synchronize()
reduced_input_data = []
reduced_chunk_id = []
reduced_literal =[]
    for i in range(length):
if flag[i]:
reduced_input_data.append(input_data[i])
reduced_chunk_id.append(chunk_id[i])
reduced_literal.append(literal[i])
return numpy.array(reduced_input_data), numpy.array(reduced_chunk_id), reduced_literal
Example 10: getIdx
def getIdx(fill_word,reduced_literal, reduced_length, head, cardinality):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word)
bin_length = max(len(bin(2*reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length
thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block
compact_flag = numpy.ones(thread_num, dtype='int64')
index = numpy.ones(2*reduced_length, dtype='uint32')
d_index = cuda.to_device(index)
d_fill_word = cuda.to_device(fill_word)
d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
d_compact_flag = cuda.to_device(compact_flag)
block_num = reduced_length/tpb + 1
getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
compact_flag = d_compact_flag.copy_to_host()
useless_array = numpy.zeros(thread_num, dtype='int64')
radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1
out_index = numpy.zeros(out_index_length, dtype='uint32')
offsets = []
new_block_num = 2*reduced_length/tpb + 1
scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
    scanned_flag = d_compact_flag.copy_to_host()  # copy the scanned flags back once, not once per row
    for i in range(reduced_length):
        if head[i]:
            offsets.append(scanned_flag[2*i])
key_length = numpy.zeros(cardinality, dtype='int64')
    for i in range(cardinality-1):
key_length[i] = offsets[i+1] - offsets[i]
key_length[cardinality-1] = out_index_length - offsets[cardinality-1]
return out_index, numpy.array(offsets), numpy.array(key_length)
Example 11: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
n = paths.shape[0]
mm = MM(shape=n, dtype=np.double, prealloc=5)
blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
gridsz = int(math.ceil(float(n) / blksz))
stream = cuda.stream()
prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
# Allocate device side array
d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
c0 = interest - 0.5 * volatility ** 2
c1 = volatility * math.sqrt(dt)
# Configure the kernel
# Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
step_cfg = step[gridsz, blksz, stream]
d_last = cuda.to_device(paths[:, 0], to=mm.get())
for j in range(1, paths.shape[1]):
prng.normal(d_normdist, mean=0, sigma=1)
d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
d_paths.copy_to_host(paths[:, j], stream=stream)
mm.free(d_last, stream=stream)
d_last = d_paths
stream.synchronize()
Example 12: main
def main():
N = 2048 * 2048
# Allocate host memory arrays
a = np.empty(N)
b = np.empty(N)
c = np.empty(N)
# Initialize host memory
a.fill(2)
b.fill(1)
c.fill(0)
# Allocate and copy GPU/device memory
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)
threads_per_block = 128
    number_of_blocks = N // 128 + 1
    saxpy[number_of_blocks, threads_per_block](d_a, d_b, d_c)
d_c.copy_to_host(c)
# Print out the first and last 5 values of c for a quality check
    print(c[:5])
    print(c[-5:])
Example 13: gpumulti
def gpumulti(X,mu):
device = cuda.get_current_device()
n=len(X)
X=np.array(X)
x1 = np.array(X.T[0])
x2 = np.array(X.T[1])
bmk = np.arange(len(x1))
mu = np.array(mu)
dx1 = cuda.to_device(x1)
dx2 = cuda.to_device(x2)
dmu = cuda.to_device(mu)
dbmk = cuda.to_device(bmk)
# Set up enough threads for kernel
tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
cu_worker[bpg,tpb](dx1,dx2,dmu,dbmk)
bestmukey = dbmk.copy_to_host()
return bestmukey
Example 14: main
def main():
NN = 4096
NM = 4096
A = np.zeros((NN, NM), dtype=np.float64)
Anew = np.zeros((NN, NM), dtype=np.float64)
n = NN
m = NM
iter_max = 1000
tol = 1.0e-6
error = 1.0
for j in range(n):
A[j, 0] = 1.0
Anew[j, 0] = 1.0
print "Jacobi relaxation Calculation: %d x %d mesh" % (n, m)
timer = time.time()
iter = 0
blockdim = (tpb, tpb)
    griddim = (NN // blockdim[0], NM // blockdim[1])
error_grid = np.zeros(griddim)
stream = cuda.stream()
dA = cuda.to_device(A, stream) # to device and don't come back
dAnew = cuda.to_device(Anew, stream) # to device and don't come back
derror_grid = cuda.to_device(error_grid, stream)
while error > tol and iter < iter_max:
assert error_grid.dtype == np.float64
jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
derror_grid.to_host(stream)
# error_grid is available on host
stream.synchronize()
error = np.abs(error_grid).max()
# swap dA and dAnew
tmp = dA
dA = dAnew
dAnew = tmp
if iter % 100 == 0:
print "%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time()-timer)
iter += 1
runtime = time.time() - timer
print " total: %f s" % runtime
Example 15: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
n = paths.shape[0]
num_streams = 2
part_width = int(math.ceil(float(n) / num_streams))
partitions = [(0, part_width)]
for i in range(1, num_streams):
begin, end = partitions[i - 1]
begin, end = end, min(end + (end - begin), n)
partitions.append((begin, end))
partlens = [end - begin for begin, end in partitions]
mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)
device = cuda.get_current_device()
blksz = device.MAX_THREADS_PER_BLOCK
gridszlist = [int(math.ceil(float(partlen) / blksz))
for partlen in partlens]
strmlist = [cuda.stream() for _ in range(num_streams)]
prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
for strm in strmlist]
# Allocate device side array
d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
for partlen, strm in zip(partlens, strmlist)]
c0 = interest - 0.5 * volatility ** 2
c1 = volatility * math.sqrt(dt)
# Configure the kernel
# Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
steplist = [cu_step[gridsz, blksz, strm]
for gridsz, strm in zip(gridszlist, strmlist)]
d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
for (s, e), strm in zip(partitions, strmlist)]
    for j in range(1, paths.shape[1]):
for prng, d_norm in zip(prnglist, d_normlist):
prng.normal(d_norm, mean=0, sigma=1)
d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
to=mm.get(stream=strm))
for (s, e), strm in zip(partitions, strmlist)]
for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
d_last, d_paths, d_norm = args
step(d_last, d_paths, dt, c0, c1, d_norm)
for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
d_paths.copy_to_host(paths[s:e, j], stream=strm)
mm.free(d_last, stream=strm)
d_lastlist = d_pathslist
for strm in strmlist:
strm.synchronize()