This article collects typical usage examples of the numbapro.cuda.to_device function in Python. If you are unsure what to_device does, how to call it, or what real-world uses look like, the curated examples below should help.
The following shows 15 code examples of to_device, sorted by popularity.
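Before the numbered examples, here is a minimal round-trip sketch of how to_device is typically used: copy a host NumPy array to the device, run kernels against the device array, then copy the result back. This is only an illustration under the assumption of a working NumbaPro installation; the array name and sizes are arbitrary, and only calls that also appear in the examples below are used.

import numpy as np
from numbapro import cuda

host_arr = np.arange(16, dtype=np.float32)

d_arr = cuda.to_device(host_arr)       # copy host -> device
# ... launch kernels that read or write d_arr here ...
result = d_arr.copy_to_host()          # copy device -> host into a new array

# to_device also accepts a stream for asynchronous copies:
stream = cuda.stream()
d_async = cuda.to_device(host_arr, stream)
stream.synchronize()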
Example 1: evaluation_function
def evaluation_function(factors, opts):
start = timer()
longest_wavelet, target_samples = opts['longest_wavelet'], opts['target_samples']
window_width = len(target_samples)
full_width = window_width + longest_wavelet
num_wavelengths = longest_wavelet-2
    offsets_per_wavelet = full_width // num_wavelengths
num_rows = offsets_per_wavelet * num_wavelengths
result = np.zeros(window_width, dtype=np.float32)
d_factors = cuda.to_device(factors)
d_result = cuda.to_device(result)
griddim = full_width, 1
blockdim = 4, 1, 1
compute_samples_configured = compute_sample_kernel.configure(griddim, blockdim)
compute_samples_configured(d_factors, longest_wavelet, offsets_per_wavelet, d_result, num_rows)
d_result.to_host()
generated_samples_sum = sum(result)
factors_sum = sum(factors)
difference_from_target = math.fabs(sum(target_samples - result))
    non_zero_factors = list(filter(lambda x: x != 0.0, result))
    fun_value = difference_from_target + 10 * len(non_zero_factors)
print("Value "+str(fun_value)+" generated in " + str((timer() - start)) + " seconds. Sample sum: " +
str(generated_samples_sum)+". Factors sum: "+str(factors_sum))
return fun_value
Example 2: run_GPU
def run_GPU(grid, adjGrid, steps, delay, initDelay, printInd, indSteps):
""" Runs the Command-Line interface for a specified number of steps,
or forever if the number of steps is specified to be -1.
Note that here, grid and adjGrid must be explicitly specified as
opposed to passed in as a Game, to enable everything to be run on the
GPU. Returns the final grid state. """
step = 0
dim = grid.shape
# move arrays to GPU
d_grid = cuda.to_device(grid)
d_adjGrid = cuda.to_device(adjGrid)
blockDim = (32,16)
gridDim = (32,8)
while step < steps or steps == -1:
# print grid
        if printInd != -1 and step % printInd == 0:
# in order to print grid, first need memory back in CPU
d_grid.to_host()
printGrid(grid, step, dim)
# print index
        if indSteps != -1 and step % indSteps == 0:
print("Step = " + str(step))
newGrid = np.zeros_like(grid)
d_newGrid = cuda.to_device(newGrid)
evolve2D_kernel[gridDim, blockDim](d_grid, d_adjGrid, d_newGrid)
d_grid = d_newGrid
grid = newGrid
sleep(delay)
if step == 0:
# allow initial position to be more easily visible
sleep(initDelay)
step += 1
d_grid.to_host()
return grid
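A short, hypothetical driver for run_GPU, shown only to illustrate the call shape. evolve2D_kernel and printGrid are assumed to come from the surrounding module, and build_adjacency is a placeholder helper (not part of the example) for whatever neighbour table that kernel expects.

import numpy as np

grid = np.random.randint(0, 2, size=(64, 64)).astype(np.int32)
adjGrid = build_adjacency(grid.shape)  # hypothetical helper; layout depends on evolve2D_kernel
final = run_GPU(grid, adjGrid, steps=200, delay=0.0, initDelay=0.0,
                printInd=50, indSteps=50)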
Example 3: train
def train(self,ds,epochs,batch_size=10):
for epoch in range(epochs):
start = timer()
count = 0.
correct = 0.
        for i in range(len(ds) // batch_size):
count += 1.
x = encode(ds[i*batch_size][0],gpu=False)
t = encode(ds[i*batch_size][1],gpu=False)
for b in range(batch_size-1):
x = np.concatenate((x,encode(ds[i*batch_size + b+1][0],gpu=False)))
t = np.concatenate((t,encode(ds[i*batch_size + b+1][1],gpu=False)))
x = cuda.to_device(x)
t = cuda.to_device(t)
assert x.shape[1] == self.layers[0]
assert t.shape[1] == self.layers[2]
print(x.shape)
self.forward(x)
print('output',decode(self.output))
if decode(self.output) == decode(t):
correct += 1.
self.backward(t)
print("Epoch",epoch,"Time:",timer()-start,'output',decode(self.output), 'Accuracy:',correct/count)
if correct/count > 0.99:
break
Example 4: fit
def fit(self,X,Budget=None,W=None):
self.X = cuda.to_device(X.astype(np.float64,order='F'))
self.Budget = cuda.device_array((self.budgetSize,self.X.shape[1]),dtype=np.float64,order='F')
self.kx = cuda.device_array((self.budgetSize,self.X.shape[0]),dtype=np.float64,order='F')
self.Wkx = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
self.H = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
if Budget is None:
permutation = np.random.permutation(self.X.shape[0])
self.permutation = cuda.to_device(permutation)
initBudget(self.X,self.permutation,self.Budget)
else:
self.Budget = cuda.to_device(Budget.astype(np.float64,order='F'))
self.calculateKB()
self.calculateKX()
if W is None:
self.initW()
else:
self.W = cuda.to_device(W.astype(np.float64,order='F'))
self.t = 0
    for i in range(self.epochs):
        print("Epoch " + str(i))
samples,features = self.X.shape
permutation = getPermutation(samples,self.miniBatchSize)
self.permutation = cuda.to_device(permutation)
        for j in range((samples + self.miniBatchSize) // self.miniBatchSize):
loadBatch(self.kx,self.permutation,j,self.kxi)
self.nextW()
self.t += 1
self.predictH()
Example 5: getIdx
def getIdx(fill_word,reduced_literal, reduced_length):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word)
bin_length = max(len(bin(reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length
thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block
compact_flag = numpy.ones(thread_num, dtype='int64')
    print(thread_num)
index = numpy.ones(2*reduced_length, dtype='uint32')
d_index = cuda.to_device(index)
d_fill_word = cuda.to_device(fill_word)
d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
d_compact_flag = cuda.to_device(compact_flag)
#print fill_word
getIdx_gpu[1, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
compact_flag = d_compact_flag.copy_to_host()
#print compact_flag[0:28]
useless_array = numpy.zeros(thread_num, dtype='int64')
radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
    scanned_flag = d_compact_flag.copy_to_host()  # copy the scan result back once
    out_index_length = scanned_flag[2*reduced_length-1] + 1
    print(scanned_flag[0:2*reduced_length])
    print(out_index_length)
out_index = numpy.zeros(out_index_length, dtype='uint32')
scatter_index[1,tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
#for i in out_index:
# print bin(i)
return out_index
Example 6: radix_sort
def radix_sort(arr, rid):
length = numpy.int64(len(arr))
bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length
thread_num = numpy.int64(math.pow(2,bin_length))
    block_num = max(thread_num // TPB_MAX, 1)
stream = cuda.stream()
one_list = numpy.zeros(shape=(thread_num), dtype='int64')
zero_list = numpy.zeros(shape=(thread_num), dtype='int64')
iter_num = len(bin(ATTR_CARD_MAX))
for i in range(iter_num):
d_arr = cuda.to_device(arr, stream)
d_rid = cuda.to_device(rid, stream)
d_zero_list = cuda.to_device(zero_list,stream)
d_one_list = cuda.to_device(one_list,stream)
get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list
d_one_list.to_host(stream)
d_zero_list.to_host(stream)
stream.synchronize()
base_reduction_block_num = block_num
base_reduction_block_size = TPB_MAX
tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
d_tmp_out = cuda.to_device(tmp_out, stream)
sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out)
d_tmp_out.to_host(stream)
stream.synchronize()
base = 0 #base for the scan of one_list
        for j in range(base_reduction_block_num):
base += tmp_out[j]
Blelloch_scan_caller(d_zero_list, d_one_list, base)
array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
Example 7: tests
def tests():
a = np.random.rand(300,500)
b = np.random.rand(500,300)
start = timer()
c = np.dot(a,b)
nptime = timer()-start
print('nptime',nptime)
    x = np.array(np.random.rand(1000,1500),dtype='float32',order='F')
    y = np.array(np.random.rand(1500,1000),dtype='float32',order='F')
z = np.zeros((1000,1000),order='F',dtype='float32')
stream = cuda.stream()
dx = cuda.to_device(x)
dy = cuda.to_device(y)
dz = cuda.to_device(z)
start = timer()
    blas.gemm('N','N',1000,1000,1500,1.0,dx,dy,0.0,dz)  # (1000x1500) x (1500x1000) -> (1000x1000)
cutime = timer()-start
print('cutime',cutime)
#dz.copy_to_host(z)
print(dz[0])
c = np.ones((1000,1000),order='F',dtype='float32')
print(c.shape)
dc = cuda.to_device(c)
# blockDim = (256,256)
#gridDim = (((1000 + blockDim[0]-1)/blockDim[0]),((1000 + blockDim[1]-1)/blockDim[1]))
blockDim = (30,30)
    gridDim = ((c.shape[0] + blockDim[0] - 1) // blockDim[0], (c.shape[1] + blockDim[1] - 1) // blockDim[1])
start = timer()
mtanh[gridDim,blockDim,stream](dc)
tantime = timer() - start
print('tantime',tantime)
dc.copy_to_host(c,stream=stream)
stream.synchronize()
print(c)
y = cm.CUDAMatrix(np.ones((1000,1000)))
start = timer()
cm.tanh(y)
cmtan = timer()-start
print('cmtan',cmtan)
x = cm.CUDAMatrix(np.random.rand(1000,1500))
y = cm.CUDAMatrix(np.random.rand(1500,1000))
start = timer()
cm.dot(x,y)
cmtime = timer()-start
print('cmtime',cmtime)
Example 8: test_scan
def test_scan():
in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32)
for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1  # randint(0, 100)
tac1 = time()
in_d = cuda.to_device(in_h)
out_d = cuda.to_device(out_h)
cuda.synchronize()
tac2 = time()
tk1 = time()
for i in range(0, 32):
tk1 = time()
preScan(out_d, in_d, NUM_ELEMENTS)
cuda.synchronize()
tk2 = time()
        print(i, tk2 - tk1)
tk2 = time()
th1 = time()
out_d.copy_to_host(out_h)
cuda.synchronize()
#print "Last = ", out_h[-1] + in_h[-1]
th2 = time()
Example 9: reduce_by_key
def reduce_by_key(input_data, chunk_id, literal, length):#step 3
flag = numpy.ones(length, dtype='int32')
stream = cuda.stream()
d_flag = cuda.to_device(flag, stream)
d_chunk_id = cuda.to_device(chunk_id, stream)
d_literal = cuda.to_device(literal, stream)
produce_flag[1,tpb](input_data, d_chunk_id, length, d_flag)
d_flag.to_host(stream)
    print('flag:')
    print(flag)
stream.synchronize()
is_finish = numpy.zeros(length, dtype='int32')
hop = 1
while hop<32:#only 32 because the length of a word in binary form is 32
reduce_by_key_gpu[1,tpb](d_literal, d_flag, is_finish, hop, length)
hop *= 2
d_literal.to_host(stream)
d_chunk_id.to_host(stream)
stream.synchronize()
reduced_input_data = []
reduced_chunk_id = []
reduced_literal =[]
    for i in range(length):
if flag[i]:
reduced_input_data.append(input_data[i])
reduced_chunk_id.append(chunk_id[i])
reduced_literal.append(literal[i])
return numpy.array(reduced_input_data), numpy.array(reduced_chunk_id), reduced_literal
Example 10: getIdx
def getIdx(fill_word,reduced_literal, reduced_length, head, cardinality):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word)
bin_length = max(len(bin(2*reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length
thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block
compact_flag = numpy.ones(thread_num, dtype='int64')
index = numpy.ones(2*reduced_length, dtype='uint32')
d_index = cuda.to_device(index)
d_fill_word = cuda.to_device(fill_word)
d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
d_compact_flag = cuda.to_device(compact_flag)
block_num = reduced_length/tpb + 1
getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
compact_flag = d_compact_flag.copy_to_host()
useless_array = numpy.zeros(thread_num, dtype='int64')
radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1
out_index = numpy.zeros(out_index_length, dtype='uint32')
offsets = []
new_block_num = 2*reduced_length/tpb + 1
scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
    scanned_flag = d_compact_flag.copy_to_host()  # copy the scanned flags back once, not once per row
    for i in range(reduced_length):
        if head[i]:
            offsets.append(scanned_flag[2*i])
key_length = numpy.zeros(cardinality, dtype='int64')
    for i in range(cardinality-1):
key_length[i] = offsets[i+1] - offsets[i]
key_length[cardinality-1] = out_index_length - offsets[cardinality-1]
return out_index, numpy.array(offsets), numpy.array(key_length)
Example 11: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
n = paths.shape[0]
mm = MM(shape=n, dtype=np.double, prealloc=5)
blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
gridsz = int(math.ceil(float(n) / blksz))
stream = cuda.stream()
prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
# Allocate device side array
d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
c0 = interest - 0.5 * volatility ** 2
c1 = volatility * math.sqrt(dt)
# Configure the kernel
# Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
step_cfg = step[gridsz, blksz, stream]
d_last = cuda.to_device(paths[:, 0], to=mm.get())
for j in range(1, paths.shape[1]):
prng.normal(d_normdist, mean=0, sigma=1)
d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
d_paths.copy_to_host(paths[:, j], stream=stream)
mm.free(d_last, stream=stream)
d_last = d_paths
stream.synchronize()
Example 12: main
def main():
N = 2048 * 2048
# Allocate host memory arrays
a = np.empty(N)
b = np.empty(N)
c = np.empty(N)
# Initialize host memory
a.fill(2)
b.fill(1)
c.fill(0)
# Allocate and copy GPU/device memory
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)
threads_per_block = 128
    number_of_blocks = N // 128 + 1
    saxpy[number_of_blocks, threads_per_block](d_a, d_b, d_c)
d_c.copy_to_host(c)
# Print out the first and last 5 values of c for a quality check
    print(c[:5])
    print(c[-5:])
Example 13: gpumulti
def gpumulti(X,mu):
device = cuda.get_current_device()
n=len(X)
X=np.array(X)
x1 = np.array(X.T[0])
x2 = np.array(X.T[1])
bmk = np.arange(len(x1))
mu = np.array(mu)
dx1 = cuda.to_device(x1)
dx2 = cuda.to_device(x2)
dmu = cuda.to_device(mu)
dbmk = cuda.to_device(bmk)
# Set up enough threads for kernel
tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
cu_worker[bpg,tpb](dx1,dx2,dmu,dbmk)
bestmukey = dbmk.copy_to_host()
return bestmukey
Example 14: main
def main():
NN = 4096
NM = 4096
A = np.zeros((NN, NM), dtype=np.float64)
Anew = np.zeros((NN, NM), dtype=np.float64)
n = NN
m = NM
iter_max = 1000
tol = 1.0e-6
error = 1.0
for j in range(n):
A[j, 0] = 1.0
Anew[j, 0] = 1.0
print "Jacobi relaxation Calculation: %d x %d mesh" % (n, m)
timer = time.time()
iter = 0
blockdim = (tpb, tpb)
    griddim = (NN // blockdim[0], NM // blockdim[1])
error_grid = np.zeros(griddim)
stream = cuda.stream()
dA = cuda.to_device(A, stream) # to device and don't come back
dAnew = cuda.to_device(Anew, stream) # to device and don't come back
derror_grid = cuda.to_device(error_grid, stream)
while error > tol and iter < iter_max:
assert error_grid.dtype == np.float64
jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
derror_grid.to_host(stream)
# error_grid is available on host
stream.synchronize()
error = np.abs(error_grid).max()
# swap dA and dAnew
tmp = dA
dA = dAnew
dAnew = tmp
if iter % 100 == 0:
print "%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time()-timer)
iter += 1
runtime = time.time() - timer
print " total: %f s" % runtime
Example 15: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
n = paths.shape[0]
num_streams = 2
part_width = int(math.ceil(float(n) / num_streams))
partitions = [(0, part_width)]
for i in range(1, num_streams):
begin, end = partitions[i - 1]
begin, end = end, min(end + (end - begin), n)
partitions.append((begin, end))
partlens = [end - begin for begin, end in partitions]
mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)
device = cuda.get_current_device()
blksz = device.MAX_THREADS_PER_BLOCK
gridszlist = [int(math.ceil(float(partlen) / blksz))
for partlen in partlens]
strmlist = [cuda.stream() for _ in range(num_streams)]
prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
for strm in strmlist]
# Allocate device side array
d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
for partlen, strm in zip(partlens, strmlist)]
c0 = interest - 0.5 * volatility ** 2
c1 = volatility * math.sqrt(dt)
# Configure the kernel
# Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
steplist = [cu_step[gridsz, blksz, strm]
for gridsz, strm in zip(gridszlist, strmlist)]
d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
for (s, e), strm in zip(partitions, strmlist)]
    for j in range(1, paths.shape[1]):
for prng, d_norm in zip(prnglist, d_normlist):
prng.normal(d_norm, mean=0, sigma=1)
d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
to=mm.get(stream=strm))
for (s, e), strm in zip(partitions, strmlist)]
for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
d_last, d_paths, d_norm = args
step(d_last, d_paths, dt, c0, c1, d_norm)
for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
d_paths.copy_to_host(paths[s:e, j], stream=strm)
mm.free(d_last, stream=strm)
d_lastlist = d_pathslist
for strm in strmlist:
strm.synchronize()