This article collects typical usage examples of the Python function pycuda.gpuarray.to_gpu. If you have been wondering what to_gpu does, how to call it, or what it looks like in real code, the hand-picked examples below should help.
The following shows 15 code examples of the to_gpu function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
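Before the collected examples, here is a minimal, self-contained sketch (not taken from the examples below) of what gpuarray.to_gpu does: it copies a NumPy array from host memory to device memory and returns a GPUArray, which can participate in GPU computations and be copied back to the host with .get().

import numpy as np
import pycuda.autoinit            # creates a CUDA context
from pycuda import gpuarray

a_host = np.arange(8, dtype=np.float32)
a_gpu = gpuarray.to_gpu(a_host)   # host -> device copy
b_gpu = 2 * a_gpu                 # elementwise computation on the GPU
print(b_gpu.get())                # device -> host copy: [ 0.  2.  4. ...  14.]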
Example 1: main
def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)
        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()
        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                                    x.gpudata, y.gpudata, z.gpudata, x.mem_size)
        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
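The snippet above comes from a PyCUDA benchmark and relies on module-level imports that are not shown (it also uses texture references, which require an older CUDA toolkit). A minimal driver, under the assumption that nothing else is needed at module scope, might look like this:

import numpy
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray

if __name__ == "__main__":
    main(numpy.float32)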
Example 2: calculate_circuit_graph_vertex_data_device
def calculate_circuit_graph_vertex_data_device(d_D, d_C, length):
    logger = logging.getLogger('eulercuda.pyeulertour.calculate_circuit_graph_vertex_data_device')
    logger.info("started.")
    mod = SourceModule("""
    __global__ void calculateCircuitGraphVertexData(unsigned int *D, unsigned int *C, unsigned int ecount)
    {
        unsigned int tid = (blockDim.x * blockDim.y * gridDim.x * blockIdx.y)
                         + (blockDim.x * blockDim.y * blockIdx.x)
                         + (blockDim.x * threadIdx.y) + threadIdx.x;
        if (tid < ecount)
        {
            unsigned int c = D[tid];
            atomicExch(C + c, 1);
        }
    }
    """)
    calculate_circuit_graph_vertex_data = mod.get_function('calculateCircuitGraphVertexData')
    block_dim, grid_dim = getOptimalLaunchConfiguration(length, 512)
    logger.info('block_dim = %s, grid_dim = %s' % (block_dim, grid_dim))
    np_d_D = gpuarray.to_gpu(d_D)
    np_d_C = gpuarray.to_gpu(d_C)
    calculate_circuit_graph_vertex_data(
        np_d_D,
        np_d_C,
        np.uintc(length),
        block=block_dim, grid=grid_dim
    )
    np_d_D.get(d_D)
    np_d_C.get(d_C)
    # devdata = pycuda.tools.DeviceData()
    # orec = pycuda.tools.OccupancyRecord(devdata, block_dim[0] * grid_dim[1])
    # logger.info("Occupancy = %s" % (orec.occupancy * 100))
    logger.info("Finished. Leaving.")
    return d_D, d_C
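getOptimalLaunchConfiguration is a helper from the surrounding project and is not shown here. A hypothetical equivalent (the name and the splitting strategy are assumptions, not the original implementation) that covers length threads with up to threads_per_block threads per block could look like this:

import math

def get_launch_configuration(length, threads_per_block=512):
    # One thread per element, split over a 2D grid so that no single grid
    # dimension exceeds the 65535 limit of older devices.
    blocks = max(1, math.ceil(length / threads_per_block))
    grid_x = min(blocks, 65535)
    grid_y = math.ceil(blocks / grid_x)
    return (threads_per_block, 1, 1), (grid_x, grid_y)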
Example 3: test_cublasDgemmBatched
def test_cublasDgemmBatched(self):
    l, m, k, n = 11, 7, 5, 3
    A = np.random.rand(l, m, k).astype(np.float64)
    B = np.random.rand(l, k, n).astype(np.float64)
    C_res = np.einsum('nij,njk->nik', A, B)
    a_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.to_gpu(B)
    c_gpu = gpuarray.empty((l, m, n), np.float64)
    alpha = np.float64(1.0)
    beta = np.float64(0.0)
    a_arr = bptrs(a_gpu)
    b_arr = bptrs(b_gpu)
    c_arr = bptrs(c_gpu)
    cublas.cublasDgemmBatched(self.cublas_handle, 'n', 'n',
                              n, m, k, alpha,
                              b_arr.gpudata, n,
                              a_arr.gpudata, k,
                              beta, c_arr.gpudata, n, l)
    assert np.allclose(C_res, c_gpu.get())
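bptrs is not defined in the snippet; cublasDgemmBatched expects an array of device pointers, one per matrix in the batch. A sketch along the lines of the helper used in scikit-cuda's test suite (assuming the batch is stored contiguously in a single GPUArray, as above):

import ctypes
from pycuda import gpuarray

def bptrs(a):
    """Build a GPU array holding the device pointer of each matrix in the batch."""
    return gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0], a.strides[0],
                           dtype=ctypes.c_void_p)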
Example 4: gpu_sweep_col_mult
def gpu_sweep_col_mult(X, y):
    """ X * y = X across the columns """
    if type(X) == GPUArray:
        gX = X
    else:
        gX = to_gpu(np.asarray(X, dtype=np.float32))
    if type(y) == GPUArray:
        gy = y
    else:
        gy = to_gpu(np.asarray(y, dtype=np.float32))
    dims = np.asarray(X.shape, dtype=np.int32)
    if devinfo.max_block_threads >= 1024:
        blocksize = 32
    else:
        blocksize = 16
    gridsize = int(dims[0] / blocksize) + 1
    shared = 4 * blocksize
    if gX.flags.c_contiguous:
        func = CUDA_Kernels.get_function("sweep_columns_mult")
    else:
        func = CUDA_Kernels.get_function("sweep_columns_mult_cm")
    func(gX, gy, dims[0], dims[1], block=(blocksize, blocksize, 1),
         grid=(gridsize, 1), shared=shared)
    if type(y) != GPUArray:
        X = gX.get()
Example 5: cache_z
def cache_z(self, z):
    x = np.require(z.real, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    y = np.require(z.imag, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    xd = gpuarray.to_gpu(x)
    yd = gpuarray.to_gpu(y)
    cuda.memcpy_dtod(self.xd, xd.ptr, xd.nbytes)
    cuda.memcpy_dtod(self.yd, yd.ptr, yd.nbytes)
Example 6: _init_weights
def _init_weights(self, weight_shape, bias_shape):
    if self.weight is None:
        if self.name == 'noise':
            assert weight_shape[0] == weight_shape[1]
            self.weight = gpuarray.to_gpu(np.eye(weight_shape[0], dtype=np.float32))
        else:
            self.weight = gpuarray.to_gpu(randn(weight_shape, np.float32) * self.initW)
    if self.bias is None:
        if self.initB > 0.0:
            self.bias = gpuarray.to_gpu(np.ones(bias_shape, dtype=np.float32) * self.initB)
        else:
            self.bias = gpuarray.zeros(bias_shape, dtype=np.float32)

    Assert.eq(self.weight.shape, weight_shape)
    Assert.eq(self.bias.shape, bias_shape)

    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)

    if self.momW > 0.0:
        if self.weightIncr is None:
            self.weightIncr = gpuarray.zeros_like(self.weight)
        if self.biasIncr is None:
            self.biasIncr = gpuarray.zeros_like(self.bias)
        Assert.eq(self.weightIncr.shape, weight_shape)
        Assert.eq(self.biasIncr.shape, bias_shape)
Example 7: gpu_sweep_row_div
def gpu_sweep_row_div(X, y):
    """ X / y = X down the rows """
    if type(X) == GPUArray:
        gX = X
    else:
        gX = to_gpu(np.asarray(X, dtype=np.float32))
    if type(y) == GPUArray:
        gy = y
    else:
        gy = to_gpu(np.asarray(y, dtype=np.float32))
    dims = np.asarray(X.shape, dtype=np.int32)
    if devinfo.max_block_threads >= 1024:
        blocksize = 32
    else:
        blocksize = 16
    gridsize = int(dims[0] / blocksize) + 1
    shared = int(4 * dims[1])
    if gX.flags.c_contiguous:
        func = CUDA_Kernels.get_function("sweep_rows_div")
    else:
        func = CUDA_Kernels.get_function("sweep_rows_div_cm")
    func(gX, gy, dims[0], dims[1], block=(blocksize, blocksize, 1),
         grid=(gridsize, 1), shared=shared)
    if type(y) != GPUArray:
        X = gX.get()
Example 8: cuda_dot3
def cuda_dot3(A, b):
    print("cuda_dot3", A.shape, b.shape)
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    # remove b for now
    b_gpu.gpudata.free()
    del b_gpu
    # send A to GPU
    A_gpu = gpuarray.to_gpu(A)
    temp_gpu = linalg.dot(bt_gpu, A_gpu)
    bt_gpu.gpudata.free()
    del bt_gpu
    A_gpu.gpudata.free()
    del A_gpu
    # send b to GPU again
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(temp_gpu, b_gpu)
    temp_gpu.gpudata.free()
    del temp_gpu
    b_gpu.gpudata.free()
    del b_gpu
    # It is theoretically possible to move the result into RAM, force cleanup on
    # the GPU, and then return from RAM, but that is most likely not necessary.
    return c_gpu.get()
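cuda_dot3 evaluates the triple product b.T · A · b with scikit-cuda's linalg module, freeing each intermediate as soon as it is no longer needed to keep GPU memory pressure low. A quick CPU cross-check (assuming skcuda.linalg has been imported as linalg and linalg.init() has been called beforehand):

import numpy as np

A = np.random.rand(64, 64)
b = np.random.rand(64, 8)
assert np.allclose(cuda_dot3(A, b), b.T @ A @ b)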
Example 9: test_set_by_inds_from_inds
def test_set_by_inds_from_inds(self):
    dest_gpu = gpuarray.to_gpu(np.zeros(5, dtype=np.float32))
    ind_dest = gpuarray.to_gpu(np.array([0, 2, 4]))
    src_gpu = gpuarray.to_gpu(np.arange(5, 10, dtype=np.float32))
    ind_src = gpuarray.to_gpu(np.array([2, 3, 4]))
    gpu.set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src)
    assert np.allclose(dest_gpu.get(), np.array([7, 0, 8, 0, 9], dtype=np.float32))
Example 10: set_by_inds
def set_by_inds(self, inds, data):
    """
    Set mapped data by integer indices.

    Parameters
    ----------
    inds : sequence of int
        Integer indices of data elements to update.
    data : numpy.ndarray
        Data to assign.
    """
    assert len(np.shape(inds)) == 1
    assert issubclass(inds.dtype.type, numbers.Integral)
    N = len(inds)
    assert N == len(data)

    if not isinstance(inds, gpuarray.GPUArray):
        inds = gpuarray.to_gpu(inds)
    if not isinstance(data, gpuarray.GPUArray):
        data = gpuarray.to_gpu(data)

    # Allocate data array if it doesn't exist:
    if not self.data:
        self.data = gpuarray.empty(N, data.dtype)
    else:
        assert self.data.dtype == data.dtype
    try:
        func = self.set_by_inds.cache[inds.dtype]
    except KeyError:
        inds_ctype = tools.dtype_to_ctype(inds.dtype)
        v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
            data_ctype=self.data_ctype, inds_ctype=inds_ctype)
        func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]")
        self.set_by_inds.cache[inds.dtype] = func
    func(self.data, inds, data, range=slice(0, N, 1))
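The ElementwiseKernel built above performs a scatter, dest[inds[i]] = src[i] for i in [0, N). A standalone sketch of the same pattern outside the class (the variable names here are illustrative only):

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray, elementwise

dest = gpuarray.to_gpu(np.zeros(10, dtype=np.float32))
inds = gpuarray.to_gpu(np.array([1, 3, 5], dtype=np.int32))
src = gpuarray.to_gpu(np.array([10., 20., 30.], dtype=np.float32))

scatter = elementwise.ElementwiseKernel(
    "float *dest, int *inds, float *src",
    "dest[inds[i]] = src[i]")
scatter(dest, inds, src, range=slice(0, len(inds), 1))
print(dest.get())   # [ 0. 10.  0. 20.  0. 30.  0.  0.  0.  0.]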
Example 11: main
def main():
    import numpy as np
    import pycuda.autoinit
    from pycuda import gpuarray
    from skdata import toy

    from hebel import memory_pool
    from hebel.data_providers import BatchDataProvider
    from hebel.models import NeuralNetRegression
    from hebel.optimizers import SGD
    from hebel.parameter_updaters import SimpleSGDUpdate
    from hebel.monitors import SimpleProgressMonitor
    from hebel.schedulers import exponential_scheduler

    # Get data
    data_cpu, targets_cpu = toy.Boston().regression_task()
    data = gpuarray.to_gpu(data_cpu.astype(np.float32), allocator=memory_pool.allocate)
    targets = gpuarray.to_gpu(targets_cpu.astype(np.float32), allocator=memory_pool.allocate)
    data_provider = BatchDataProvider(data, targets)

    # Create model object
    model = NeuralNetRegression(n_in=data_cpu.shape[1], n_out=targets_cpu.shape[1],
                                layers=[100], activation_function='relu')

    # Create optimizer object
    optimizer = SGD(model, SimpleSGDUpdate, data_provider, data_provider,
                    learning_rate_schedule=exponential_scheduler(.1, .9999),
                    early_stopping=True)
    optimizer.run(3000)
Example 12: cuda_ageSols
def cuda_ageSols(sols):
    """ makes solutions age """
    # get the number of solutions
    num_sols = len(sols)
    # convert to numpy arrays
    sols_arr = numpy.array(sols, numpy.float32)
    ones_arr = numpy.zeros_like(sols, numpy.float32)
    ones_arr[:, constants.AGE_GENE] = 1
    # copy each to the GPU
    sols_gpu = gpuarray.to_gpu(sols_arr)
    mask_gpu = gpuarray.to_gpu(ones_arr)
    # debug
    if debug == True:
        print(mask_gpu.view())
    # apply mask
    aged_sols_gpu = sols_gpu + mask_gpu
    sols = aged_sols_gpu.get().tolist()
Example 13: _initialize_gpu_ds
def _initialize_gpu_ds(self):
    """
    Setup GPU arrays.
    """
    self.synapse_state = garray.zeros(int(self.total_synapses) +
                                      len(self.input_neuron_list), np.float64)

    if self.my_num_gpot_neurons > 0:
        self.V = garray.zeros(int(self.my_num_gpot_neurons), np.float64)
    else:
        self.V = None

    if self.my_num_spike_neurons > 0:
        self.spike_state = garray.zeros(int(self.my_num_spike_neurons), np.int32)

    if len(self.public_gpot_list) > 0:
        self.public_gpot_list_g = garray.to_gpu(self.public_gpot_list)
        self.projection_gpot = garray.zeros(len(self.public_gpot_list), np.double)
        self._extract_gpot = self._extract_projection_gpot_func()
    if len(self.public_spike_list) > 0:
        self.public_spike_list_g = garray.to_gpu(
            (self.public_spike_list - self.spike_shift).astype(np.int32))
        self.projection_spike = garray.zeros(len(self.public_spike_list), np.int32)
        self._extract_spike = self._extract_projection_spike_func()
Example 14: main_no_tex
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)
        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()
        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block,
                                    a.gpudata, x.gpudata,
                                    b.gpudata, y.gpudata,
                                    z.gpudata, x.mem_size)
        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example 15: test_neural_net_regression
def test_neural_net_regression(self):
    for _ in range(20):
        N = 10000   # Number of data points
        D = 100     # Dimensionality of exogenous data
        P = 50      # Dimensionality of endogenous data

        W_true = 10 * np.random.rand(D, P) - 5
        b_true = 100 * np.random.rand(P) - 50
        X = np.random.randn(N, D)
        Y = np.dot(X, W_true) + b_true[np.newaxis, :] + np.random.randn(N, P)

        W_lstsq = np.linalg.lstsq(np.c_[np.ones((N, 1)), X], Y)[0]
        b_lstsq = W_lstsq[0]
        W_lstsq = W_lstsq[1:]

        data_provider = BatchDataProvider(
            gpuarray.to_gpu(X.astype(np.float32), allocator=memory_pool.allocate),
            gpuarray.to_gpu(Y.astype(np.float32), allocator=memory_pool.allocate))

        model = NeuralNetRegression([], n_in=D, n_out=P)
        optimizer = SGD(model, SimpleSGDUpdate,
                        data_provider, data_provider,
                        learning_rate_schedule=constant_scheduler(10.),
                        early_stopping=True)
        optimizer.run(100)

        self.assertLess(np.abs(W_lstsq - model.top_layer.W.get()).max(),
                        1e-5)