本文整理汇总了Python中pycuda.gpuarray.empty函数的典型用法代码示例。如果您正苦于以下问题:Python empty函数的具体用法?Python empty怎么用?Python empty使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了empty函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _minmax_impl
def _minmax_impl(a_gpu, axis, min_or_max, stream=None):
    """Return the max (or min) of a GPU array, plus indices when an axis is given.

    Parameters
    ----------
    a_gpu : GPU array with fewer than three dimensions and a non-complex dtype.
    axis : None, or an axis index. Negative values are shifted by 2 and the
        result must be 0 or 1.
    min_or_max : 'max' selects the maximum; in the ``axis is None`` path any
        other value selects the minimum. Otherwise it is forwarded to
        ``_get_minmax_kernel``.
    stream : forwarded unchanged to the kernel launch.

    Returns
    -------
    With ``axis is None``: the host-side value of ``gpuarray.max``/``min``
    (no index, since PyCUDA has no overall argmax/argmin).
    Otherwise: ``(target, idx)``, two 1-D GPU arrays holding the extreme
    values and their ``uint32`` indices.

    Raises
    ------
    ValueError
        If ``a_gpu`` has a complex dtype.
    """
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None:  # Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        return gpuarray.min(a_gpu).get()

    if axis < 0:
        axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    # For a non-C-contiguous array the two extents are swapped before being
    # handed to the kernels.
    if a_gpu.flags.c_contiguous:
        n, m = a_gpu.shape
    else:
        n, m = a_gpu.shape[1], a_gpu.shape[0]

    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    use_col_kernel = ((axis == 0 and a_gpu.flags.c_contiguous) or
                      (axis == 1 and a_gpu.flags.f_contiguous))
    if use_col_kernel:
        kernel, n_out = col_kernel, m
    else:
        kernel, n_out = row_kernel, n

    # Both outputs get one entry per launched block. The value buffer uses the
    # input's dtype explicitly (the row branch used to pass the array itself).
    target = gpuarray.empty(n_out, dtype=a_gpu.dtype, allocator=alloc)
    idx = gpuarray.empty(n_out, dtype=np.uint32, allocator=alloc)
    kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
           block=(32, 1, 1), grid=(n_out, 1, 1), stream=stream)
    return target, idx
示例2: generate_shifts_2d
def generate_shifts_2d(width, height, n_samples, with_hot=False):
    """Draw random 2-D shifts on the GPU and pack them into a single array.

    Uniform samples from ``gpu_rng`` are scaled by ``width - 0.01`` and
    ``height - 0.01`` and cast to ``uint32`` to give ``x_shifts`` and
    ``y_shifts``.

    Parameters
    ----------
    width, height : extents used to scale the samples.
    n_samples : number of shift pairs to generate.
    with_hot : if true, fill a ``(width * height, n_samples)`` float32 array
        through ``gpu_shift_to_hot_2d``; otherwise fill a ``(2, n_samples)``
        float32 array through ``gpu_vstack`` (y first, then x).

    Returns
    -------
    ``(x_shifts, y_shifts, packed)`` where ``packed`` is the array described
    under ``with_hot``.
    """
    x_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (width - 0.01)
    x_shifts = x_shifts.astype(np.uint32)
    y_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (height - 0.01)
    y_shifts = y_shifts.astype(np.uint32)

    threads_per_block = 32
    # Integer ceiling division: the result does not depend on whether `/`
    # is true or floor division, so the last partial block is always launched.
    n_blocks = int(-(-n_samples // threads_per_block))
    launch = dict(block=(threads_per_block, 1, 1), grid=(n_blocks, 1))

    if with_hot:
        shifts_hot = gp.empty((width * height, n_samples), np.float32)
        # The arrays are float32; dividing the strides by 4 presumably turns
        # byte strides into element strides for the kernel.
        gpu_shift_to_hot_2d(x_shifts, y_shifts, shifts_hot,
                            np.uint32(shifts_hot.strides[0]/4),
                            np.uint32(shifts_hot.strides[1]/4),
                            np.uint32(width), np.uint32(height),
                            np.uint32(n_samples),
                            **launch)
        return x_shifts, y_shifts, shifts_hot

    shifts = gp.empty((2, n_samples), np.float32)
    gpu_vstack(y_shifts, x_shifts, shifts,
               np.uint32(shifts.strides[0]/4), np.uint32(shifts.strides[1]/4),
               np.uint32(n_samples),
               **launch)
    return x_shifts, y_shifts, shifts
示例3: sample_dropout_mask
def sample_dropout_mask(x, dropout_probability=.5, columns=None, stream=None, target=None,
dropout_mask=None, dropout_prob_array=None):
"""Sample a dropout mask and apply it via the 'sample_dropout_mask' kernel.

x must be C-contiguous. If ``columns`` is given it must have two entries;
the kernel then operates on ``extract_columns(x, columns[0], columns[1])``
and ``insert_columns`` is called afterwards with the original array.
``target`` defaults to ``x``, i.e. the operation is in place.
``dropout_mask`` (int8) and ``dropout_prob_array`` (dtype of ``x``) are
allocated here when not supplied.

Returns the ``dropout_mask`` array.

NOTE(review): indentation was lost in this listing, so it is not visible
whether ``sampler.fill_uniform`` runs on every call or only when
``dropout_prob_array`` is allocated here -- confirm against upstream.
"""
assert x.flags.c_contiguous
if columns is not None:
assert len(columns) == 2
# Keep a handle on the full array so the columns can be re-inserted later.
x_tmp = x
x = extract_columns(x, columns[0], columns[1])
shape = x.shape
if dropout_prob_array is None:
dropout_prob_array = gpuarray.empty(shape, x.dtype)
# Fill the probability buffer with uniform samples on the given stream.
sampler.fill_uniform(dropout_prob_array, stream)
if dropout_mask is None:
dropout_mask = gpuarray.empty(shape, np.int8)
# With no explicit target the kernel writes back into x.
if target is None: target = x
all_kernels['sample_dropout_mask'](
x, target, dropout_mask, dropout_prob_array,
np.float32(dropout_probability))
if columns is not None:
# presumably writes the processed columns back into the original array -- verify
insert_columns(x, x_tmp, columns[0])
return dropout_mask
示例4: get
def get(self, V_gpu, xcl_gpu, xcr_gpu, W_gpu, x_gpu, stream=None):
    """Launch the three kernels held in ``self._func`` one after another.

    Two scratch arrays are allocated per call: ``z_gpu`` of shape
    ``(V_d, V_w)`` and ``xc2_gpu`` of length ``2 * w_d``, both with the
    dtype stored in ``self.params``. All launches share ``stream``; a new
    ``cuda.Stream`` is created when none is passed. Nothing is returned;
    the last kernel receives ``x_gpu`` as its final argument.
    """
    if stream is None:
        stream = cuda.Stream()

    cfg = self.params

    # Temporary variables
    z_gpu = gpuarray.empty((cfg['V_d'], cfg['V_w']), cfg['dtype'])
    xc2_gpu = gpuarray.empty(2*cfg['w_d'], cfg['dtype'])

    # Kernel 0: V_w threads along x, default grid.
    self._func[0](xcl_gpu, xcr_gpu, xc2_gpu,
                  block=(cfg['V_w'], 1, 1),
                  stream=stream)

    # Kernel 1: V_w threads along y, V_d blocks along z.
    self._func[1](V_gpu, xc2_gpu, z_gpu,
                  block=(1, cfg['V_w'], 1),
                  grid=(1, 1, cfg['V_d']),
                  stream=stream)

    # Kernel 2: a single block of W_h threads along y.
    self._func[2](W_gpu, xc2_gpu, z_gpu, x_gpu,
                  block=(1, cfg['W_h'], 1),
                  grid=(1, 1, 1),
                  stream=stream)
示例5: enable3d
def enable3d(self):
    """Prepare ray bundles and pixel buffers, then set ``display3d``.

    Two view points are placed at ``self.point`` minus/plus
    ``mesh_diagonal_norm / 60`` along ``axis2`` and a ``GPURays`` bundle is
    built for each. A third, "scope" bundle is built from ``self.point``
    with a quarter of the film width and a square size derived from
    ``self.size[0] // 4``. Pixel and distance buffers are allocated to match.
    """
    offset = (self.mesh_diagonal_norm/60)*self.axis2
    self.point1 = self.point - offset
    self.point2 = self.point + offset

    self.viewing_angle = 0.0

    film_kwargs = dict(axis1=self.axis1, axis2=self.axis2,
                       size=self.size, width=self.film_width)
    pos1, dir1 = from_film(self.point1, **film_kwargs)
    pos2, dir2 = from_film(self.point2, **film_kwargs)

    self.rays1 = gpu.GPURays(pos1, dir1,
                             max_alpha_depth=self.max_alpha_depth)
    self.rays2 = gpu.GPURays(pos2, dir2,
                             max_alpha_depth=self.max_alpha_depth)

    quarter = self.size[0]//4
    scope_pos, scope_dir = from_film(self.point, axis1=self.axis1,
                                     axis2=self.axis2,
                                     size=(quarter, quarter),
                                     width=self.film_width/4.0)
    self.scope_rays = gpu.GPURays(scope_pos, scope_dir)

    # One uint32 pixel per scope ray, and one per screen pixel for each view.
    self.scope_pixels_gpu = ga.empty(self.scope_rays.pos.size, dtype=np.uint32)
    self.pixels1_gpu = ga.empty(self.width*self.height, dtype=np.uint32)
    self.pixels2_gpu = ga.empty(self.width*self.height, dtype=np.uint32)
    self.distances_gpu = ga.empty(self.scope_rays.pos.size,
                                  dtype=np.float32)

    self.display3d = True
示例6: __init__
def __init__(self, res=(640, 480)):
    """Compile the tracing kernels and allocate the image and ray buffers.

    Parameters
    ----------
    res : (width, height) requested resolution. The effective resolution
        (``self.resx``, ``self.resy``) is rounded down to a whole number of
        ``self.block``-sized tiles.
    """
    # Read the CUDA source through a context manager so the handle is closed
    # (the `file` builtin also does not exist on Python 3).
    with open("cpp/trace.cu") as src:
        source = src.read()
    mod = cuda.SourceModule(source, keep=True, options=['-I../cpp'],
                            no_extern_c=True)
    self.InitEyeRays = mod.get_function("InitEyeRays")
    self.InitFishEyeRays = mod.get_function("InitFishEyeRays")
    self.Trace = mod.get_function("Trace")
    self.ShadeSimple = mod.get_function("ShadeSimple")
    self.mod = mod

    self.block = (16, 32, 1)  # 15: 32, 18: 28, 19: 24
    # Floor division keeps the grid dimensions integral regardless of the
    # interpreter's `/` semantics; they feed array shapes below.
    self.grid = (res[0]//self.block[0], res[1]//self.block[1])
    self.resx, self.resy = (self.grid[0]*self.block[0],
                            self.grid[1]*self.block[1])
    self.smallblock = (16, 16, 1)
    self.smallgrid = (res[0]//self.smallblock[0],
                      res[1]//self.smallblock[1])

    # Four bytes per pixel.
    self.d_img = ga.empty((self.resy, self.resx, 4), uint8)

    # Per-ray record; the format string below follows this layout:
    #
    # struct RayData
    # {
    #     float3 dir;
    #     float t;
    #     VoxNodeId endNode;
    #     int endNodeChild;
    #     float endNodeSize;
    # };
    raySize = struct.calcsize("3f f i i f")
    self.d_rays = ga.empty((self.resy, self.resx, raySize), uint8)

    self.setLightPos((0.5, 0.5, 1))
    self.detailCoef = 10.0
示例7: test_cublas_bug
def test_cublas_bug():
'''
Reproduce a failure where every CUDA call made after this SGEMM call
failed. The original author suspected memory corruption caused by
swapping rows.
NOTE: this was confirmed by NVIDIA to be a bug within CUDA, and should be
fixed in CUDA 6.5.
'''
from pycuda.driver import Stream
from skcuda.cublas import cublasSgemm
from skcuda.misc import _global_cublas_handle as handle
n = 131
# Select rows 128..130 of the arrays sliced below.
s = slice(128, n)
X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32))
a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
b = gpuarray.empty_like(X)
# GEMM dimensions; note that n is rebound here to the column count of b[s].
m, n = a.shape[0], b[s].shape[1]
k = a.shape[1]
lda = m
ldb = k
ldc = m
#cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
# The call under test: the first operand is the sliced view b[s].
cublasSgemm(handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
#print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc
#gpuarray.dot(d, Xoutd[s])
#op.sgemm(a, b[s], c)
# A follow-up CUDA call; presumably this is where the failure surfaced -- confirm.
stream = Stream()
stream.synchronize()
示例8: test_cublasSgetriBatched
def test_cublasSgetriBatched(self):
    """Compare batched getrf + getri on the GPU with ``numpy.linalg.inv``.

    A batch of 11 random 7x7 float32 matrices is factorized with
    ``cublasSgetrfBatched`` and inverted with ``cublasSgetriBatched``; the
    result is checked against per-matrix inverses computed on the host.
    """
    n_batch, m = 11, 7
    np.random.seed(1)
    A = np.random.rand(n_batch, m, m).astype(np.float32)
    a_gpu = gpuarray.to_gpu(A)
    a_arr = bptrs(a_gpu)
    c_gpu = gpuarray.empty((n_batch, m, m), np.float32)
    c_arr = bptrs(c_gpu)
    p_gpu = gpuarray.empty((n_batch, m), np.int32)
    i_gpu = gpuarray.zeros(n_batch, np.int32)

    cublas.cublasSgetrfBatched(self.cublas_handle,
                               m, a_arr.gpudata, m, p_gpu.gpudata,
                               i_gpu.gpudata, n_batch)
    cublas.cublasSgetriBatched(self.cublas_handle,
                               m, a_arr.gpudata, m, p_gpu.gpudata,
                               c_arr.gpudata, m,
                               i_gpu.gpudata, n_batch)

    # Build the reference from a real list: on Python 3 `map` returns an
    # iterator, and np.array(<map object>) would be a 0-d object array.
    X = np.array([np.linalg.inv(mat) for mat in A])
    X_ = c_gpu.get()
    # NOTE(review): the third positional argument of np.allclose is `rtol`,
    # so the original `6` was a relative tolerance of 6, not six decimals.
    # It is kept (now explicit) so the test's pass/fail behaviour does not
    # change; tighten it once a suitable float32 tolerance has been measured.
    assert np.allclose(X, X_, rtol=6)
示例9: _create_halo_arrays
def _create_halo_arrays(self):
    """Allocate the send and receive halo buffers for all six faces.

    For every face ``<face>`` in left/right, bottom/top and back/front, two
    float64 GPU arrays are stored on ``self`` as ``<face>_recv_halo`` and
    ``<face>_send_halo``. Shapes, with ``nz, ny, nx = self.local_dims`` and
    ``sw = self.stencil_width``:

    - left / right:  ``[nz, ny, sw]``
    - bottom / top:  ``[nz, sw, nx]``
    - back / front:  ``[sw, ny, nx]``

    The buffers are uninitialized.
    """
    nz, ny, nx = self.local_dims
    sw = self.stencil_width

    face_shapes = (
        (('left', 'right'), [nz, ny, sw]),
        (('bottom', 'top'), [nz, sw, nx]),
        (('back', 'front'), [sw, ny, nx]),
    )
    for faces, shape in face_shapes:
        for face in faces:
            for role in ('recv', 'send'):
                # Allocate each buffer directly; copying an uninitialized
                # array would only add a pointless device-to-device copy.
                setattr(self, '%s_%s_halo' % (face, role),
                        gpuarray.empty(shape, dtype=np.float64))
示例10: gradient_gpu
def gradient_gpu(y_gpu, mode='valid'):
    """Run the gradient kernel on a 2-D GPU array and return both outputs.

    Parameters
    ----------
    y_gpu : 2-D GPU array; its ``shape`` and ``dtype`` size the launch and
        the outputs.
    mode : 'valid' or 'same'. With 'valid' the outputs have shapes
        ``(rows, cols - 1)`` and ``(rows - 1, cols)``; with 'same' both have
        the input's shape.

    Returns
    -------
    ``(gradx_gpu, grady_gpu)``

    Raises
    ------
    ValueError
        If ``mode`` is neither 'valid' nor 'same'. This is checked before any
        GPU work, instead of failing later on an unbound local.
    """
    if mode not in ('valid', 'same'):
        raise ValueError("mode must be 'valid' or 'same', got %r" % (mode,))

    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    rows, cols = y_gpu.shape[0], y_gpu.shape[1]

    block_size = (16, 16, 1)
    # Enough 16x16 blocks to cover the columns (x) and the rows (y).
    grid_size = (int(np.ceil(float(shape[1])/block_size[0])),
                 int(np.ceil(float(shape[0])/block_size[1])))
    # Shared memory for a (block + 1) x (block + 1) tile of elements.
    shared_size = int((1+block_size[0])*(1+block_size[1])*dtype.itemsize)

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    # The kernel handle gets its own name so it does not shadow this function.
    if mode == 'valid':
        kernel = mod.get_function("gradient_valid")
        gradx_gpu = cua.empty((rows, cols-1), dtype)
        grady_gpu = cua.empty((rows-1, cols), dtype)
    else:
        kernel = mod.get_function("gradient_same")
        gradx_gpu = cua.empty((rows, cols), dtype)
        grady_gpu = cua.empty((rows, cols), dtype)

    kernel(gradx_gpu.gpudata, grady_gpu.gpudata, y_gpu.gpudata,
           block=block_size, grid=grid_size, shared=shared_size)

    return (gradx_gpu, grady_gpu)
示例11: initializeGpuMemory
def initializeGpuMemory(self):
    """Allocate the GPU arrays stored under the "impulse_model" keys.

    ``K`` is read from ``self.modelParams["proc_id_model", "K"]``. Two
    ``(K, K)`` arrays (int32 ``nnz_Z`` and float32 ``g_suff_stats``) are
    created, plus ``GS`` with the same layout as ``self.base.dSS["dS"]``.
    """
    K = self.modelParams["proc_id_model","K"]
    pair_shape = (K, K)
    ptrs = self.gpuPtrs

    # Sufficient statistics for the parameters of G kernels
    ptrs["impulse_model","nnz_Z"] = gpuarray.empty(pair_shape, dtype=np.int32)
    ptrs["impulse_model","g_suff_stats"] = gpuarray.empty(pair_shape, dtype=np.float32)

    ptrs["impulse_model","GS"] = gpuarray.empty_like(self.base.dSS["dS"])
示例12: _init_comm_bufs
def _init_comm_bufs(self):
    """
    Buffers for sending/receiving data from other modules.

    For each direction (in/out) three dictionaries are created, keyed first
    by port type ('gpot' or 'spike') and then by module id: the GPU buffer,
    its ``bufint`` interface and its MPI datatype.

    Notes
    -----
    Must be executed after `_init_port_dicts()`.
    """
    port_types = ('gpot', 'spike')

    # Buffers (and their interfaces and MPI types) for receiving data
    # transmitted from source modules:
    self._in_buf = {t: {} for t in port_types}
    self._in_buf_int = {t: {} for t in port_types}
    self._in_buf_mtype = {t: {} for t in port_types}
    for in_id in self._in_ids:
        for t in port_types:
            # One element per port, in the dtype of the matching port mapper.
            buf = gpuarray.empty(len(self._in_port_dict_ids[t][in_id]),
                                 self.pm[t].dtype)
            self._in_buf[t][in_id] = buf
            self._in_buf_int[t][in_id] = bufint(buf)
            self._in_buf_mtype[t][in_id] = dtype_to_mpi(buf.dtype)

    # Buffers (and their interfaces and MPI types) for transmitting data to
    # destination modules:
    self._out_buf = {t: {} for t in port_types}
    self._out_buf_int = {t: {} for t in port_types}
    self._out_buf_mtype = {t: {} for t in port_types}
    for out_id in self._out_ids:
        for t in port_types:
            buf = gpuarray.empty(len(self._out_port_dict_ids[t][out_id]),
                                 self.pm[t].dtype)
            self._out_buf[t][out_id] = buf
            self._out_buf_int[t][out_id] = bufint(buf)
            self._out_buf_mtype[t][out_id] = dtype_to_mpi(buf.dtype)
示例13: getFields
def getFields(self, x, y):
    """Evaluate every field on an ``x`` by ``y`` grid.

    Parameters
    ----------
    x, y : integer extents of the output grid.

    Returns
    -------
    ``(outX, outY)``: two float32 GPU arrays of shape
    ``(self.Nfields, x, y)``, filled slice by slice by the prepared kernels
    in ``self.resampleF``.
    """
    outX = gpuarray.empty((self.Nfields, x, y), np.float32)
    outY = gpuarray.empty((self.Nfields, x, y), np.float32)

    # Integer ceiling division, so the launch covers the whole grid even when
    # `/` is floor division (where ceil(x/32) would give 0 blocks for x < 32).
    grid = (int(-(-x // 32)), int(-(-y // 32)))
    # Spread the extent evenly over the blocks, rounding up.
    block = (int(-(-x // grid[0])), int(-(-y // grid[1])), 1)

    for i in range(self.Nfields):
        self.resampleF[i].prepared_call(grid, block,
                                        outX[i, :, :].gpudata,
                                        outY[i, :, :].gpudata,
                                        np.int32(x), np.int32(y))
    return outX, outY
示例14: show_values
def show_values(matrix_size, threads_per_block):
    """Run the "markThreadID" kernel and print the five arrays it fills.

    A random ``matrix_size`` x ``matrix_size`` float32 matrix is uploaded and
    five output arrays of the same shape are allocated. The launch uses
    ``threads_per_block`` threads along x and enough blocks along x to cover
    one row, with one grid row per matrix row. The launch configuration and
    each output (copied back to the host) are printed.
    """
    a_cpu = np.random.randn(matrix_size, matrix_size).astype(np.float32)

    # transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu)

    # Labels in print order; one output buffer is allocated per label.
    labels = ("id_blocks_x_cpu", "id_blocks_y_cpu",
              "id_threads_x_cpu", "id_threads_y_cpu", "id_cell_cpu")
    square = (matrix_size, matrix_size)
    outputs = [gpuarray.empty(square, np.float32) for _ in labels]

    blocks = (threads_per_block, 1, 1)
    # Round the block count up when the size is not a multiple of the block.
    blocks_per_side = int(matrix_size / threads_per_block)
    if (blocks_per_side * threads_per_block) < matrix_size:
        blocks_per_side += 1
    grid = (blocks_per_side, matrix_size, 1)
    print("Blocks: ", blocks)
    print("Grid: ", grid)

    kernel_code = kernel_source_code % {'MATRIX_SIZE': matrix_size, 'BLOCK_SIZE': threads_per_block}
    kernel = compiler.SourceModule(kernel_code).get_function("markThreadID")

    # Input first, then the five outputs.
    kernel(a_gpu, *outputs, block=blocks, grid=grid)

    # Copy everything back before printing anything.
    host_copies = [arr.get() for arr in outputs]
    for label, values in zip(labels, host_copies):
        print(label)
        print(values)
示例15: initializeGpuMemory
def initializeGpuMemory(self):
    """
    Allocate GPU memory for the base model parameters.

    Creates two 1-D int32 arrays under the "proc_id_model" keys: "C" with
    ``self.base.data.N`` entries and "Ns" with ``self.base.data.K`` entries.
    """
    data = self.base.data
    lengths = (("C", data.N), ("Ns", data.K))
    for key, length in lengths:
        self.gpuPtrs["proc_id_model", key] = gpuarray.empty((length,), dtype=np.int32)