This article collects typical usage examples of the Python method pycuda.gpuarray.empty. If you are wondering how exactly gpuarray.empty is used and what it is good for, the curated code samples below may help. You can also explore further usage examples of the pycuda.gpuarray module it belongs to.
Below you will find 13 code examples of gpuarray.empty, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
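Before the examples, here is a minimal standalone sketch of what gpuarray.empty does: it allocates an uninitialized buffer on the GPU with a given shape and dtype. This sketch is not taken from the examples below and assumes a working CUDA installation:

# Minimal sketch: allocate an uninitialized device buffer, fill it on the GPU, copy it back.
import numpy as np
import pycuda.autoinit  # creates a CUDA context on the default device
from pycuda import gpuarray

a_gpu = gpuarray.empty((4, 4), dtype=np.float32)  # contents are undefined until written
a_gpu.fill(1.0)                                   # write on the device
print(a_gpu.get())                                # copy back to the host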
Example 1: interpNearest

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def interpNearest(ary, ny, nx):
    """Used to interpolate the mask for each stage."""
    if ary.shape == (ny, nx):
        return ary
    y, x = ary.shape
    rx = x / nx
    ry = y / ny
    out = np.empty((ny, nx), dtype=np.float32)
    for j in range(ny):
        for i in range(nx):
            out[j, i] = ary[int(ry * j + .5), int(rx * i + .5)]
    return out
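A small usage sketch of interpNearest: it nearest-neighbour resamples a mask to the requested size. The array values below are made up purely for illustration:

import numpy as np

mask = np.arange(24, dtype=np.float32).reshape(4, 6)  # a tiny 4x6 "mask"
small = interpNearest(mask, 2, 3)                     # nearest-neighbour resample to 2x3
print(small.shape)                                    # (2, 3)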
Example 2: gpu_mandelbrot

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
    # we set up our complex lattice as such
    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
    imag_vals = np.matrix(np.linspace(imag_high, imag_low, height), dtype=np.complex64) * 1j
    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)

    # copy complex lattice to the GPU
    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)

    # synchronize in current context
    pycuda.autoinit.context.synchronize()

    # allocate an empty array on the GPU
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)

    mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))
    pycuda.autoinit.context.synchronize()

    mandelbrot_graph = mandelbrot_graph_gpu.get_async()
    pycuda.autoinit.context.synchronize()

    return mandelbrot_graph

Developer: PacktPublishing | Project: Hands-On-GPU-Programming-with-Python-and-CUDA | Lines of code: 27 | Source file: gpu_mandelbrot_context_sync.py
Example 3: gpu_mandelbrot

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
    # we set up our complex lattice as such
    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
    imag_vals = np.matrix(np.linspace(imag_high, imag_low, height), dtype=np.complex64) * 1j
    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)

    # copy complex lattice to the GPU
    mandelbrot_lattice_gpu = gpuarray.to_gpu(mandelbrot_lattice)

    # allocate an empty array on the GPU
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)

    mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))

    mandelbrot_graph = mandelbrot_graph_gpu.get()

    return mandelbrot_graph

Developer: PacktPublishing | Project: Hands-On-GPU-Programming-with-Python-and-CUDA | Lines of code: 20 | Source file: gpu_mandelbrot0.py
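Both Mandelbrot examples call a mandel_ker kernel that is not shown in the excerpts. The sketch below is one plausible ElementwiseKernel with the matching call signature; the kernel body and escape test are assumptions, not necessarily the authors' exact code:

# Hypothetical kernel matching the call
# mandel_ker(lattice_gpu, graph_gpu, np.int32(max_iters), np.float32(upper_bound)).
from pycuda.elementwise import ElementwiseKernel

mandel_ker = ElementwiseKernel(
    "pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
    """
    mandelbrot_graph[i] = 1;
    pycuda::complex<float> c = lattice[i];
    pycuda::complex<float> z(0, 0);
    for (int j = 0; j < max_iters; j++) {
        z = z * z + c;
        if (abs(z) > upper_bound) {
            mandelbrot_graph[i] = 0;  // point escaped, mark it as outside the set
            break;
        }
    }
    """,
    "mandel_ker")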
Example 4: scikit_gpu_fft_pipeline

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def scikit_gpu_fft_pipeline(filename):
    data = []
    start = timer()
    with open(filename, 'r') as file_obj:
        for _ in range(((32768 * 1024 * SIZE_MULTIPLIER // GULP_SIZE) // COMPLEX_MULTIPLIER) // GULP_FRAME_FFT):
            data = np.fromfile(file_obj, dtype=np.complex64,
                               count=GULP_SIZE * GULP_FRAME_FFT).reshape((GULP_FRAME_FFT, GULP_SIZE))
            g_data = gpuarray.to_gpu(data)
            plan = Plan(data.shape[1], np.complex64, np.complex64, batch=GULP_FRAME_FFT)
            plan_inverse = Plan(data.shape[1], np.complex64, np.complex64, batch=GULP_FRAME_FFT)
            tmp1 = gpuarray.empty(data.shape, dtype=np.complex64)
            tmp2 = gpuarray.empty(data.shape, dtype=np.complex64)
            fft(g_data, tmp1, plan)
            ifft(tmp1, tmp2, plan_inverse)
            for _ in range(NUMBER_FFT - 1):
                # Can't do FFT in place for fairness (emulating full pipeline)
                tmp1 = gpuarray.empty(data.shape, dtype=np.complex64)
                fft(tmp2, tmp1, plan)
                tmp2 = gpuarray.empty(data.shape, dtype=np.complex64)
                ifft(tmp1, tmp2, plan_inverse)
    end = timer()
    return end - start
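The excerpt above relies on names defined elsewhere in its benchmark script. Below is a sketch of imports and constants that would make it runnable. Plan, fft and ifft do come from scikit-cuda's skcuda.fft module, but the constant values are assumptions chosen only for illustration:

from timeit import default_timer as timer
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from skcuda.fft import Plan, fft, ifft

GULP_SIZE = 32768          # samples per frame (assumed value)
GULP_FRAME_FFT = 128       # frames batched into one FFT plan (assumed value)
SIZE_MULTIPLIER = 1        # scales the total amount of input read (assumed value)
COMPLEX_MULTIPLIER = 2     # two floats per complex64 sample (assumed value)
NUMBER_FFT = 4             # number of FFT/IFFT round trips per chunk (assumed value)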
Example 5: call_prepare

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def call_prepare(self, sz, allocator):
    MAX_BLOCK_COUNT = 1024
    SMALL_SEQ_COUNT = 4

    if sz <= self.block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT:
        total_block_size = SMALL_SEQ_COUNT * self.block_size
        block_count = (sz + total_block_size - 1) // total_block_size
        seq_count = SMALL_SEQ_COUNT
    else:
        block_count = MAX_BLOCK_COUNT
        macroblock_size = block_count * self.block_size
        seq_count = (sz + macroblock_size - 1) // macroblock_size

    if block_count == 1:
        result = empty((), self.dtype_out, allocator)
    else:
        result = empty((block_count,), self.dtype_out, allocator)

    grid_size = (block_count, 1)
    block_size = (self.block_size, 1, 1)

    return result, block_count, seq_count, grid_size, block_size
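The sizing logic above caps a reduction launch at 1024 blocks; once the input outgrows that cap, seq_count grows instead, so each block sweeps its share of the input in several sequential passes. A standalone reproduction of just that arithmetic (block_size=256 is an assumed value; the real one is chosen per device):

MAX_BLOCK_COUNT = 1024
SMALL_SEQ_COUNT = 4
block_size = 256  # assumed for illustration

def reduction_sizes(sz):
    # Same arithmetic as call_prepare, with self.block_size replaced by a constant.
    if sz <= block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT:
        total_block_size = SMALL_SEQ_COUNT * block_size
        block_count = (sz + total_block_size - 1) // total_block_size
        seq_count = SMALL_SEQ_COUNT
    else:
        block_count = MAX_BLOCK_COUNT
        macroblock_size = block_count * block_size
        seq_count = (sz + macroblock_size - 1) // macroblock_size
    return block_count, seq_count

for sz in (1_000, 1_000_000, 100_000_000):
    print(sz, reduction_sizes(sz))  # -> (block_count, seq_count)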
Example 6: resampleD

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def resampleD(self, newY, newX):
    """Resamples tex_d and returns it in a gpuarray"""
    if (self.rX, self.rY) != (np.int32(newX), np.int32(newY)):
        self.rGrid = (int(ceil(newX / 32)), int(ceil(newY / 32)))
        self.rBlock = (int(ceil(newX / self.rGrid[0])),
                       int(ceil(newY / self.rGrid[1])), 1)
        self.rX, self.rY = np.int32(newX), np.int32(newY)
        self.devROut = gpuarray.empty((newY, newX), np.float32)
    self.debug(3, "Resampling img_d texture to", (newY, newX),
               " grid:", self.rGrid, "block:", self.rBlock)
    self._resampleKrnl.prepared_call(self.rGrid, self.rBlock,
                                     self.devROut.gpudata,
                                     self.rX, self.rY)
    return self.devROut
Example 7: getFields

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def getFields(self, y=None, x=None):
    """Returns the fields, resampled to size (y, x)"""
    if x is None or y is None:
        y = self.h[0]
        x = self.w[0]
    outX = gpuarray.empty((self.Nfields, y, x), np.float32)
    outY = gpuarray.empty((self.Nfields, y, x), np.float32)
    grid = (int(ceil(x / 32)), int(ceil(y / 32)))
    block = (int(ceil(x / grid[0])), int(ceil(y / grid[1])), 1)
    for i in range(self.Nfields):
        self.resampleF[i].prepared_call(grid, block,
                                        outX[i, :, :].gpudata,
                                        outY[i, :, :].gpudata,
                                        np.int32(x), np.int32(y))
    return outX, outY
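Examples 6 and 7 size their kernel launches with the same pattern: roughly 32 blocks per axis, with each block sized to cover what remains. A standalone check of that arithmetic, using an assumed 640x480 image (no GPU required):

from math import ceil

def launch_config(y, x):
    # Same grid/block arithmetic as resampleD and getFields.
    grid = (int(ceil(x / 32)), int(ceil(y / 32)))
    block = (int(ceil(x / grid[0])), int(ceil(y / grid[1])), 1)
    return grid, block

print(launch_config(480, 640))  # ((20, 15), (32, 32, 1)): 20*32 x 15*32 threads cover 640x480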
Example 8: eval_

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def eval_(self, x, y=None, batch_size=None, stream=None, delta=None, w_t=None, b_t=None):
    if stream is None:
        stream = self.stream

    if type(x) != pycuda.gpuarray.GPUArray:
        x = gpuarray.to_gpu_async(np.array(x, dtype=np.float32), stream=self.stream)

    if batch_size is None:
        if len(x.shape) == 2:
            batch_size = np.int32(x.shape[0])
        else:
            batch_size = np.int32(1)

    if delta is None:
        delta = self.delta
    delta = np.float32(delta)

    if w_t is None:
        w_t = np.int32(-1)

    if b_t is None:
        b_t = np.int32(-1)

    if y is None:
        if batch_size == 1:
            y = gpuarray.empty((self.num_outputs,), dtype=np.float32)
        else:
            y = gpuarray.empty((batch_size, self.num_outputs), dtype=np.float32)

    eval_ker(self.num_outputs, self.num_inputs, self.relu, self.sigmoid,
             self.weights, self.b, x, y, np.int32(batch_size), w_t, b_t,
             delta, block=self.block, grid=self.grid, stream=stream)

    return y
Developer: PacktPublishing | Project: Hands-On-GPU-Programming-with-Python-and-CUDA | Lines of code: 43 | Source file: deep_neural_network.py
Example 9: definite_integral

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def definite_integral(self, lo=None, hi=None, samples_per_thread=None, num_blocks=None):
    if lo is None or hi is None:
        lo = self.lo
        hi = self.hi
    if samples_per_thread is None:
        samples_per_thread = self.samples_per_thread
    if num_blocks is None:
        num_blocks = self.num_blocks
        grid = (num_blocks, 1, 1)
    else:
        grid = (num_blocks, 1, 1)
    block = (32, 1, 1)
    num_threads = 32 * num_blocks

    self.ys = gpuarray.empty((num_threads,), dtype=self.numpy_precision)

    self.f(np.int32(samples_per_thread), self.numpy_precision(lo), self.numpy_precision(hi),
           self.ys, block=block, grid=grid)

    self.nintegral = np.sum(self.ys.get())

    return np.sum(self.nintegral)

Developer: PacktPublishing | Project: Hands-On-GPU-Programming-with-Python-and-CUDA | Lines of code: 28 | Source file: monte_carlo_integrator.py
Example 10: device_buffer

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def device_buffer(self):
    """Return the device buffer.

    Returns
    -------
    pycuda.gpuarray.GPUArray
        The pycuda array taking the data.
    """
    if self._device_buf is None:
        self._device_buf = gpuarray.empty(self._shape, self._dtype)
    return self._device_buf
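Example 10 allocates the GPU buffer lazily, on first access. A self-contained sketch of that pattern; the class name and attributes below are made up for illustration and are not the original project's API:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

class LazyDeviceBuffer:
    """Illustrative holder that only allocates its GPU buffer when first asked for it."""
    def __init__(self, shape, dtype=np.float32):
        self._shape = shape
        self._dtype = dtype
        self._device_buf = None

    @property
    def device_buffer(self):
        # Allocate on first access so objects that never touch the GPU cost nothing.
        if self._device_buf is None:
            self._device_buf = gpuarray.empty(self._shape, self._dtype)
        return self._device_buf

buf = LazyDeviceBuffer((1024,))
print(buf.device_buffer.shape)  # first access triggers the allocation -> (1024,)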
Example 11: __init__

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def __init__(self, transformer, op):
    super(LUTBpropKernel, self).__init__(transformer)
    self.op = op

    # Hard coding for now, non-deterministic is faster but difficult to reproduce
    # or debug. Deterministic kernels are fast enough and LUT layer tends to be
    # a small percentage of execution time.
    self.deterministic = True

    (self.E, self.I) = (_ for _ in op.call_info())
    self.O = op.tensor_description()

    pad_idx = op.pad_idx
    lut_axis = op.lut_axis
    # Only supported when reads are contiguous
    assert (lut_axis == 0)

    embedding_dim = self.O.shape[1]
    vocab_size = self.O.shape[0]
    nin = self.E.shape[0]

    if pad_idx is None:
        pad_idx = int(-1)

    self.kernels = []

    if self.deterministic:
        self.index_buffer = empty((nin,), dtype=np.int32)
        self.offset_buffer = empty((nin,), dtype=np.int32)
        self.word_counts = empty((max(512, vocab_size) + 512,), dtype=np.int32)

        for kernel_id in range(5):
            threads = 512
            if kernel_id in [1, 3]:
                blocks = vocab_size // (threads * 2)
                if vocab_size % (threads * 2):
                    blocks = blocks + 1
            elif kernel_id == 2:
                blocks = 1
            else:
                blocks = nin // threads
                if nin % threads:
                    blocks = blocks + 1

            params = [(blocks, 1, 1), (threads, 1, 1), None,
                      self.I, self.index_buffer.gpudata, self.offset_buffer.gpudata,
                      self.word_counts.gpudata, max(512, vocab_size), nin]
            kernel = lookuptable._get_sorting_kernel(kernel_id, threads, self.I.dtype)
            self.kernels.append((kernel, params))

        threads = 32
        blocks = nin

        params = [(blocks, 1, 1), (threads, 1, 1), None,
                  self.I, self.index_buffer.gpudata, self.O, self.E,
                  nin, embedding_dim, vocab_size, pad_idx]
        kernel = lookuptable._get_lut_bprop_kernel(self.E.dtype, self.I.dtype, True)
        self.kernels.append((kernel, params))
Example 12: __init__

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def __init__(self, bases, pv=None, *, force=False):
    """Create a new density matrix for several qudits.

    Parameters
    ----------
    bases : list of quantumsim.bases.PauliBasis
        Dimensions of qubits in the system.
    pv : array or None
        Must be of size (2**no_qubits, 2**no_qubits). Only the upper
        triangle is relevant. If data is `None`, create a new density
        matrix with all qubits in the ground state.
    """
    super().__init__(bases, pv, force=force)

    if pv is not None:
        if self.dim_pauli != pv.shape:
            raise ValueError(
                '`bases` Pauli dimensionality should be the same as the '
                'shape of `data` array.\n'
                ' - bases shapes: {}\n - data shape: {}'
                .format(self.dim_pauli, pv.shape))
    else:
        pv = np.zeros(self.dim_pauli, np.float64)
        ground_state_index = [pb.computational_basis_indices[0]
                              for pb in self.bases]
        pv[tuple(ground_state_index)] = 1

    if isinstance(pv, np.ndarray):
        if pv.dtype not in (np.float16, np.float32, np.float64):
            raise ValueError(
                '`pv` must have float64 data type, got {}'
                .format(pv.dtype))

        # Looks like there are some issues with ordering, so the line
        # below per se does not work.
        # self._data = ga.to_gpu(pv.astype(np.float64))
        self._work_data = ga.to_gpu(
            pv.reshape(pv.size, order='C').astype(np.float64))
        self._data = ga.empty(pv.shape, dtype=np.float64, order='C')
        self._data.set(self._work_data.reshape(pv.shape))
        self._work_data.gpudata.free()
    elif isinstance(pv, ga.GPUArray):
        if pv.dtype != np.float64:
            raise ValueError(
                '`pv` must have float64 data type, got {}'
                .format(pv.dtype))
        self._data = pv
    else:
        raise ValueError(
            "`pv` must be Numpy array, PyCUDA GPU array or "
            "None, got type `{}`".format(type(pv)))

    self._data.gpudata.size = self._data.nbytes
    self._work_data = ga.empty_like(self._data)
    self._work_data.gpudata.size = self._work_data.nbytes
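The workaround in example 12 copies the host array to the device as a flat, C-ordered float64 buffer and only then reshapes it into the target array. A minimal standalone sketch of that copy pattern, outside the quantumsim class (variable names are illustrative; assumes a CUDA device):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as ga

pv = np.random.rand(4, 4).astype(np.float32)                          # host data in float32
work = ga.to_gpu(pv.reshape(pv.size, order='C').astype(np.float64))   # flat float64 copy on the device
data = ga.empty(pv.shape, dtype=np.float64, order='C')                # target buffer with the final shape
data.set(work.reshape(pv.shape))                                      # device-to-device copy into the target
work.gpudata.free()                                                   # release the temporary buffer
print(np.allclose(data.get(), pv))                                    # True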
Example 13: add_layer

# Required import: from pycuda import gpuarray [possibly under an alias]
# Or: from pycuda.gpuarray import empty [possibly under an alias]
def add_layer(self, layer):
    if layer['type'] == 'dense':
        if len(self.network) == 0:
            num_inputs = layer['num_inputs']
        else:
            num_inputs = self.network_summary[-1][2]

        num_outputs = layer['num_outputs']
        sigmoid = layer['sigmoid']
        relu = layer['relu']
        weights = layer['weights']
        b = layer['bias']

        self.network.append(DenseLayer(num_inputs=num_inputs, num_outputs=num_outputs,
                                       sigmoid=sigmoid, relu=relu, weights=weights, b=b))
        self.network_summary.append(('dense', num_inputs, num_outputs))

        if self.max_batch_size > 1:
            if len(self.network_mem) == 0:
                self.network_mem.append(gpuarray.empty((self.max_batch_size, self.network_summary[-1][1]), dtype=np.float32))
            self.network_mem.append(gpuarray.empty((self.max_batch_size, self.network_summary[-1][2]), dtype=np.float32))
        else:
            if len(self.network_mem) == 0:
                self.network_mem.append(gpuarray.empty((self.network_summary[-1][1],), dtype=np.float32))
            self.network_mem.append(gpuarray.empty((self.network_summary[-1][2],), dtype=np.float32))

    elif layer['type'] == 'softmax':
        if len(self.network) == 0:
            raise Exception("Error! Softmax layer can't be first!")
        if self.network_summary[-1][0] != 'dense':
            raise Exception("Error! Need a dense layer before a softmax layer!")

        num = self.network_summary[-1][2]
        self.network.append(SoftmaxLayer(num=num))
        self.network_summary.append(('softmax', num, num))

        if self.max_batch_size > 1:
            self.network_mem.append(gpuarray.empty((self.max_batch_size, self.network_summary[-1][2]), dtype=np.float32))
        else:
            self.network_mem.append(gpuarray.empty((self.network_summary[-1][2],), dtype=np.float32))

Developer: PacktPublishing | Project: Hands-On-GPU-Programming-with-Python-and-CUDA | Lines of code: 49 | Source file: deep_neural_network.py