This article collects typical usage examples of the pycuda.driver.pagelocked_empty function in Python. If you have been wondering what pagelocked_empty does, how to call it, and what real-world uses look like, the curated code samples below should help.
15 code examples of pagelocked_empty are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps surface better Python code samples.
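Before the examples, a quick orientation: pagelocked_empty allocates an uninitialised page-locked (pinned) host array, which the GPU can access by DMA and which is required for truly asynchronous transfers. A minimal sketch of the basic pattern (the buffer names and sizes here are illustrative, not taken from the examples below):

import numpy as np
import pycuda.autoinit  # creates a default context
import pycuda.driver as cuda

# Pinned host buffer: behaves like a numpy array, but is page-locked.
host_buf = cuda.pagelocked_empty((1024,), dtype=np.float32)
host_buf[:] = np.arange(1024, dtype=np.float32)

# Asynchronous host-to-device copies need pinned memory to overlap with work.
dev_buf = cuda.mem_alloc(host_buf.nbytes)
stream = cuda.Stream()
cuda.memcpy_htod_async(dev_buf, host_buf, stream)
stream.synchronize()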
Example 1: __init__
def __init__(self, **params):
    # Hack-ish way to avoid initialisation until the weights are transferred:
    should_apply = self.apply_output_fns_init
    params['apply_output_fns_init'] = False
    super(GPUSparseCFProjection, self).__init__(**params)
    # Transferring the weights:
    self.pycuda_stream = cuda.Stream()
    self.weights_gpu = cusparse.CSR.to_CSR(self.weights.toSparseArray().transpose())
    # Getting the row and column indices for the *transposed* matrix. Used for Hebbian learning and normalisation:
    nzcols, nzrows = self.weights.nonzero()
    tups = sorted(zip(nzrows, nzcols))
    nzrows = [x[0] for x in tups]
    nzcols = [x[1] for x in tups]
    # Allocating a page-locked piece of memory for the activity so that the GPU can transfer data to
    # main memory without involving the CPU:
    self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
    self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
    self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1],), dtype=np.float32)
    self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    # Getting them on the GPU:
    self.nzcount = self.weights.getnnz()
    self.nzrows_gpu = gpuarray.to_gpu(np.array(nzrows, np.int32))
    self.nzcols_gpu = gpuarray.to_gpu(np.array(nzcols, np.int32))
    # Helper array for normalization:
    self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
    # Kernel that applies the normalisation:
    self.normalize_kernel = ElementwiseKernel(
        "int *nzrows, float *norm_total, float *weights",
        "weights[i] *= norm_total[nzrows[i]]",
        "divisive_normalize")
    # Kernel that calculates the learning:
    self.hebbian_kernel = ElementwiseKernel(
        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
        "hebbian_learning")
    params['apply_output_fns_init'] = should_apply
    self.apply_output_fns_init = should_apply
    if self.apply_output_fns_init:
        self.apply_learn_output_fns()
Example 2: _gpuAlloc
def _gpuAlloc(self):
    # Get GPU information
    self.freeMem = cuda.mem_get_info()[0] * .5 * .8  # use at most 40% (half, then 80%) of free memory
    self.maxPossRows = int(np.floor(self.freeMem / (4 * self.totalCols)))  # 4 bytes per float32
    # Cap max rows at the actual row count to save memory on the GPU
    if self.totalRows < self.maxPossRows:
        print("reducing max rows to reduce memory use on GPU")
        self.maxPossRows = self.totalRows
    # Create page-locked buffers and GPU arrays
    self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows, self.totalCols), np.float32)
    self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows, self.totalCols), np.float32)
    self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
    self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
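For context, staging buffers like to_gpu_buffer/from_gpu_buffer are usually consumed by a row-chunking loop. A hedged sketch of such a loop (the method name _process_chunks and the process callback are assumptions, not part of the excerpt above):

def _process_chunks(self, data, process):
    # Stream rows through the preallocated pinned staging buffers.
    for start in range(0, self.totalRows, self.maxPossRows):
        rows = min(self.maxPossRows, self.totalRows - start)
        self.to_gpu_buffer[:rows] = data[start:start + rows]
        cuda.memcpy_htod(self.data_gpu, self.to_gpu_buffer)
        process(self.data_gpu, self.result_gpu, rows)  # placeholder kernel launch
        cuda.memcpy_dtoh(self.from_gpu_buffer, self.result_gpu)
        data[start:start + rows] = self.from_gpu_buffer[:rows]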
Example 3: __init__
def __init__(self, **params):
    # Hack-ish way to avoid initialisation until the weights are transferred:
    should_apply = self.apply_output_fns_init
    params['apply_output_fns_init'] = False
    super(GPUSparseCFProjection, self).__init__(**params)
    # The sparse matrix is stored in COO format, used for Hebbian learning and normalisation:
    nzcols, nzrows, values = self.weights.getTriplets()
    tups = sorted(zip(nzrows, nzcols, values))
    nzrows = np.array([x[0] for x in tups], np.int32)
    nzcols = np.array([x[1] for x in tups], np.int32)
    values = np.array([x[2] for x in tups], np.float32)
    # Getting them on the GPU:
    self.nzcount = self.weights.getnnz()
    self.nzrows_gpu = gpuarray.to_gpu(nzrows)
    self.nzcols_gpu = gpuarray.to_gpu(nzcols)
    # Setting the projection weights in CSR format for dot product calculation:
    rowPtr = cusparse.coo2csr(self.nzrows_gpu, self.weights.shape[1])
    descrA = cusparse.cusparseCreateMatDescr()
    cusparse.cusparseSetMatType(descrA, cusparse.CUSPARSE_MATRIX_TYPE_GENERAL)
    cusparse.cusparseSetMatIndexBase(descrA, cusparse.CUSPARSE_INDEX_BASE_ZERO)
    self.weights_gpu = cusparse.CSR(descrA, values, rowPtr, self.nzcols_gpu, (self.weights.shape[1], self.weights.shape[0]))
    # Allocating a page-locked piece of memory for the activity so that the GPU can transfer data to
    # main memory without involving the CPU:
    self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
    self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
    self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1],), dtype=np.float32)
    self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    # Helper array for normalization:
    self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
    # Kernel that applies the normalisation:
    self.normalize_kernel = ElementwiseKernel(
        "int *nzrows, float *norm_total, float *weights",
        "weights[i] *= norm_total[nzrows[i]]",
        "divisive_normalize")
    # Kernel that calculates the learning:
    self.hebbian_kernel = ElementwiseKernel(
        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
        "hebbian_learning")
    self.pycuda_stream = cuda.Stream()
    # Finishing the initialisation that might have been delayed:
    params['apply_output_fns_init'] = should_apply
    self.apply_output_fns_init = should_apply
    if self.apply_output_fns_init:
        self.apply_learn_output_fns()
Example 4: to_cpu
def to_cpu(self):
    if self.flags.forc:
        return self.get(pagelocked=True)
    result = cuda.pagelocked_empty(self.shape, self.dtype)
    copy_non_contiguous(result, self)
    return result
Example 5: get_next_minibatch
def get_next_minibatch(self, i, train=TRAIN):
    if train == TRAIN:
        data = self.train_data
    else:
        data = self.test_data
    batch_data = data.data
    batch_label = data.labels
    batch_size = self.batch_size
    mini_data = batch_data[:, i * batch_size: (i + 1) * batch_size]
    locked_data = driver.pagelocked_empty(mini_data.shape, mini_data.dtype, order='C',
                                          mem_flags=driver.host_alloc_flags.PORTABLE)
    locked_data[:] = mini_data
    if self.input is not None and locked_data.shape == self.input.shape:
        self.input.set(locked_data)
    else:
        self.input = gpuarray.to_gpu(locked_data)
    label = batch_label[i * batch_size: (i + 1) * batch_size]
    #label = gpuarray.to_gpu(label)
    #label = gpuarray.to_gpu(np.require(batch_label[i * batch_size : (i + 1) * batch_size], dtype =
    #                                   np.float, requirements = 'C'))
    return self.input, label
Example 6: _padded_array
def _padded_array(self, ar):
    # Round the column count up to the next multiple of 16
    nrows_pad = ar.shape[0]
    ncols_pad = 16 * ((ar.shape[1] + 15) // 16)
    #arpad = numpy.empty((nrows_pad, ncols_pad), dtype=ar.dtype)
    arpad = cuda.pagelocked_empty((nrows_pad, ncols_pad), dtype=ar.dtype)
    arpad[0:ar.shape[0], 0:ar.shape[1]] = ar
    return arpad
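The expression 16 * ((ncols + 15) // 16) simply rounds the column count up to the next multiple of 16 (note the integer division), for example:

# Round-up-to-a-multiple-of-16, as used in _padded_array above:
for ncols in (1, 16, 17, 33):
    print(ncols, '->', 16 * ((ncols + 15) // 16))
# 1 -> 16, 16 -> 16, 17 -> 32, 33 -> 48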
Example 7: __init__
def __init__(self, backend, ioshape, initval, extent, aliases, tags):
    # Call the standard matrix constructor
    super().__init__(backend, ioshape, initval, extent, aliases, tags)
    # Allocate a page-locked buffer on the host for MPI to send/recv from
    self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
                                       self.dtype, 'C')
Example 8: get_async
def get_async(self, stream=None, ary=None):
    if ary is None:
        ary = drv.pagelocked_empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
    if self.size:
        drv.memcpy_dtoh_async(ary, self.gpudata, stream)
    return ary
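The pinned allocation matters here: memcpy_dtoh_async degrades to an effectively synchronous copy when the destination is ordinary pageable memory. A hedged usage sketch (gpu_array stands in for any instance of this class):

stream = drv.Stream()
pinned = drv.pagelocked_empty(gpu_array.shape, gpu_array.dtype)
gpu_array.get_async(stream=stream, ary=pinned)
# ... queue or perform other work here; the copy proceeds in the background ...
stream.synchronize()  # after this, `pinned` holds the data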
Example 9: __init__
def __init__(self, backend, ioshape, initval, extent, aliases, tags):
    # Call the standard matrix constructor
    super().__init__(backend, ioshape, initval, extent, aliases, tags)
    # If MPI is CUDA-aware then construct a buffer out of our CUDA
    # device allocation and pass this directly to MPI
    if backend.mpitype == 'cuda-aware':
        self.hdata = _make_pybuf(self.data, self.nbytes, 0x200)  # 0x200 = PyBUF_WRITE
    # Otherwise, allocate a buffer on the host for MPI to send/recv from
    else:
        self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
                                           self.dtype, 'C')
Example 10: __gpu_decorate_nodes
def __gpu_decorate_nodes(self, samples, labels):
    si_0 = driver.pagelocked_empty(self.n_samples, dtype=self.dtype_indices)
    si_1 = driver.pagelocked_empty(self.n_samples, dtype=self.dtype_indices)
    self.values_array = np.empty(self.n_nodes, dtype=self.dtype_labels)
    cuda.memcpy_dtoh(si_0, self.sorted_indices_gpu.ptr)
    cuda.memcpy_dtoh(si_1, self.sorted_indices_gpu_.ptr)
    decorate(self.target,
             si_0,
             si_1,
             self.values_idx_array,
             self.values_si_idx_array,
             self.values_array,
             self.n_nodes)
    self.values_idx_array = None
    self.values_si_idx_array = None
    self.left_children.resize(self.n_nodes, refcheck=False)
    self.right_children.resize(self.n_nodes, refcheck=False)
    self.feature_threshold_array.resize(self.n_nodes, refcheck=False)
    self.feature_idx_array.resize(self.n_nodes, refcheck=False)
Example 11: get
def get(self, ary=None, pagelocked=False):
    if ary is None:
        if pagelocked:
            ary = drv.pagelocked_empty(self.shape, self.dtype)
        else:
            ary = numpy.empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
    if self.size:
        drv.memcpy_dtoh(ary, self.gpudata)
    return ary
Example 12: threshold_integrated
def threshold_integrated(series, value):
    global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val
    t = numpy.float32(value**2)
    nb = int(numpy.ceil(float(len(series)) / nt / gs))
    if _bn is None or len(_bn) < nb:
        _bn = gpuarray.zeros(nb, dtype=numpy.uint32)
    if _n is None:
        # Device-mapped, page-locked scalar: the kernel writes the hit count
        # straight into host memory through the mapped pointer below.
        _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=driver.host_alloc_flags.DEVICEMAP)
        ptr = numpy.intp(_n.base.get_device_pointer())

        class T():
            pass

        _dn = T()
        _dn.gpudata = ptr
        _dn.flags = _n.flags
    if _loc_tmp is None or len(series) > len(_loc_tmp):
        _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _val_out = gpuarray.zeros(len(series), dtype=series.dtype)
        _val = driver.pagelocked_empty((4096*256), numpy.complex64)
        _loc = driver.pagelocked_empty((4096*256), numpy.uint32)
    # Do the thresholding by block
    stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1))
    # Recombine the blocks into a final output
    stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1))
    # We need to get the data back now
    pycbc.scheme.mgr.state.context.synchronize()
    if _n != 0:
        driver.memcpy_dtoh_async(_val[0:_n], _val_out.gpudata)
        driver.memcpy_dtoh_async(_loc[0:_n], _loc_out.gpudata)
        pycbc.scheme.mgr.state.context.synchronize()
    return _loc[0:_n], _val[0:_n]
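The DEVICEMAP flag used for _n above maps the pinned allocation into the device address space, so a kernel can write a result that the host reads back without an explicit copy. A self-contained sketch of that pattern (the kernel and names are illustrative, not from the example):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

# Host-mapped, page-locked counter that the GPU can write directly.
counter = cuda.pagelocked_empty((1,), np.uint32,
                                mem_flags=cuda.host_alloc_flags.DEVICEMAP)
counter[0] = 0
dev_ptr = np.intp(counter.base.get_device_pointer())

mod = SourceModule("""
__global__ void bump(unsigned int *n) { atomicAdd(n, 1); }
""")
mod.get_function("bump")(dev_ptr, block=(32, 1, 1), grid=(4, 1))
pycuda.autoinit.context.synchronize()
# counter[0] is now 128, with no explicit memcpy_dtoh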
Example 13: get_async
def get_async(self, stream=None, ary=None):
    if ary is None:
        ary = drv.pagelocked_empty(self.shape, self.dtype)
        ary = _as_strided(ary, strides=self.strides)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        assert ary.flags.forc
    assert self.flags.forc, "Array in get() must be contiguous"
    if self.size:
        drv.memcpy_dtoh_async(ary, self.gpudata, stream)
    return ary
Example 14: get_async
def get_async(self, stream=None, ary=None):
    if ary is None:
        ary = cuda.pagelocked_empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        if ary.base.__class__ != cuda.HostAllocation:
            raise TypeError("asynchronous memory transfer requires pagelocked numpy array")
    if self.size:
        if self.M == 1:
            cuda.memcpy_dtoh_async(ary, self.gpudata, stream)
        else:
            # note: 'async' as a keyword argument only parses on Python < 3.7
            PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype, async=True, stream=stream)
    return ary
Example 15: get
def get(self, ary=None, astype=None, pagelocked=False):
    if ary is None:
        if pagelocked:
            ary = drv.pagelocked_empty(self.shape, self.dtype)
        else:
            ary = np.empty(self.shape, self.dtype)
        ary = _as_strided(ary, strides=self.strides)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        assert ary.flags.forc
    assert self.flags.forc, "Array in get() must be contiguous"
    if self.size:
        drv.memcpy_dtoh(ary, self.gpudata)
    if astype is not None:
        ary = ary.astype(astype) * 2 ** (self.iwl - 15)
    return ary
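The final line converts fixed-point device data to floating point: with self.iwl integer bits in a 16-bit word, the raw integers are scaled by 2 ** (iwl - 15). A small illustration (the values and iwl here are assumed, not from the example):

import numpy as np
raw = np.array([16384, -16384], dtype=np.int16)  # hypothetical raw device contents
iwl = 1                                          # assumed integer word length
print(raw.astype(np.float32) * 2 ** (iwl - 15))  # [ 1. -1.]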