本文整理汇总了Python中pycuda.compiler.SourceModule方法的典型用法代码示例。如果您正苦于以下问题:Python compiler.SourceModule方法的具体用法?Python compiler.SourceModule怎么用?Python compiler.SourceModule使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块pycuda.compiler的用法示例。
在下文中一共展示了compiler.SourceModule方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: maximum_filter_2d
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def maximum_filter_2d(arr2D, footprint):
    """Run the ``slidingMaxiumum2D`` CUDA kernel over a 2-D array.

    The kernel source is read from ``2DSlidingMaxFootprintKernel.c``, which
    must live in the same directory as this file.

    Args:
        arr2D: 2-D input array. The caller must make sure it is ``float32``.
        footprint: 2-D footprint array. The caller must make sure it is
            ``int32``. ``footprint.shape[1]`` is passed to the kernel as the
            kernel size.

    Returns:
        A new array shaped and typed like ``arr2D`` that receives the kernel
        output.
    """
    arr2DMaxed = numpy.empty_like(arr2D)

    # Resolve the kernel relative to this file so the lookup does not depend
    # on the current working directory.
    kernel_dir = os.path.dirname(os.path.abspath(__file__))
    kernel_path = os.path.join(kernel_dir, "2DSlidingMaxFootprintKernel.c")
    # Read inside a context manager so the file handle is always released.
    with open(kernel_path, "rt") as kernel_file:
        kernel_source = kernel_file.read()

    module = SourceModule(kernel_source)
    slidingMaxKernel = module.get_function("slidingMaxiumum2D")

    blockSize = [16, 16]  # To-do: Add a variable to this, can affect performance based on GPU
    # Number of blocks needed to cover the array with blocks of blockSize.
    gridSize = getGridSize(blockSize, arr2D.shape)

    slidingMaxKernel(
        cuda.In(arr2D),                       # Input
        cuda.Out(arr2DMaxed),                 # Output
        numpy.int32(footprint.shape[1]),      # Kernel Size
        numpy.int32(arr2D.shape[1]),          # Row Stride
        numpy.int32(1),                       # Column Stride
        numpy.int32(int(arr2D.shape[1])),     # Array Column Count
        numpy.int32(int(arr2D.shape[0])),     # Array Row Count
        cuda.In(footprint),
        block=(blockSize[0], blockSize[1], 1),
        grid=(gridSize[0], gridSize[1], 1)
    )
    return arr2DMaxed
示例2: test_pycuda_only
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works.

    Compiles a tiny element-wise multiply kernel, runs it on two random
    float32 vectors passed through ``drv.In`` / ``drv.Out`` and checks the
    result against the numpy product.
    """
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")

    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    # The kernel indexes by threadIdx.x with no bounds check, so launch
    # exactly one thread per element; more threads than elements would
    # read and write past the end of the device buffers.
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(int(a.size), 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
示例3: test_pycuda_theano
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object.

    Compiles an element-wise multiply kernel and launches it directly on
    ``CudaNdarray`` objects, then checks the result against the numpy
    product.
    """
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")

    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    # The kernel indexes by threadIdx.x with no bounds check, so launch
    # exactly one thread per element; more threads than elements would
    # read and write past the end of the device buffers.
    multiply_them(dest, ga, gb,
                  block=(int(a.size), 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all()
示例4: make_thunk
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def make_thunk(self, node, storage_map, _, _2):
    """Compile the ``my_fct`` kernel and return a thunk that runs it.

    The kernel writes ``i0[i] * 2`` into ``o0[i]`` for every index below
    ``size``. The returned callable takes no arguments; it reads the first
    input storage cell of ``node``, (re)allocates the first output cell when
    it is empty or has a different shape, and launches the kernel with
    512 threads per block.

    The two trailing positional parameters are accepted and ignored.
    """
    kernel_source = """
__global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<size){
o0[i] = i0[i]*2;
}
}"""
    doubling_kernel = SourceModule(kernel_source).get_function("my_fct")
    input_cells = [storage_map[var] for var in node.inputs]
    output_cells = [storage_map[var] for var in node.outputs]

    def thunk():
        source = input_cells[0][0]
        out_cell = output_cells[0]
        # Reuse the previous output buffer only when its shape still matches.
        needs_alloc = out_cell[0] is None or out_cell[0].shape != source.shape
        if needs_alloc:
            out_cell[0] = cuda.CudaNdarray.zeros(source.shape)
        # Enough 512-thread blocks to cover every element.
        n_blocks = int(numpy.ceil(source.size / 512.))
        doubling_kernel(source, out_cell[0], numpy.intc(source.size),
                        block=(512, 1, 1), grid=(n_blocks, 1))

    return thunk
示例5: __init__
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def __init__(self, num_trans, min_support, use_CUDA, block, thread, use_optimal=True):
    """Store the mining parameters and, if requested, compile the GPU kernel.

    Args:
        num_trans: number of transactions.
        min_support: multiplied by ``num_trans`` and stored as
            ``self.min_support``.
        use_CUDA: whether GPU acceleration is requested.
        block: first component of the CUDA block shape; required when
            ``use_CUDA`` is true and ``use_optimal`` is false.
        thread: second component of the CUDA block shape; required in the
            same case as ``block``.
        use_optimal: when true (the default) no custom kernel is compiled
            here.

    Raises:
        ValueError: if the custom kernel is requested but ``block`` or
            ``thread`` is ``None``.
    """
    self.num_trans = num_trans
    self.min_support = min_support * num_trans
    self.support_list = {}
    self.use_CUDA = use_CUDA
    self.use_optimal = use_optimal

    if self.use_CUDA and not self.use_optimal:
        # Explicit check instead of ``assert`` so the validation survives
        # ``python -O`` and fails with a clear message.
        if block is None or thread is None:
            raise ValueError(
                "block and thread must be provided when use_CUDA is set "
                "and use_optimal is not")
        mod = SourceModule("""__global__ void multiply_element(int *dest, int *a, int *b) {
const int idx = threadIdx.x + blockDim.x * blockIdx.x;
dest[idx] = a[idx] * b[idx];
}""")
        self.multiply = mod.get_function("multiply_element")
        self.block = (block, thread, 1)
        # Round the grid up so every transaction is covered.
        # NOTE(review): the kernel has no bounds check, so threads past
        # num_trans in the last block index beyond it -- confirm that the
        # device buffers are padded accordingly.
        dx, mx = divmod(self.num_trans, self.block[0])
        dy, my = divmod(1, self.block[1])
        self.grid = (int(dx + (mx > 0)), int(dy + (my > 0)))
        print("Using Block =", self.block)
        print("Using Grid =", self.grid)
    elif self.use_CUDA:
        print("Accelerating Eclat computation with GPU!")
    else:
        print("Not using GPU for acceleration.")
示例6: pack_rows
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def pack_rows():
    """Compile the ``pack_rows`` CUDA kernel and return it prepared.

    The kernel source is the output of ``pack()`` followed by a kernel that,
    for each index ``i`` below ``size``, stores ``pack(&a[i * 32])`` in
    ``b[i]``. The returned function has already been prepared with the
    argument signature ``"2P I"``.
    """
    kernel_tail = r"""
__global__ void pack_rows(float *a, unsigned int *b, int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) {
b[i] = pack(&a[i * 32]);
}
}
"""
    compiled = SourceModule(pack() + kernel_tail)
    prepared = compiled.get_function("pack_rows")
    prepared.prepare("2P I")
    return prepared
示例7: get_dckernel
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def get_dckernel(slen):
    """Return the cached decompression kernels for a series of length ``slen``.

    The block count is ``ceil(slen / 1024)``; kernels are compiled once per
    block count and memoised in ``dckernel_cache``.

    Returns:
        The tuple ``(fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb)`` where
        ``fn1`` / ``fn2`` are the prepared ``find_block_indices`` and
        ``linear_interp`` functions, the ``*_tex`` entries are the module's
        texture references, ``nt`` is the thread count per block and ``nb``
        the block count.

    Raises:
        ValueError: if more than 1024 blocks would be required.
    """
    # Right now, hardcoding the number of threads per block
    threads_per_block = 1024
    n_blocks = int(numpy.ceil(slen / 1024.0))
    if n_blocks > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    if n_blocks not in dckernel_cache:
        module = SourceModule(
            kernel_sources.render(ntpb=threads_per_block, nblocks=n_blocks))
        freq_tex = module.get_texref("freq_tex")
        amp_tex = module.get_texref("amp_tex")
        phase_tex = module.get_texref("phase_tex")

        find_indices = module.get_function("find_block_indices")
        find_indices.prepare("PPifff", texrefs=[freq_tex])

        interpolate = module.get_function("linear_interp")
        interpolate.prepare("PfiffiPP", texrefs=[freq_tex, amp_tex, phase_tex])

        dckernel_cache[n_blocks] = (find_indices, interpolate, freq_tex,
                                    amp_tex, phase_tex, threads_per_block,
                                    n_blocks)

    return dckernel_cache[n_blocks]
示例8: div_eigenenergy_cuda
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def div_eigenenergy_cuda(ksn2e, ksn2f, nfermi, vstart, comega, nm2v_re, nm2v_im,
                         block_size, grid_size):
    """Compile and launch the ``calc_XXVV_gpu`` kernel.

    The kernel receives ``nm2v_re`` and ``nm2v_im`` together with the two
    leading dimensions of ``nm2v_re``, then ``ksn2e``, ``ksn2f``, ``nfermi``,
    ``vstart``, the first dimension of ``ksn2e`` and the real and imaginary
    parts of ``comega``. Nothing is returned.

    Args:
        block_size: indexable whose first two entries give the block shape.
        grid_size: indexable whose first two entries give the grid shape.
    """
    launch_block = (int(block_size[0]), int(block_size[1]), int(1))
    launch_grid = (int(grid_size[0]), int(grid_size[1]))

    kernel = SourceModule(kernel_code_div_eigenenergy_cuda).get_function("calc_XXVV_gpu")

    kernel_args = (
        nm2v_re,
        nm2v_im,
        np.int32(nm2v_re.shape[0]),
        np.int32(nm2v_re.shape[1]),
        ksn2e,
        ksn2f,
        np.int32(nfermi),
        np.int32(vstart),
        np.int32(ksn2e.shape[0]),
        np.float64(comega.real),
        np.float64(comega.imag),
    )
    kernel(*kernel_args, block=launch_block, grid=launch_grid)
示例9: propagate
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def propagate(self, iters=2, rand_search_radius=500):
    """
    Optimize the NNF using PatchMatch Algorithm

    Compiles ``patchmatch.cu`` from ``package_directory`` and launches its
    ``patch_match`` kernel on ``self.A``, ``self.AA``, ``self.B`` and
    ``self.BB``. ``self.nnf`` and ``self.nnd`` are passed as in/out buffers,
    so they are updated in place.

    :param iters: number of iterations
    :param rand_search_radius: max radius to use in random search
    :return: None
    """
    # Read inside a context manager so the file handle is always released.
    with open(os.path.join(package_directory, "patchmatch.cu")) as source_file:
        kernel_source = source_file.read()
    mod = SourceModule(kernel_source, no_extern_c=True)
    patchmatch = mod.get_function("patch_match")

    rows = self.A.shape[0]
    cols = self.A.shape[1]
    channels = np.int32(self.A.shape[2])
    # Scratch buffer handed to the kernel next to self.nnf.
    nnf_t = np.zeros(shape=(rows, cols), dtype=np.uint32)
    threads = 20

    def get_blocks_for_dim(dim, blocks):
        # Always adds one block, so an exactly divisible dimension gets one
        # extra block.
        return dim // blocks + 1

    patchmatch(
        drv.In(self.A),
        drv.In(self.AA),
        drv.In(self.B),
        drv.In(self.BB),
        drv.InOut(self.nnf),
        drv.InOut(nnf_t),
        drv.InOut(self.nnd),
        np.int32(rows),
        np.int32(cols),
        channels,
        np.int32(self.patch_size),
        np.int32(iters),
        np.int32(8),  # NOTE(review): meaning of this constant is defined by the kernel -- confirm
        np.int32(rand_search_radius),
        block=(threads, threads, 1),
        grid=(get_blocks_for_dim(rows, threads),
              get_blocks_for_dim(cols, threads)))
示例10: make_node
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def make_node(self, *inputs):
    """Build the Apply node for this op and compile its element-wise kernel.

    Inputs are converted to contiguous CUDA ndarray variables, validated
    (argument count, equal rank, no broadcastable dimensions), and a CUDA
    kernel named ``pycuda_elemwise_<scalar_op>`` is generated from the scalar
    op's C code. The compiled function is stored on ``self.pycuda_fct``.

    Raises:
        TypeError: on a wrong argument count or differing input ranks.
        Exception: if any input has a broadcastable dimension.

    Returns:
        The new Apply node.
    """
    gpu_inputs = [gpu_contiguous(as_cuda_ndarray_variable(inp)) for inp in inputs]

    if self.nin > 0 and len(gpu_inputs) != self.nin:
        raise TypeError('Wrong argument count', (self.nin, len(gpu_inputs)))
    for other in gpu_inputs[1:]:
        if other.type.ndim != inputs[0].type.ndim:
            raise TypeError('different ranks among inputs')

    has_broadcast = any([any(inp.type.broadcastable) for inp in inputs])
    if has_broadcast:
        raise Exception("pycuda don't support broadcasted dimensions")
    assert len(inputs) == 2  # TODO remove

    out_type = CudaNdarrayType(broadcastable=[False] * gpu_inputs[0].type.ndim)
    assert self.nout == 1

    kernel_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    out_node = Apply(self, gpu_inputs, [out_type() for _ in xrange(self.nout)])

    # Kernel parameter names: i0, i1, ... for inputs and o0, ... for outputs.
    input_names = ["i" + str(k) for k in range(len(inputs))]
    output_names = ["o" + str(k) for k in range(self.nout)]

    # Scalar op body applied to element [i] of every buffer.
    indexed_inputs = tuple([name + "[i]" for name in input_names])
    indexed_outputs = tuple(name + "[i]" for name in output_names)
    body = self.scalar_op.c_code(out_node, "some_name",
                                 indexed_inputs, indexed_outputs, {})

    pointer_params = [
        _replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
        for var, name in chain(izip(inputs, input_names),
                               izip(out_node.outputs, output_names))
    ]
    signature = ", ".join(pointer_params + ["int size"])

    kernel_template = """
__global__ void %s(%s)
{
int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
i += threadIdx.x + threadIdx.y*blockDim.x;
if(i<size){
%s
}
}
"""
    mod = SourceModule(kernel_template % (kernel_name, signature, body))
    self.pycuda_fct = mod.get_function(kernel_name)
    return out_node
示例11: _prepare_compound_kernel
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def _prepare_compound_kernel(transformer, ops):
    """
    Generate, compile and prepare a kernel for a set of ops.

    ops (list): List of tuples describing ops to execute in kernel. Each tuple
                should be of the format (op_name, input0, input1, output, axis)

    Returns a tuple ``(kernel, params, 128)`` where ``params`` starts with the
    grid dimensions, the block dimensions and ``None``, followed by the
    parameters produced by the kernel generator.
    """
    # Take care tensor dimensionality
    ops = _wrap_tensor_descriptions(transformer, ops)

    # Generate kernel source code and block/grid mapping
    axes_mapping, dims = _get_axes_mapping(ops)
    code, kernel_name, arg_desc, params = _get_compound_kernel(ops, axes_mapping, dims)

    # Compile kernel; flex parameters need the extra include block.
    header = _includes_template
    if _are_flex_params(params):
        header = header + _flex_includes_template
    code = header + code

    module = SourceModule(code, options=[])
    kernel = module.get_function(kernel_name)
    kernel.name = kernel_name
    kernel.prepare(arg_desc)

    # Calculate block and grid dims: each mapping entry is
    # (axis letter, block extent, grid extent).
    axis_slot = {'x': 0, 'y': 1, 'z': 2}
    blockdim = [1, 1, 1]
    griddim = [1, 1, 1]
    for axis in axes_mapping:
        slot = axis_slot.get(axis[0])
        if slot is None:
            continue
        blockdim[slot] = axis[1]
        griddim[slot] = axis[2]

    params = [tuple(griddim), tuple(blockdim), None] + params
    return (kernel, params, 128)
示例12: _get_transpose_kernel
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def _get_transpose_kernel(dtype):
    """Compile the ``transpose`` kernel for ``dtype`` and return it prepared.

    The ``_transpose_kernel`` template is filled with the register type
    obtained for ``dtype`` (with ``memory=True``); the resulting function is
    prepared with the signature ``"PPII"``.
    """
    substitutions = {"type": _get_register_type(dtype, memory=True)}
    source = _transpose_kernel % substitutions
    transpose = SourceModule(source).get_function("transpose")
    transpose.prepare("PPII")
    return transpose
示例13: _get_shuffle_kernel
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def _get_shuffle_kernel(dtype):
    """Compile the ``dimShuffle`` kernel for ``dtype`` and return it prepared.

    The ``_shuffle_kernel`` template is filled with the register type
    obtained for ``dtype`` (with ``memory=True``); the resulting function is
    prepared with the signature ``"PPIIIIIIIIIIIIII"``.
    """
    substitutions = {"type": _get_register_type(dtype, memory=True)}
    source = _shuffle_kernel % substitutions
    shuffle = SourceModule(source).get_function("dimShuffle")
    shuffle.prepare("PPIIIIIIIIIIIIII")
    return shuffle
示例14: compile_cuda_kernel
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def compile_cuda_kernel(cuda_kernel_code):
    """
    Compile a CUDA kernel and return the compiled module.

    The source is compiled with ``DEFAULT_NVCC_FLAGS`` plus
    ``--use_fast_math``. On a compilation failure the source and the compiler
    stderr are logged at error level and the original ``cuda.CompileError``
    is re-raised.
    """
    cuda_code = cuda_kernel_code
    logging.debug("Compiling cuda code:\n%s", cuda_code)
    # Keep the try body down to the one call that can raise CompileError.
    try:
        mod = SourceModule(cuda_code, options=DEFAULT_NVCC_FLAGS + ['--use_fast_math'])
    except cuda.CompileError as e:
        logging.error(cuda_code)
        logging.error("CUDA compilation error:")
        logging.error(e.stderr)
        # Bare raise re-raises the active exception with its traceback intact.
        raise
    return mod
示例15: __init__
# 需要导入模块: from pycuda import compiler [as 别名]
# 或者: from pycuda.compiler import SourceModule [as 别名]
def __init__(self, math_function='y = sin(x)', precision='d', lo=0, hi=np.pi, samples_per_thread=10**5, num_blocks=100):
    """Configure the integrator and compile its ``monte_carlo`` kernel.

    Args:
        math_function: expression substituted into the kernel template.
        precision: ``None``, ``'s'``, ``'S'``, ``'single'`` or ``np.float32``
            select single precision; ``'d'``, ``'D'``, ``'double'`` or
            ``np.float64`` select double precision.
        lo: lower bound; must be strictly below ``hi``.
        hi: upper bound.
        samples_per_thread: stored on the instance.
        num_blocks: stored on the instance.

    Raises:
        Exception: if ``precision`` is not recognised or ``hi - lo <= 0``.
    """
    self.math_function = math_function

    # Map the precision selector to the C type, numpy type and cuRAND suffix.
    if precision in (None, 's', 'S', 'single', np.float32):
        self.precision, self.numpy_precision, self.p_curand = 'float', np.float32, ''
    elif precision in ('d', 'D', 'double', np.float64):
        self.precision, self.numpy_precision, self.p_curand = 'double', np.float64, '_double'
    else:
        raise Exception('precision is invalid datatype!')

    if hi - lo <= 0:
        raise Exception('hi - lo <= 0!')
    self.hi = hi
    self.lo = lo

    # Fill the kernel template, then compile it.
    MonteCarloDict = {
        'p': self.precision,
        'p_curand': self.p_curand,
        'math_function': self.math_function,
    }
    self.MonteCarloCode = MonteCarloKernelTemplate % MonteCarloDict

    self.ker = SourceModule(no_extern_c=True, options=['-w'], source=self.MonteCarloCode)
    self.f = self.ker.get_function('monte_carlo')

    self.num_blocks = num_blocks
    self.samples_per_thread = samples_per_thread
开发者ID:PacktPublishing,项目名称:Hands-On-GPU-Programming-with-Python-and-CUDA,代码行数:34,代码来源:monte_carlo_integrator.py