This article collects typical usage examples of the Python function pycuda.driver.matrix_to_texref. If you are unsure what matrix_to_texref does or how to call it in practice, the curated code samples below should help.
Fifteen code examples of the matrix_to_texref function are shown below, ordered by popularity by default.
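For orientation before the examples: matrix_to_texref copies a 2-D NumPy array into a texture reference of a compiled module, after which kernels can sample it with tex2D. The following minimal sketch is not taken from any of the examples below; the kernel and variable names are illustrative, and it assumes a CUDA-capable GPU and a working PyCUDA install.

import numpy as np
import pycuda.autoinit            # creates a CUDA context on the default device
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2, cudaReadModeElementType> my_tex;

__global__ void read_texture(float *dest, int w)
{
    int row = threadIdx.y;
    int col = threadIdx.x;
    // with order="F", the first tex2D coordinate indexes the matrix rows
    dest[row*w + col] = tex2D(my_tex, row, col);
}
""")

a = np.random.randn(4, 6).astype(np.float32)   # matrix_to_texref expects a 2-D array
my_tex = mod.get_texref("my_tex")
drv.matrix_to_texref(a, my_tex, order="F")     # copy the host matrix into the texture

dest = np.zeros_like(a)
mod.get_function("read_texture")(
    drv.Out(dest), np.int32(a.shape[1]),
    block=(a.shape[1], a.shape[0], 1), texrefs=[my_tex])
assert np.allclose(dest, a)                    # every texel comes back unchanged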
Example 1: test_multiple_2d_textures
def test_multiple_2d_textures(self):
    mod = SourceModule("""
    texture<float, 2, cudaReadModeElementType> mtx_tex;
    texture<float, 2, cudaReadModeElementType> mtx2_tex;

    __global__ void copy_texture(float *dest)
    {
      int row = threadIdx.x;
      int col = threadIdx.y;
      int w = blockDim.y;
      dest[row*w+col] =
          tex2D(mtx_tex, row, col)
          +
          tex2D(mtx2_tex, row, col);
    }
    """)

    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")
    mtx2_tex = mod.get_texref("mtx2_tex")

    shape = (3, 4)
    a = np.random.randn(*shape).astype(np.float32)
    b = np.random.randn(*shape).astype(np.float32)
    drv.matrix_to_texref(a, mtx_tex, order="F")
    drv.matrix_to_texref(b, mtx2_tex, order="F")

    dest = np.zeros(shape, dtype=np.float32)
    copy_texture(drv.Out(dest),
                 block=shape+(1,),
                 texrefs=[mtx_tex, mtx2_tex]
                 )
    assert la.norm(dest-a-b) < 1e-6
Example 2: create_2d_texture
def create_2d_texture(a, module, variable, point_sampling=False):
    a = numpy.ascontiguousarray(a)
    out_texref = module.get_texref(variable)
    cuda.matrix_to_texref(a, out_texref, order='C')
    if point_sampling:
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else:
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
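A possible call site for this helper is sketched below. The array, module and texture name are assumptions for illustration: module is a previously compiled SourceModule whose CUDA source declares the texture, e.g. texture<float, 2, cudaReadModeElementType> img_tex;

img = numpy.random.rand(64, 128).astype(numpy.float32)
img_texref = create_2d_texture(img, module, 'img_tex', point_sampling=True)
# remember to pass texrefs=[img_texref] to any kernel launch that samples img_tex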
Example 3: rotate_image
def rotate_image(a, resize=1.5, angle=180., interpolation="linear", blocks=(16, 16, 1)):
    """
    Rotates the array. The returned array has the new size and the
    rotated picture centred in it.
    a             - array (2-dim)
    resize        - ratio of new image width to old image width
    angle         - degrees to rotate the image
    interpolation - "linear" or None
    blocks        - block dimensions passed to the kernel launch
    returns: a new array with dtype=uint8 containing the rotated image
    """
    angle = angle/180.*pi
    # Convert this image to float. Unsigned int texture gave
    # strange results for me. This conversion is slow though :(
    a = a.astype("float32")
    # Calculate the dimensions of the new image
    calc_x = lambda (x, y): (x*a.shape[1]/2.*cos(angle)-y*a.shape[0]/2.*sin(angle))
    calc_y = lambda (x, y): (x*a.shape[1]/2.*sin(angle)+y*a.shape[0]/2.*cos(angle))
    xs = [calc_x(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]]
    ys = [calc_y(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]]
    new_image_dim = (
        int(numpy.ceil(max(ys)-min(ys))*resize),
        int(numpy.ceil(max(xs)-min(xs))*resize),
    )
    # Now generate the cuda texture
    cuda.matrix_to_texref(a, texref, order="C")
    # We could set the following flag if we wanted to address the image
    # in normalized coordinates ( 0 <= coordinate < 1.)
    # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    if interpolation == "linear":
        texref.set_filter_mode(cuda.filter_mode.LINEAR)
    # Calculate the grid size. This is entirely given by the size of our image.
    gridx = new_image_dim[0]/blocks[0] if \
        new_image_dim[0] % blocks[0] == 0 else new_image_dim[0]/blocks[0] + 1
    gridy = new_image_dim[1]/blocks[1] if \
        new_image_dim[1] % blocks[1] == 0 else new_image_dim[1]/blocks[1] + 1
    # Get the output image
    output = numpy.zeros(new_image_dim, dtype="uint8")
    # Call the kernel
    copy_texture_func(
        numpy.float32(resize), numpy.float32(angle),
        numpy.uint16(a.shape[1]), numpy.uint16(a.shape[0]),
        numpy.uint16(new_image_dim[1]), numpy.uint16(new_image_dim[0]),
        cuda.Out(output), texrefs=[texref], block=blocks, grid=(gridx, gridy))
    return output
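Note that rotate_image relies on two module-level objects, texref and copy_texture_func, which are created elsewhere in the original file. A plausible wiring is sketched below; the kernel signature is inferred from the call above, and its body is omitted, so this only shows how the two names could be produced.

mod = SourceModule("""
texture<float, 2> tex;

__global__ void copy_texture(float resize, float angle,
                             unsigned short old_w, unsigned short old_h,
                             unsigned short new_w, unsigned short new_h,
                             unsigned char *dest)
{
    /* inverse-rotate each output pixel and sample tex2D(tex, ...) -- body omitted */
}
""")
texref = mod.get_texref("tex")
copy_texture_func = mod.get_function("copy_texture")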
Example 4: copy_texture_memory_args
def copy_texture_memory_args(self, texmem_args):
    """adds texture memory arguments to the most recently compiled module

    :param texmem_args: A dictionary containing the data to be passed to the
        device texture memory. TODO
    """
    filter_mode_map = {'point': drv.filter_mode.POINT,
                       'linear': drv.filter_mode.LINEAR}
    address_mode_map = {'border': drv.address_mode.BORDER,
                        'clamp': drv.address_mode.CLAMP,
                        'mirror': drv.address_mode.MIRROR,
                        'wrap': drv.address_mode.WRAP}
    logging.debug('copy_texture_memory_args called')
    logging.debug('current module: ' + str(self.current_module))
    self.texrefs = []
    for k, v in texmem_args.items():
        tex = self.current_module.get_texref(k)
        self.texrefs.append(tex)
        logging.debug('copying to texture: ' + str(k))
        if not isinstance(v, dict):
            data = v
        else:
            data = v['array']
        logging.debug('texture to be copied: ')
        logging.debug(data.nbytes)
        logging.debug(data.dtype)
        logging.debug(data.flags)
        drv.matrix_to_texref(data, tex, order="C")
        if isinstance(v, dict):
            if 'address_mode' in v and v['address_mode'] is not None:
                # address_mode is set per axis
                amode = v['address_mode']
                if not isinstance(amode, list):
                    amode = [amode] * data.ndim
                for i, m in enumerate(amode):
                    try:
                        if m is not None:
                            tex.set_address_mode(i, address_mode_map[m])
                    except KeyError:
                        raise ValueError('Unknown address mode: ' + m)
            if 'filter_mode' in v and v['filter_mode'] is not None:
                fmode = v['filter_mode']
                try:
                    tex.set_filter_mode(filter_mode_map[fmode])
                except KeyError:
                    raise ValueError('Unknown filter mode: ' + fmode)
            if 'normalized_coordinates' in v and v['normalized_coordinates']:
                tex.set_flags(tex.get_flags() | drv.TRSF_NORMALIZED_COORDINATES)
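Based on how the method above unpacks texmem_args, a call could look like the following sketch. The texture names and the owning object ('runner') are hypothetical; each key must match a texture declared in the compiled kernel source.

import numpy as np

texmem_args = {
    # a bare array is copied with default sampling settings
    'lookup_tex': np.arange(16, dtype=np.float32).reshape(4, 4),
    # a dict entry allows per-texture options
    'image_tex': {
        'array': np.random.rand(32, 32).astype(np.float32),
        'address_mode': ['clamp', 'wrap'],   # one mode per axis, or a single string
        'filter_mode': 'linear',
        'normalized_coordinates': True,
    },
}
runner.copy_texture_memory_args(texmem_args)  # 'runner' is whatever object owns this method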
Example 5: doit
def doit(img):
    width, height = (img.shape[0], img.shape[1])
    cuda.matrix_to_texref(img, texref, order="C")
    texref.set_filter_mode(cuda.filter_mode.LINEAR)
    gpu_output = np.zeros((width/2, height/2), dtype=np.float32)
    gridsize = (width / blocksize[0], height / blocksize[1])
    downsample_func(cuda.Out(gpu_output), np.int32(width), np.int32(width/2),
                    block=blocksize, grid=gridsize, texrefs=[texref])
    # gpu_output = gpu_output.transpose()
    return gpu_output
Example 6: load_texture
def load_texture(self, name, arr):
    '''
    Loads an array into a texture with a name.
    Address by the name in the kernel code.
    '''
    tex = self.mod.get_texref(name)  # x*y*z
    arr = arr.astype('float32')
    if len(arr.shape) == 3:
        carr = arr.copy('F')
        texarray = numpy3d_to_array(carr, 'F')
        tex.set_array(texarray)
    else:
        if len(arr.shape) == 1:
            arr = np.expand_dims(arr, 1)
        tex.set_flags(0)
        cuda.matrix_to_texref(arr, tex, order='F')
    tex.set_address_mode(0, cuda.address_mode.CLAMP)
    tex.set_address_mode(1, cuda.address_mode.CLAMP)
    tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    tex.set_filter_mode(cuda.filter_mode.LINEAR)
    self.textures[name] = tex
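A hypothetical caller of load_texture is shown below (the surrounding object 'sim', the array, and the kernel launch are assumptions). Because the texture is registered with normalized coordinates, clamped addressing and linear filtering, device code would sample it with coordinates in [0, 1).

field = np.random.rand(64, 64)          # cast to float32 inside load_texture
sim.load_texture('my_field', field)     # 'my_field' must match a texture declared in sim.mod's source
# later, pass the stored texref along with the kernel launch, e.g.:
# kernel(..., texrefs=[sim.textures['my_field']], block=(16, 16, 1), grid=(4, 4))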
Example 7: main
def main():
#Create dictionary argument for rendering
RenderArgs= {"safe_memory_mapping":1,
"aligned_byte_length_genome":8,
"bit_length_edge_type":3,
"curand_nr_threads_per_block":256,
"nr_tile_types":4,
"nr_edge_types":8,
"warpsize":32,
"fit_dim_thread_x":1,
"fit_dim_thread_y":1,
"fit_dim_block_x":1 }
# Set environment for template package Jinja2
env = Environment(loader=PackageLoader('main', './'))
# Load source code from file
Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
# Render source code
RenderedSource = Source.render( RenderArgs )
# Save rendered source code to file
f = open('./rendered.cu', 'w')
f.write(RenderedSource)
f.close()
#Load source code into module
KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
Kernel = KernelSourceModule.get_function("TestEdgeSortKernel")
CurandKernel = KernelSourceModule.get_function("CurandInitKernel")
#Initialise InteractionMatrix
InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
def Delta(a,b):
if a==b:
return 1
else:
return 0
for i in range(InteractionMatrix.shape[0]):
for j in range(InteractionMatrix.shape[1]):
InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )
#Set up our InteractionMatrix
InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
print InteractionMatrix
#a = numpy.random.randn(400).astype(numpy.uint8)
#b = numpy.random.randn(400).astype(numpy.uint8)
dest = numpy.arange(256).astype(numpy.uint8)
for i in range(0, 256/8):
dest[i*8 + 0] = 36
dest[i*8 + 1] = 151
dest[i*8 + 2] = 90
dest[i*8 + 3] = 109
dest[i*8 + 4] = 224
dest[i*8 + 5] = 4
dest[i*8 + 6] = 0
dest[i*8 + 7] = 0
dest_h = drv.mem_alloc(dest.nbytes)
drv.memcpy_htod(dest_h, dest)
print "before: "
print dest
curand = numpy.zeros(40*256).astype(numpy.uint8);
curand_h = drv.mem_alloc(curand.nbytes)
CurandKernel(curand_h, block=(32,1,1), grid=(1,1))
Kernel(dest_h, curand_h, block=(32,1,1), grid=(1,1))
drv.memcpy_dtoh(dest, dest_h)
print "after: "
print dest
Example 8: __init__
def __init__(self, img_path):
super(LFapplication, self).__init__()
#
# Load image data
#
base_path = os.path.splitext(img_path)[0]
lenslet_path = base_path + '-lenslet.txt'
optics_path = base_path + '-optics.txt'
with open(lenslet_path, 'r') as f:
tmp = eval(f.readline())
x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \
np.array(tmp, dtype=np.float32)
with open(optics_path, 'r') as f:
for line in f:
name, val = line.strip().split()
try:
setattr(self, name, np.float32(val))
except:
pass
max_angle = math.atan(self.pitch/2/self.flen)
#
# Prepare image
#
im_pil = Image.open(img_path)
if im_pil.mode == 'RGB':
self.NCHANNELS = 3
w, h = im_pil.size
im = np.zeros((h, w, 4), dtype=np.float32)
im[:, :, :3] = np.array(im_pil).astype(np.float32)
self.LF_dim = (ceil(h/down_dy), ceil(w/right_dx), 3)
else:
self.NCHANNELS = 1
im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype(np.float32)
h, w = im.shape
self.LF_dim = (ceil(h/down_dy), ceil(w/right_dx))
x_start = x_offset - int(x_offset / right_dx) * right_dx
y_start = y_offset - int(y_offset / down_dy) * down_dy
x_ratio = self.flen * right_dx / self.pitch
y_ratio = self.flen * down_dy / self.pitch
#
# Generate the cuda kernel
#
mod_LFview = pycuda.compiler.SourceModule(
_kernel_tpl.render(
newiw=self.LF_dim[1],
newih=self.LF_dim[0],
oldiw=w,
oldih=h,
x_start=x_start,
y_start=y_start,
x_ratio=x_ratio,
y_ratio=y_ratio,
x_step=right_dx,
y_step=down_dy,
NCHANNELS=self.NCHANNELS
)
)
self.LFview_func = mod_LFview.get_function("LFview_kernel")
self.texref = mod_LFview.get_texref("tex")
#
# Now generate the cuda texture
#
if self.NCHANNELS == 3:
cuda.bind_array_to_texref(
cuda.make_multichannel_2d_array(im, order="C"),
self.texref
)
else:
cuda.matrix_to_texref(im, self.texref, order="C")
#
# We could set the following flag if we wanted to address the image
# in normalized coordinates ( 0 <= coordinate < 1.)
# texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
#
self.texref.set_filter_mode(cuda.filter_mode.LINEAR)
#
# Prepare the traits
#
self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0))
self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0))
self.plotdata = ArrayPlotData(LF_img=self.sampleLF())
self.LF_img = Plot(self.plotdata)
if self.NCHANNELS == 3:
self.LF_img.img_plot("LF_img")
else:
self.LF_img.img_plot("LF_img", colormap=gray)
Example 9: mds
#.........part of the code omitted here.........
(1.0, 0.0, 0.0))}
mymap = LinearSegmentedColormap('mymap', cdict)
mymap.set_over( (1.0, 0.0, 1.0) )
mymap.set_bad ( (0.0, 0.0, 0.0) )
pl.plt.register_cmap(cmap=mymap)
# set up arrays for calculations
coords_gpu = gpuarray.to_gpu(np.random.random( (max_x, max_y, DIMENSIONS) ).astype(np.float32)) # initialize coordinates to random values in range 0...1
forces_gpu = gpuarray.zeros( (int(max_x), int(max_y), DIMENSIONS), dtype=np.float32 ) # 3D float32 accumulate forces over one timestep
weights_gpu = gpuarray.zeros( (int(max_x), int(max_y)), dtype=np.float32 ) # 2D float32 cell error accumulation
errors_gpu = gpuarray.zeros( (int(max_x), int(max_y)), dtype=np.float32 ) # 2D float32 cell error accumulation
near_stations_gpu = gpuarray.zeros( (cuda_grid_shape[0], cuda_grid_shape[1], N_NEARBY_STATIONS), dtype=np.int32)
debug_gpu = gpuarray.zeros( n_gridpoints, dtype = np.int32 )
debug_img_gpu = gpuarray.zeros_like( errors_gpu )
print "\n----COMPILATION----"
# times could be merged into forces kernel, if done by pixel not station.
# integrate kernel could be GPUArray operation; also helps clean up code by using GPUArrays.
# DIM should be replaced by python script, so as not to define twice.
src = open("unified_mds.cu").read()
src = src.replace( 'N_NEARBY_STATIONS_PYTHON', str(N_NEARBY_STATIONS) )
src = src.replace( 'N_STATIONS_PYTHON', str(n_stations) )
src = src.replace( 'DIMENSIONS_PYTHON', str(DIMENSIONS) )
mod = SourceModule(src, options=["--ptxas-options=-v"])
stations_kernel = mod.get_function("stations" )
forces_kernel = mod.get_function("forces" )
integrate_kernel = mod.get_function("integrate")
matrix_texref = mod.get_texref('tex_matrix')
station_coords_texref = mod.get_texref('tex_station_coords')
near_stations_texref = mod.get_texref('tex_near_stations')
#ts_coords_texref = mod.get_texref('tex_ts_coords') could be a 4-channel 2 dim texture, or 3 dim texture. or just 1D.
cuda.matrix_to_texref(matrix, matrix_texref, order="F") # copy directly to device with texref - made for 2D x 1channel textures
cuda.matrix_to_texref(station_coords_int, station_coords_texref, order="F") # fortran ordering, because we will be accessing with texND() instead of C-style indices
near_stations_gpu.bind_to_texref_ext(near_stations_texref)
# note, cuda.In and cuda.Out are from the perspective of the KERNEL not the host app!
stations_kernel(near_stations_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
autoinit.context.synchronize()
#print "Near stations list:"
#print near_stations_gpu
print "\n----CALCULATION----"
t_start = time.time()
n_pass = 0
while (n_pass < N_ITERATIONS) :
n_pass += 1
# Pay attention to grid sizes when testing: if you don't run the integrator on the coordinates connected to stations,
# they don't move... so the whole thing stabilizes in a couple of cycles.
# Stations are worked on in blocks to avoid locking up the GPU with one giant kernel.
for subset_low in range(0, n_stations, STATION_BLOCK_SIZE) :
subset_high = subset_low + STATION_BLOCK_SIZE
if subset_high > n_stations : subset_high = n_stations
sys.stdout.write( "\rpass %03i / station %04i of %04i / total runtime %03.1f min " % (n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0) )
sys.stdout.flush()
STATUS_FUNCTION(n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0, (time.time() - t_start) / n_pass + (subset_low/n_stations))
# adding texrefs in kernel call seems to change nothing, leaving them out.
# max_x and max_y could be #defined in kernel source, along with STATION_BLOCK_SIZE
forces_kernel(np.int32(n_stations), np.int32(subset_low), np.int32(subset_high), max_x, max_y, coords_gpu, forces_gpu, weights_gpu, errors_gpu, debug_gpu, debug_img_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
autoinit.context.synchronize()
# print coords_gpu, forces_gpu
time.sleep(0.5) # let the user interface catch up.
integrate_kernel(max_x, max_y, coords_gpu, forces_gpu, weights_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
autoinit.context.synchronize()
print IMAGES_EVERY
if (IMAGES_EVERY > 0) and (n_pass % IMAGES_EVERY == 0) :
#print 'Kernel debug output:'
#print debug_gpu
velocities = np.sqrt(np.sum(forces_gpu.get() ** 2, axis = 2))
pl.imshow( velocities.T, cmap=mymap, origin='bottom', vmin=0, vmax=100 )
pl.title( 'Velocity ( sec / timestep) - step %03d' % n_pass )
pl.colorbar()
pl.savefig( 'img/vel%03d.png' % n_pass )
pl.close()
pl.imshow( (errors_gpu.get() / weights_gpu.get() / 60.0 ).T, cmap=mymap, origin='bottom', vmin=0, vmax=100 )
pl.title( 'Average absolute error (min) - step %03d' %n_pass )
pl.colorbar()
pl.savefig( 'img/err%03d.png' % n_pass )
pl.close()
pl.imshow( (debug_img_gpu.get() / 60.0).T, cmap=mymap, origin='bottom', vmin=0, vmax=100 )
pl.title( 'Debugging Output - step %03d' %n_pass )
pl.colorbar()
pl.savefig( 'img/debug%03d.png' % n_pass )
pl.close()
GRAPH_FUNCTION('img/err%03d.png' % n_pass)
#INTERFACE.update
sys.stdout.write( "/ avg pass time %02.1f sec" % ( (time.time() - t_start) / n_pass, ) )
sys.stdout.flush()
Example 10: trikmeans_gpu
#.........part of the code omitted here.........
#---------------------------------------------------------------
# prepare source modules
#---------------------------------------------------------------
t1 = time.time()
mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters,
blocksize_step4, seqcount_step4, gridsize_step4,
blocksize_step4part2, useTextureForData)
ccdist = mod_ccdist.get_function("ccdist")
calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
init = mod_ccdist.get_function("init")
step3 = mod_ccdist.get_function("step3")
step4 = mod_ccdist.get_function("step4")
step4part2 = mod_ccdist.get_function("step4part2")
calc_movement = mod_ccdist.get_function("calc_movement")
step56 = mod_ccdist.get_function("step56")
pycuda.autoinit.context.synchronize()
t2 = time.time()
module_time = t2-t1
#---------------------------------------------------------------
# setup data on GPU
#---------------------------------------------------------------
t1 = time.time()
data = np.array(data).astype(np.float32)
clusters = np.array(clusters).astype(np.float32)
if useTextureForData:
# copy the data to the texture
texrefData = mod_ccdist.get_texref("texData")
cuda.matrix_to_texref(data, texrefData, order="F")
else:
if usePageLockedMemory:
data_pl = cuda.pagelocked_empty_like(data)
data_pl[:,:] = data;
gpu_data = gpuarray.to_gpu(data_pl)
else:
gpu_data = gpuarray.to_gpu(data)
if usePageLockedMemory:
clusters_pl = cuda.pagelocked_empty_like(clusters)
clusters_pl[:,:] = clusters
gpu_clusters = gpuarray.to_gpu(clusters_pl)
else:
gpu_clusters = gpuarray.to_gpu(clusters)
gpu_assignments = gpuarray.zeros((nPts,), np.int32) # cluster assignment
gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32) # lower bounds on distance between
# point and each cluster
gpu_upper = gpuarray.zeros((nPts,), np.float32) # upper bounds on distance between
# point and any cluster
gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32) # cluster-cluster distances
gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32) # half distance to closest
gpu_hdClosest.fill(1.0e10) # set to large value // **TODO** get the actual float max
gpu_badUpper = gpuarray.zeros((nPts,), np.int32) # flag to indicate upper bound needs recalc
gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32);
gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32);
gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32)
gpu_cluster_changed.fill(1)
gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_step4), np.float32)
Example 11: main
def main():
#Set up global timer
tot_time = time.time()
#Define constants
BankSize = 8 # Do not go beyond 8!
WarpSize = 32 #Do not change...
DimGridX = 19
DimGridY = 19
BlockDimX = 256
BlockDimY = 256
SearchSpaceSize = 2**24 #BlockDimX * BlockDimY * 32
FitnessValDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
GenomeDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
AlignedByteLengthGenome = 4
#Create dictionary argument for rendering
RenderArgs= {"safe_memory_mapping":1,
"aligned_byte_length_genome":AlignedByteLengthGenome,
"bit_length_edge_type":3,
"curand_nr_threads_per_block":32,
"nr_tile_types":2,
"nr_edge_types":8,
"warpsize":WarpSize,
"fit_dim_thread_x":32*BankSize,
"fit_dim_thread_y":1,
"fit_dim_block_x":BlockDimX,
"fit_dim_grid_x":19,
"fit_dim_grid_y":19,
"fit_nr_four_permutations":24,
"fit_length_movelist":244,
"fit_nr_redundancy_grid_depth":2,
"fit_nr_redundancy_assemblies":10,
"fit_tile_index_starting_tile":0,
"glob_nr_tile_orientations":4,
"banksize":BankSize,
"curand_dim_block_x":BlockDimX
}
# Set environment for template package Jinja2
env = Environment(loader=PackageLoader('main', './'))
# Load source code from file
Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
# Render source code
RenderedSource = Source.render( RenderArgs )
# Save rendered source code to file
f = open('./rendered.cu', 'w')
f.write(RenderedSource)
f.close()
#Load source code into module
KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
Kernel = KernelSourceModule.get_function("SearchSpaceKernel")
CurandKernel = KernelSourceModule.get_function("CurandInitKernel")
#Initialise InteractionMatrix
InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
def Delta(a,b):
if a==b:
return 1
else:
return 0
for i in range(InteractionMatrix.shape[0]):
for j in range(InteractionMatrix.shape[1]):
InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )
#Set up our InteractionMatrix
InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
print InteractionMatrix
#Set-up genomes
#dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
#for i in range(0, GenomeDim/4):
#dest[i*8 + 0] = int('0b00100101',2) #CRASHES
#dest[i*8 + 1] = int('0b00010000',2) #CRASHES
#dest[i*8 + 0] = int('0b00101000',2)
#dest[i*8 + 1] = int('0b00000000',2)
#dest[i*8 + 2] = int('0b00000000',2)
#dest[i*8 + 3] = int('0b00000000',2)
#dest[i*8 + 4] = int('0b00000000',2)
#dest[i*8 + 5] = int('0b00000000',2)
#dest[i*8 + 6] = int('0b00000000',2)
#dest[i*8 + 7] = int('0b00000000',2)
# dest[i*4 + 0] = 40
# dest[i*4 + 1] = 0
# dest[i*4 + 2] = 0
# dest[i*4 + 3] = 0
dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes)
#drv.memcpy_htod(dest_h, dest)
#print "Genomes before: "
#print dest
#Set-up grids
#grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
#grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
#drv.memcpy_htod(grids_h, grids)
#print "Grids:"
#.........part of the code omitted here.........
Example 12: trikmeans_gpu
def trikmeans_gpu(data, clusters, iterations, return_times = 0):
# trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)
# kmeans using triangle inequality algorithm and cuda
# input arguments are the data, initial cluster values, and number of iterations to repeat
# The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
# nPts = number of data points
# The shape of clusters is (nDim, nClusters)
#
# The return values are the updated clusters and labels for the data
#---------------------------------------------------------------
# get problem parameters
#---------------------------------------------------------------
(nDim, nPts) = data.shape
nClusters = clusters.shape[1]
#---------------------------------------------------------------
# set calculation control variables
#---------------------------------------------------------------
useTextureForData = 1
# block and grid sizes for the ccdist kernel (also for hdclosest)
blocksize_ccdist = min(512, 16*(1+(nClusters-1)/16))
gridsize_ccdist = 1 + (nClusters-1)/blocksize_ccdist
#block and grid sizes for the init module
threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16)
blocksize_init = min(512, threads_desired)
gridsize_init = 1 + (threads_desired - 1)/blocksize_init
#block and grid sizes for the step3 module
blocksize_step3 = blocksize_init
gridsize_step3 = gridsize_init
#block and grid sizes for the step4 module
for blocksize_step4_x in range(32, 512, 32):
if blocksize_step4_x >= nClusters:
break;
blocksize_step4_y = min(nDim, 512/blocksize_step4_x)
gridsize_step4_x = 1 + (nClusters-1)/blocksize_step4_x
gridsize_step4_y = 1 + (nDim-1)/blocksize_step4_y
#block and grid sizes for the calc_movement module
blocksize_calcm = blocksize_step4_x
gridsize_calcm = gridsize_step4_x
#block and grid sizes for the step56 module
blocksize_step56 = blocksize_init
gridsize_step56 = gridsize_init
#---------------------------------------------------------------
# prepare source modules
#---------------------------------------------------------------
t1 = time.time()
mod_ccdist = mods2.get_ccdist_module(nDim, nPts, nClusters, blocksize_ccdist, blocksize_init,
blocksize_step4_x, blocksize_step4_y, blocksize_step56,
useTextureForData)
#mod_step56 = mods2.get_step56_module(nDim, nPts, nClusters, blocksize_step56)
ccdist = mod_ccdist.get_function("ccdist")
calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
init = mod_ccdist.get_function("init")
step3 = mod_ccdist.get_function("step3")
step4 = mod_ccdist.get_function("step4")
calc_movement = mod_ccdist.get_function("calc_movement")
step56 = mod_ccdist.get_function("step56")
pycuda.autoinit.context.synchronize()
t2 = time.time()
module_time = t2-t1
#---------------------------------------------------------------
# setup data on GPU
#---------------------------------------------------------------
t1 = time.time()
data = np.array(data).astype(np.float32)
clusters = np.array(clusters).astype(np.float32)
if useTextureForData:
# copy the data to the texture
texrefData = mod_ccdist.get_texref("texData")
cuda.matrix_to_texref(data, texrefData, order="F")
else:
gpu_data = gpuarray.to_gpu(data)
gpu_clusters = gpuarray.to_gpu(clusters)
gpu_assignments = gpuarray.zeros((nPts,), np.int32) # cluster assignment
gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32) # lower bounds on distance between
# point and each cluster
gpu_upper = gpuarray.zeros((nPts,), np.float32) # upper bounds on distance between
# point and any cluster
gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32) # cluster-cluster distances
gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32) # half distance to closest
#.........part of the code omitted here.........
Example 13: main
#.........part of the code omitted here.........
"fit_nr_subgridsperbank":int(NrSubgridsPerBank),
"glob_bitlength_edgetype":int(EdgeTypeBitLength)
}
# Render source code
RenderedSource = Source.render( RenderArgs )
# Save rendered source code to file
f = open('./rendered.cu', 'w')
f.write(RenderedSource)
f.close()
#Load source code into module
KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_11", code="sm_11", cache_dir=None)
#Allocate values on GPU
Genomes_h = drv.mem_alloc(Genomes.nbytes)
FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
ReductionList_h = drv.mem_alloc(ReductionList.nbytes)
#Copy values to global memory
drv.memcpy_htod(Genomes_h, Genomes)
drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
drv.memcpy_htod(FitnessValues_h, FitnessValues)
drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
drv.memcpy_htod(Mutexe_h, Mutexe)
#Copy values to constant / texture memory
for id in range(0, NrFitnessFunctionGrids):
FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) )
drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C")
InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address
drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address
drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address
drv.memcpy_htod(GAParams_h[0], GAParams)
FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")
#Set up curandStates
curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)
#Compile kernels
curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
sorting_fnc = KernelSourceModule.get_function("SortingKernel")
ga_fnc = KernelSourceModule.get_function("GAKernel")
#Initialise Curand
curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1))
#Build parameter lists for FitnessKernel and GAKernel
FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h);
SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h);
Example 14: main
def main():
#FourPermutations set-up
FourPermutations = numpy.array([ [1,2,3,4],
[1,2,4,3],
[1,3,2,4],
[1,3,4,2],
[1,4,2,3],
[1,4,3,2],
[2,1,3,4],
[2,1,4,3],
[2,3,1,4],
[2,3,4,1],
[2,4,1,3],
[2,4,3,1],
[3,2,1,4],
[3,2,4,1],
[3,1,2,4],
[3,1,4,2],
[3,4,2,1],
[3,4,1,2],
[4,2,3,1],
[4,2,1,3],
[4,3,2,1],
[4,3,1,2],
[4,1,2,3],
[4,1,3,2],]).astype(numpy.uint8)
#Create dictionary argument for rendering
RenderArgs= {"safe_memory_mapping":1,
"aligned_byte_length_genome":8,
"bit_length_edge_type":3,
"curand_nr_threads_per_block":256,
"nr_tile_types":4,
"nr_edge_types":8,
"warpsize":32,
"fit_dim_thread_x":1,
"fit_dim_thread_y":1,
"fit_dim_block_x":1,
"fit_dim_grid_x":19,
"fit_dim_grid_y":19,
"fit_nr_four_permutations":24,
"fit_length_movelist":244,
"fit_nr_redundancy_grid_depth":2,
"fit_nr_redundancy_assemblies":10,
"fit_tile_index_starting_tile":0,
"glob_nr_tile_orientations":4
}
# Set environment for template package Jinja2
env = Environment(loader=PackageLoader('main', './'))
# Load source code from file
Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
# Render source code
RenderedSource = Source.render( RenderArgs )
# Save rendered source code to file
f = open('./rendered.cu', 'w')
f.write(RenderedSource)
f.close()
#Load source code into module
KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
Kernel = KernelSourceModule.get_function("TestAssemblyKernel")
CurandKernel = KernelSourceModule.get_function("CurandInitKernel")
#Initialise InteractionMatrix
InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
def Delta(a,b):
if a==b:
return 1
else:
return 0
for i in range(InteractionMatrix.shape[0]):
for j in range(InteractionMatrix.shape[1]):
InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )
#Set up our InteractionMatrix
InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
print InteractionMatrix
#Set-up genomes
dest = numpy.arange(256).astype(numpy.uint8)
for i in range(0, 256/8):
#dest[i*8 + 0] = int('0b00100101',2) #CRASHES
#dest[i*8 + 1] = int('0b00010000',2) #CRASHES
#dest[i*8 + 0] = int('0b00101000',2)
#dest[i*8 + 1] = int('0b00000000',2)
#dest[i*8 + 2] = int('0b00000000',2)
#dest[i*8 + 3] = int('0b00000000',2)
#dest[i*8 + 4] = int('0b00000000',2)
#dest[i*8 + 5] = int('0b00000000',2)
#dest[i*8 + 6] = int('0b00000000',2)
#dest[i*8 + 7] = int('0b00000000',2)
dest[i*8 + 0] = 36
dest[i*8 + 1] = 151
dest[i*8 + 2] = 90
dest[i*8 + 3] = 109
#.........part of the code omitted here.........
Example 15: run
#.........part of the code omitted here.........
coords_gpu = gpuarray.to_gpu(np.random.random( (max_x, max_y, self.DIMENSIONS) ).astype(np.float32)) # initialize coordinates to random values in range 0...1
forces_gpu = gpuarray.zeros( (int(max_x), int(max_y), self.DIMENSIONS), dtype=np.float32 ) # 3D float32 accumulate forces over one timestep
weights_gpu = gpuarray.zeros( (int(max_x), int(max_y)), dtype=np.float32 ) # 2D float32 cell error accumulation
errors_gpu = gpuarray.zeros( (int(max_x), int(max_y)), dtype=np.float32 ) # 2D float32 cell error accumulation
#near_stations_gpu = gpuarray.zeros( (int(max_x), int(max_y), self.N_NEARBY_STATIONS, 2), dtype=np.int32)
# instead of using synthetic distances, use the network distance near stations lists.
# rather than copying the array over to the GPU then binding to external texref,
# could just use matrix_to_texref, but this function only seems to understand 2d arrays
near_stations_gpu = gpuarray.to_gpu( nearby_stations )
debug_gpu = gpuarray.zeros( n_gridpoints, dtype = np.int32 )
debug_img_gpu = gpuarray.zeros_like( errors_gpu )
print "\n----COMPILATION----"
# times could be merged into forces kernel, if done by pixel not station.
# integrate kernel could be GPUArray operation; also helps clean up code by using GPUArrays.
# DIM should be replaced by python script, so as not to define twice.
replacements = [( 'N_NEAR_STATIONS', self.N_NEARBY_STATIONS ),
( 'N_STATIONS', n_stations ),
( 'DIM', self.DIMENSIONS)]
src = preprocess_cu('unified_mds_stochastic.cu', replacements)
#print src
mod = SourceModule(src, options=["--ptxas-options=-v"])
stations_kernel = mod.get_function("stations" )
forces_kernel = mod.get_function("forces" )
integrate_kernel = mod.get_function("integrate")
matrix_texref = mod.get_texref('tex_matrix')
station_coords_texref = mod.get_texref('tex_station_coords')
near_stations_texref = mod.get_texref('tex_near_stations')
#ts_coords_texref = mod.get_texref('tex_ts_coords') could be a 4-channel 2 dim texture, or 3 dim texture. or just 1D.
cuda.matrix_to_texref(matrix, matrix_texref, order="F") # copy directly to device with texref - made for 2D x 1channel textures
cuda.matrix_to_texref(station_coords_int, station_coords_texref, order="F") # fortran ordering, because we will be accessing with texND() instead of C-style indices
# again, matrix_to_texref is not used here because that function only understands 2D arrays
near_stations_gpu.bind_to_texref_ext(near_stations_texref)
# note, cuda.In and cuda.Out are from the perspective of the KERNEL not the host app!
# stations_kernel disabled since true network distances are now being used
#stations_kernel(near_stations_gpu, max_x, max_y, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
# autoinit.context.synchronize()
#self.cuda_context.synchronize()
#print "Near stations list:"
#print near_stations_gpu
#sys.exit()
print "\n----CALCULATION----"
t_start = time.time()
n_pass = 0
active_cells = list(self.active_cells) # make a working list of map cells that are accessible
print "N active cells:", len(active_cells)
while (n_pass < self.N_ITERATIONS) :
n_pass += 1
random.shuffle(active_cells)
active_cells_gpu = gpuarray.to_gpu(np.array(active_cells).astype(np.int32))
# Pay attention to grid sizes when testing: if you don't run the integrator on the coordinates connected to stations,
# they don't move... so the whole thing stabilizes in a couple of cycles.
# Stations are worked on in blocks to avoid locking up the GPU with one giant kernel.
# for subset_low in range(0, n_stations, self.STATION_BLOCK_SIZE) :
for subset_low in range(1) : # changed to try integrating more often
subset_high = subset_low + self.STATION_BLOCK_SIZE
if subset_high > n_stations : subset_high = n_stations
sys.stdout.write( "\rpass %03i / station %04i of %04i / total runtime %03.1f min " % (n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0) )