本文整理汇总了 Python 中 pycuda.driver.mem_alloc 函数的典型用法代码示例。如果您正苦于以下问题：Python mem_alloc 函数的具体用法？Python mem_alloc 怎么用？Python mem_alloc 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 mem_alloc 函数的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: diffuse_pycuda
def diffuse_pycuda(u):
    """Step a 2-D field on the GPU with the ``copy_array``/``update`` kernels.

    The first row and first column of ``u`` are set to 200 *in place* (the
    caller's array is modified) before the field is converted to float32
    and uploaded.  The two kernels, which are defined outside this
    function, are then launched once per time step.

    Args:
        u: 2-D NumPy array holding the initial field.

    Returns:
        A new float32 array with the field after the last time step.
    """
    nx, _ = np.int32(u.shape)  # unpacking also enforces a 2-D input
    alpha = np.float32(0.645)
    dx = np.float32(3.5 / (nx - 1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time / dt))

    # Fixed values on the first row and column; this writes into the
    # caller's array before the float32 copy is made.
    u[0, :] = 200
    u[:, 0] = 200
    u = u.astype(np.float32)
    u_prev = u.copy()

    u_d = cuda.mem_alloc(u.nbytes)
    u_prev_d = cuda.mem_alloc(u_prev.nbytes)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # Integer ceiling division.  ``np.ceil(nx / BLOCKSIZE)`` floors first
    # when ``/`` is integer division (Python 2), which leaves the trailing
    # cells uncovered whenever nx is not a multiple of BLOCKSIZE.
    blocks_per_side = int((nx + BLOCKSIZE - 1) // BLOCKSIZE)
    # NOTE(review): both grid dimensions are derived from nx and the
    # kernels only receive nx -- presumably the field is square; confirm.
    gridSize = (blocks_per_side, blocks_per_side, 1)
    blockSize = (BLOCKSIZE, BLOCKSIZE, 1)

    for _step in range(nt + 1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE),
                   block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE),
               block=blockSize, grid=gridSize)

    cuda.memcpy_dtoh(u, u_d)
    return u
示例2: get_spharms_l_eq_2
def get_spharms_l_eq_2(theta, phi, selected_Modes_gpu, rslt_gpu):
    """Launch the ``compute_sph_harmonics_l_eq_2`` kernel, writing to ``rslt_gpu``.

    The mode list, mode count and sample count are uploaded to freshly
    allocated device buffers and passed to the kernel together with the
    caller-supplied ``theta``, ``phi`` and ``rslt_gpu``.

    Args:
        theta: Passed straight to the kernel as its first argument.
        phi: Passed straight to the kernel as its second argument.
        selected_Modes_gpu: Not used by this function.
        rslt_gpu: Passed to the kernel as its last argument; nothing is
            copied back to the host here.

    Returns:
        None.
    """
    # NOTE(review): ``selected_modes`` and ``nsamps`` are read from the
    # enclosing scope, not from the parameters -- confirm that is intended.
    modelist = np.array(sorted(mode[1] for mode in selected_modes)).astype(np.int32)
    modelist_gpu = cuda.mem_alloc(modelist.nbytes)
    nmodeslen = np.array(len(modelist), ndmin=1).astype(np.int32)

    nsamps_gpu = cuda.mem_alloc(nsamps.nbytes)
    nmodes_gpu = cuda.mem_alloc(nmodeslen.nbytes)
    cuda.memcpy_htod(nsamps_gpu, nsamps)
    cuda.memcpy_htod(nmodes_gpu, nmodeslen)
    cuda.memcpy_htod(modelist_gpu, modelist)

    # Get the compiled cuda function
    sph = mod.get_function("compute_sph_harmonics_l_eq_2")

    # The kernel writes into the caller's ``rslt_gpu``; the extra
    # ``result_gpu`` scratch buffer that used to be allocated here was never
    # passed to the kernel or read back, so it only wasted device memory.
    blk = (1024, 1, 1)
    grd = (1, 1, 1)
    sph(theta, phi, modelist_gpu, nmodes_gpu, nsamps_gpu, rslt_gpu,
        block=blk, grid=grd)
    return
示例3: main
def main():
    """Allocate three image-sized device buffers and process 50 image pairs.

    Launch dimensions are chosen either for a flat 1-D layout (``LINEAR``
    true) or for a 16 x 8 x depth thread block, then ``do_work`` is called
    with the file-name pairs ``form1.N.png`` / ``form2.N.png`` for
    N = 1..50.
    """
    (h, w), d = (826, 1169), 3  # img1.size, len(img1_arr[0][0])

    # Block counts use integer ceiling division so they stay ints even when
    # ``/`` is true division; a float block count is not a valid grid size.
    if LINEAR:
        thread_x, thread_y, thread_z = 128, 1, 1
        block_x, block_y = (w * h * d + thread_x - 1) // thread_x, 1
    else:
        thread_x, thread_y, thread_z = 16, 8, d
        block_x = (h + thread_x - 1) // thread_x
        block_y = (w + thread_y - 1) // thread_y

    # 4 bytes for each of the h * w * d image elements (= 2896782 * 4).
    image_data_size = h * w * d * 4
    a_gpu = cuda.mem_alloc(image_data_size)
    b_gpu = cuda.mem_alloc(image_data_size)
    c_gpu = cuda.mem_alloc(image_data_size)

    image_path_pairs = [
        ('form1.%d.png' % page_num, 'form2.%d.png' % page_num)
        for page_num in range(1, 51)
    ]
    do_work(image_path_pairs, a_gpu, b_gpu, c_gpu,
            (thread_x, thread_y, thread_z), (block_x, block_y))
示例4: __compute_sub_gaussian_gpu
def __compute_sub_gaussian_gpu(self, sub_partitions):
    """Process ``self.pts`` in ``sub_partitions`` equal slices plus a remainder.

    ``self.pts`` is rescaled in place using ``self.axis`` (columns 0 and 1),
    then each slice is handed to ``__compute_guassian_on_pts`` while
    ``self.pts_gpu`` holds a device buffer sized for one slice.  Leftover
    rows that do not fill a whole slice are processed last with a buffer
    of their own size.

    Args:
        sub_partitions: Number of equal slices; must satisfy
            ``1 <= sub_partitions <= len(self.pts)``.

    Raises:
        Exception: If ``sub_partitions`` is outside that range.
    """
    if sub_partitions < 1:
        raise Exception("You can't have less than 1 partition")
    elif sub_partitions > self.pts.shape[0]:
        raise Exception("sub partitions need to be smaller than pts size")

    n_pts = self.pts.shape[0]
    # Rows per slice.  Floor division keeps this an int so it can be used
    # as a slice bound and as an allocation size under true division too.
    d_part = n_pts // sub_partitions
    remainder = n_pts % sub_partitions
    bytes_per_row = 2 * self.pts.itemsize

    self.pts_gpu = cuda.mem_alloc(int(d_part * bytes_per_row))
    try:
        self.pts[:, 0] = (self.pts[:, 0] - self.axis[0])/(self.axis[1] - self.axis[0])
        self.pts[:, 1] = (self.pts[:, 1] - self.axis[2])/(self.axis[3] - self.axis[2])
        for partition in range(sub_partitions):
            sub_pts = self.pts[partition*d_part:(partition+1)*d_part, :]
            self.__compute_guassian_on_pts(sub_pts)
    finally:
        # Release the device buffer even if a slice fails.
        self.pts_gpu.free()

    # Handle the rows left over after the equal slices.
    if remainder:
        self.pts_gpu = cuda.mem_alloc(int(remainder * bytes_per_row))
        try:
            self.__compute_guassian_on_pts(self.pts[sub_partitions*d_part:, :])
        finally:
            self.pts_gpu.free()
示例5: calc_psd
def calc_psd(self, bitloads, xtalk):
    """Run the ``k_calcpsd`` kernel and return its P output as float64.

    Args:
        bitloads: Bit-load data; flattened with ``util.mat2arr`` and
            uploaded as int32.
        xtalk: Array uploaded as ``self.type``.

    Returns:
        A ``(self.K, self.N)`` float64 array copied back from the device.
    """
    # Number of expected permutations
    Ncombinations = self.K
    # Only the launch dimensions are needed from the workload calculation.
    (_, _, threadCount, blockCount) = self._workload_calc(Ncombinations)
    threadshare_grid = (blockCount, 1)
    threadshare_block = (threadCount, 1, 1)

    # Buffer sizes are computed from the element size directly rather than
    # by building throwaway host arrays just to read their ``nbytes``.
    fpt_size = np.dtype(self.type).itemsize
    int_size = np.dtype(np.int32).itemsize

    allocated = []

    def _alloc(count, itemsize):
        buf = cuda.mem_alloc(int(count * itemsize))
        allocated.append(buf)
        return buf

    try:
        # Memory (We get away with the NCombinations because calpsd checks against it)
        d_a = _alloc(Ncombinations * self.N * self.N, fpt_size)
        d_p = _alloc(Ncombinations * self.N, fpt_size)
        d_bitload = _alloc(self.K * self.N, int_size)
        d_XTG = _alloc(self.K * self.N * self.N, fpt_size)
        h_p = np.zeros((self.K, self.N), dtype=self.type)

        cuda.memcpy_htod(d_bitload, util.mat2arr(bitloads).astype(np.int32))
        cuda.memcpy_htod(d_XTG, xtalk.astype(self.type))

        # Go solve
        # __global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){
        self.k_calcpsd(d_a, d_p, d_XTG, d_bitload, np.int32(Ncombinations),
                       block=threadshare_block, grid=threadshare_grid)
        cuda.Context.synchronize()
        cuda.memcpy_dtoh(h_p, d_p)
    finally:
        # Free whatever was allocated, including on an error part-way through.
        for buf in allocated:
            buf.free()

    return h_p.astype(np.float64)
示例6: cuda_crossOver
def cuda_crossOver(sola, solb):
    """Run the ``crossOver`` CUDA kernel on two solutions and return the results.

    Both arrays are uploaded, the kernel is launched with a single block of
    ``len(sola)`` threads (no grid argument is passed), and the device
    buffers are copied back into fresh arrays, so the inputs themselves are
    left untouched.

    Args:
        sola: First solution array.
        solb: Second solution array.

    Returns:
        Tuple ``(a_new, b_new)`` with the contents of the two device buffers
        after the kernel has run.
    """
    sol_len = len(sola)
    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)
    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    # NOTE(review): the block size equals the solution length, so the
    # device's threads-per-block limit bounds how long a solution may be.
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)
    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        # The inputs are ``sola``/``solb``; the previous code printed the
        # undefined names ``a`` and ``b``.
        print("a: %s" % (sola,))
        print("b: %s" % (solb,))
        print("new a: %s" % (a_new,))
        print("new b: %s" % (b_new,))
    return a_new, b_new
示例7: alloc
def alloc(self, dim, stream=None):
    """
    Ensure that this object's framebuffers are large enough to handle the
    given dimensions, allocating new ones if not.

    If ``stream`` is not None and a reallocation is necessary, the stream
    will be synchronized before the old buffers are deallocated.

    Args:
        dim: Object exposing ``ah`` and ``astride``; their product is the
            number of bins required.
        stream: Optional stream forwarded to ``self.free``.

    Raises:
        cuda.MemoryError: If the device allocation fails; the buffers are
            released via ``self.free(stream)`` before re-raising.
    """
    nbins = dim.ah * dim.astride
    if self.nbins is not None:
        # Only compare once we know buffers exist; ordering None against an
        # int is an error on Python 3.
        if self.nbins >= nbins:
            return
        # Forward the stream so the documented synchronization actually
        # happens before the old buffers are released.
        self.free(stream)
    try:
        # 16 bytes per bin for each of the three buffers.
        self.d_front = cuda.mem_alloc(16 * nbins)
        self.d_back = cuda.mem_alloc(16 * nbins)
        self.d_side = cuda.mem_alloc(16 * nbins)
        self.nbins = nbins
    except cuda.MemoryError:
        # If a frame that's too large sneaks by the task distributor, we
        # don't want to kill the server, but we also don't want to leave
        # it stuck without any free memory to complete the next alloc.
        # TODO: measure free mem and only take tasks that fit (but that
        # should be done elsewhere)
        self.free(stream)
        # Bare raise keeps the original traceback.
        raise
示例8: calc_blob_blob_forces_pycuda
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
    """Evaluate the ``calc_blob_blob_force`` kernel for a set of blobs.

    Args:
        r_vectors: Sequence of blob positions; flattened to length
            ``3 * len(r_vectors)`` before upload.
        *args: Ignored.
        **kwargs: Read keys are ``periodic_length`` (indexed at 0, 1, 2),
            ``repulsion_strength``, ``debye_length`` and ``blob_radius``.

    Returns:
        Array of shape ``(len(r_vectors), 3)`` holding the kernel's output.
    """
    # Launch configuration for the GPU
    number_of_blobs = np.int32(len(r_vectors))
    threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

    # Parameters supplied by the caller
    periodic_length = kwargs.get('periodic_length')
    repulsion_strength = kwargs.get('repulsion_strength')
    debye_length = kwargs.get('debye_length')
    blob_radius = kwargs.get('blob_radius')

    # Flat host arrays: input positions and output forces
    positions = np.reshape(r_vectors, number_of_blobs * 3)
    forces = np.empty_like(positions)

    # Device buffers, then host-to-device upload of the positions
    positions_gpu = cuda.mem_alloc(positions.nbytes)
    forces_gpu = cuda.mem_alloc(forces.nbytes)
    cuda.memcpy_htod(positions_gpu, positions)

    force_kernel = mod.get_function("calc_blob_blob_force")

    # Scalar kernel arguments are all passed as float64, in kernel order.
    scalar_args = [
        np.float64(repulsion_strength),
        np.float64(debye_length),
        np.float64(blob_radius),
        np.float64(periodic_length[0]),
        np.float64(periodic_length[1]),
        np.float64(periodic_length[2]),
    ]
    kernel_args = [positions_gpu, forces_gpu] + scalar_args + [number_of_blobs]
    force_kernel(*kernel_args,
                 block=(threads_per_block, 1, 1),
                 grid=(num_blocks, 1))

    # Device-to-host download of the result
    cuda.memcpy_dtoh(forces, forces_gpu)
    return np.reshape(forces, (number_of_blobs, 3))
示例9: __init__
def __init__(self, max_size, offsets=None):
    """
    Create a sorter. The sorter will hold on to internal resources for as
    long as it is alive, including an 'offsets' array of size 4*max_size.
    To share this cost, you may pass in an array of at least this size to
    __init__ (to, for instance, share across different bit-widths in a
    multi-pass sort).

    Args:
        max_size: Maximum number of elements; must be a multiple of
            ``self.group_size``.
        offsets: Optional pre-allocated device buffer to use instead of
            allocating a new ``4 * max_size`` byte one.
    """
    self.init_mod()
    self.max_size = max_size
    assert max_size % self.group_size == 0
    # Exact by the assertion above; ``//`` keeps the count an int so the
    # allocation sizes below stay valid under true division.
    max_grids = max_size // self.group_size

    if offsets is None:
        self.doffsets = cuda.mem_alloc(self.max_size * 4)
    else:
        self.doffsets = offsets
    self.dpfxs = cuda.mem_alloc(max_grids * self.radix_size * 4)
    self.dlocals = cuda.mem_alloc(max_grids * self.radix_size * 4)

    # There are probably better ways to choose how many condensation
    # groups to launch. TODO: maybe pick one if I care
    self.ncond = 32
    self.dcond = cuda.mem_alloc(self.radix_size * self.ncond * 4)
    self.dglobal = cuda.mem_alloc(self.radix_size * 4)
示例10: prepare_device_arrays
def prepare_device_arrays(self):
    """Upload the layer tables and the fine energy/coszen centres to the device.

    The three layer arrays are filled by ``self.grid_prop`` and stored on the
    device as ``d_numLayers``, ``d_densityInLayer`` and ``d_distanceInLayer``;
    ``self.ecen_fine`` and ``self.czcen_fine`` are uploaded as
    ``d_ecen_fine`` and ``d_czcen_fine``.
    """
    self.maxLayers = self.grid_prop.GetMaxLayers()
    nczbins_fine = len(self.czcen_fine)
    per_layer_len = nczbins_fine * self.maxLayers

    # Host-side buffers that grid_prop fills in place.
    numLayers = np.zeros(nczbins_fine, dtype=np.int32)
    densityInLayer = np.zeros(per_layer_len, dtype=self.FTYPE)
    distanceInLayer = np.zeros(per_layer_len, dtype=self.FTYPE)
    self.grid_prop.GetNumberOfLayers(numLayers)
    self.grid_prop.GetDensityInLayer(densityInLayer)
    self.grid_prop.GetDistanceInLayer(distanceInLayer)

    # Allocate all buffers of a group first, then copy, one group at a time.
    layer_tables = (
        ("d_numLayers", numLayers),
        ("d_densityInLayer", densityInLayer),
        ("d_distanceInLayer", distanceInLayer),
    )
    for attr_name, host_array in layer_tables:
        setattr(self, attr_name, cuda.mem_alloc(host_array.nbytes))
    for attr_name, host_array in layer_tables:
        cuda.memcpy_htod(getattr(self, attr_name), host_array)

    bin_centres = (
        ("d_ecen_fine", self.ecen_fine),
        ("d_czcen_fine", self.czcen_fine),
    )
    for attr_name, host_array in bin_centres:
        setattr(self, attr_name, cuda.mem_alloc(host_array.nbytes))
    for attr_name, host_array in bin_centres:
        cuda.memcpy_htod(getattr(self, attr_name), host_array)
示例11: poisson_parallel
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    """Run the Poisson blending kernel ``n`` times and return the result image.

    Args:
        source_im: Source image array, uploaded as-is.
        dest_im: Destination image array; its shape sets WIDTH/HEIGHT.
        b_size: Block dimensions passed to the kernel launch.
        g_size: Grid dimensions passed to the kernel launch.
        RGB: Value substituted for ``RGB`` in the kernel template.
        neighbors: Value substituted for ``NEIGHBORS`` in the kernel template.
        interior_buffer: Array uploaded as the kernel's third argument.
        n: Number of kernel launches.

    Returns:
        uint8 array filled from the destination device buffer.
    """
    # Fill in the Cheetah template for the Poisson kernel.
    template = Template(poisson_blending_source)
    template_values = (
        ("BLOCK_DIM_X", b_size[0]),
        ("BLOCK_DIM_Y", b_size[1]),
        ("WIDTH", dest_im.shape[1]),
        ("HEIGHT", dest_im.shape[0]),
        ("RGB", RGB),
        ("NEIGHBORS", neighbors),
    )
    for field, value in template_values:
        setattr(template, field, value)

    # Compile the CUDA kernel.
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # Host output buffer plus the three device buffers.
    # NOTE(review): the download below fills this uint8 array from a buffer
    # sized by dest_im.nbytes -- presumably dest_im is uint8 too; confirm.
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    for device_buf, host_array in ((d_source, source_im),
                                   (d_destination, dest_im),
                                   (d_buffer, interior_buffer)):
        cu.memcpy_htod(device_buf, host_array)

    # Launch the kernel n times in a row.
    launches_left = n
    while launches_left > 0:
        poisson_blending_kernel(d_source, d_destination, d_buffer,
                                block=b_size, grid=g_size)
        launches_left -= 1

    # Fetch the final image from the device.
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
示例12: __init__
def __init__(self, init_data, n_generators):
    """Create a context, compile the kernels and upload the working buffers.

    Args:
        init_data: Array uploaded to ``self.gpu_init_data``; its first
            dimension is stored as ``self.width_mat``.
        n_generators: Number of cuRAND XORWOW states to allocate and
            initialise (seed 1, offset 0).
    """

    def _report_free_memory():
        # Print how much global device memory is currently free.
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

    self.ctx = curr_gpu.make_context()
    self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
    _report_free_memory()

    # ---- device copies of the input, the output and the per-row sums ----
    self.width_mat = np.int32(init_data.shape[0])
    self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
    cuda.memcpy_htod(self.gpu_init_data, init_data)

    self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
    print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
    _report_free_memory()
    self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
    cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)

    self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
    self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
    cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)

    # ---- kernel handles, stored under the kernel's own name ----
    for kernel_name in (
        "init_rng",
        "gen_rand_mat",
        "sum_along_axis",
        "norm_along_axis",
        "init_vect_sum",
        "copy_mat",
    ):
        setattr(self, kernel_name, self.module.get_function(kernel_name))

    # ---- random number generator states ----
    self.n_generators = n_generators
    seed = 1
    state_size = characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
    self.rng_states = cuda.mem_alloc(n_generators * state_size)
    self.init_rng(
        np.int32(n_generators),
        self.rng_states,
        np.uint64(seed),
        np.uint64(0),
        block=(64, 1, 1),
        grid=(n_generators // 64 + 1, 1),
    )
    cuda.mem_get_info()  # queried as before; the values are not used

    # ---- default 32 x 32 launch configuration covering width_mat ----
    size_block_x = size_block_y = 32
    width = int(self.width_mat)
    self.grid = (width // size_block_x + 1, width // size_block_y + 1, 1)
    self.block = (size_block_x, size_block_y, 1)
示例13: confirmInitialization
def confirmInitialization(featuresForSOM, somMatrix):
    """Plot how often each SOM node is the best-matching unit for the samples.

    For up to ``somMatrixRows * somMatrixColumns`` rows of ``featuresForSOM``
    (fewer if the frame is shorter) the ``computeBMU`` kernel is run, the
    winning node's hit count is incremented and the node is coloured by that
    count: green for 1, blue up to 10, red up to 100, white above; nodes
    that are never hit stay black.

    Args:
        featuresForSOM: pandas DataFrame; rows are looked up with
            ``.loc[i]`` for ``i = 0, 1, ...``.
        somMatrix: NumPy array uploaded to the device as the SOM.

    Returns:
        None.  The hit map is shown with matplotlib.
    """
    # Allocate memory for the SOM on the device.
    somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes)
    somBytesPerRow = np.int32(somMatrix.strides[0])
    somNumberOfRows = np.int32(somMatrix.shape[0])
    somNumberOfColumns = np.int32(somMatrix.shape[1])
    pycuda.memcpy_htod(somMatrixPtr, somMatrix)

    # Per-row best distance and the column index where it was found.
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr, bmu)
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr, bmuIndex)

    nodeHitMatrix = np.zeros((somMatrixRows, somMatrixColumns, 3), dtype=np.float32)
    hitCountDict = defaultdict(list)

    sampleCount = min(somMatrixRows * somMatrixColumns, len(featuresForSOM))
    featureCount = np.int32(len(featuresForSOM.columns))

    # One device buffer is reused for every sample instead of allocating a
    # new one per iteration; it is only reallocated if the size changes.
    featuresPtr = None
    featuresBytes = -1
    for i in range(sampleCount):
        # ``.values`` replaces ``.as_matrix()``, which pandas 1.0 removed.
        feats = featuresForSOM.loc[i].values.astype(np.float32)
        if featuresPtr is None or feats.nbytes != featuresBytes:
            featuresPtr = pycuda.mem_alloc(feats.nbytes)
            featuresBytes = feats.nbytes
        pycuda.memcpy_htod(featuresPtr, feats)

        # Find the BMU.
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr,
                   featureCount, somBytesPerRow, somNumberOfRows,
                   somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),
                   np.int32(metric), block=(blk, 1, 1),
                   grid=(somNumberOfRows, 1))
        pycuda.memcpy_dtoh(bmu, bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex, bmuIndexPtr)
        block = np.argmin(bmu)
        thread = bmuIndex[block]

        # Each entry is [hit count, index of the first sample that hit it].
        node = (block, thread)
        if hitCountDict[node]:
            hitCountDict[node][0] += 1
        else:
            hitCountDict[node] = [1, i]
        hits = hitCountDict[node][0]

        # RGB colour by hit count (same mapping as the figure title).
        if hits == 1:
            color = (0.0, 1.0, 0.0)
        elif hits <= 10:
            color = (0.0, 0.0, 1.0)
        elif hits <= 100:
            color = (1.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        nodeHitMatrix[block][thread] = color

    fig20 = plt.figure(20, figsize=(6*3.13, 4*3.13))
    fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20)
    plt.subplot(111)
    plt.imshow(nodeHitMatrix, interpolation="none")
    plt.show()
    plt.pause(0.1)
示例14: computeAvgDistancetoBMU
def computeAvgDistancetoBMU(currentIter,iterationDistance, features, nodeHitMatrix, somMatrixPtr, somMatrix, featureStatsMatrix, featuresPtr, featureCount, somBytesPerRow, somNumberOfRows, somNumberOfColumns):
    """Return the mean best-matching-unit distance over a sample of the features.

    Resets ``nodeHitMatrix`` and the module-level ``hitCountDict``, runs the
    ``computeBMU`` kernel for the first ``sampSize`` rows of the module-level
    ``featuresForSOM``, recolours the hit matrix by hit count (green for 1,
    blue up to 10, red up to 100, white above) and appends the average
    distance and ``currentIter`` to ``iterationDistance`` and the
    module-level ``iterationCount``.

    Args:
        currentIter: Current iteration; also caps the sample size once at
            least 100 feature rows exist.
        iterationDistance: List that receives the average distance.
        features: Not used by this function.
        nodeHitMatrix: Array indexed ``[row][column]`` holding RGB triples;
            cleared and refilled in place.
        somMatrixPtr: Device pointer to the SOM, passed to the kernel.
        somMatrix: Not used by this function.
        featureStatsMatrix: Not used by this function.
        featuresPtr: Not used; a local device buffer holds each sample.
        featureCount: Number of features per sample, passed to the kernel.
        somBytesPerRow: Passed to the kernel.
        somNumberOfRows: Passed to the kernel; also the launch grid width.
        somNumberOfColumns: Passed to the kernel.

    Returns:
        The average of the per-sample minimum distances, or 0.0 when no
        samples were selected.
    """
    cumDistance = 0.0
    nodeHitMatrix.fill(0)
    hitCountDict.clear()

    # Use every row for small data sets or late iterations; otherwise grow
    # the sample with the iteration number (never starting from zero).
    totalSamples = len(featuresForSOM)
    if totalSamples < 100:
        sampSize = totalSamples
    elif currentIter < totalSamples:
        sampSize = int(currentIter)
        if sampSize == 0:
            sampSize = min(somNumberOfRows*somNumberOfColumns, totalSamples)
    else:
        sampSize = totalSamples

    # Per-row best distance and the column index where it was found.
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr, bmu)
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr, bmuIndex)

    # One device buffer is reused for every sample instead of allocating a
    # new one per iteration; it is only reallocated if the size changes.
    sampleBuffer = None
    sampleBytes = -1
    for i in range(sampSize):
        # ``.values`` replaces ``.as_matrix()``, which pandas 1.0 removed.
        feats = featuresForSOM.loc[i].values.astype(np.float32)
        if sampleBuffer is None or feats.nbytes != sampleBytes:
            sampleBuffer = pycuda.mem_alloc(feats.nbytes)
            sampleBytes = feats.nbytes
        pycuda.memcpy_htod(sampleBuffer, feats)

        # Find the BMU.
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, sampleBuffer,
                   np.int32(featureCount), somBytesPerRow, somNumberOfRows,
                   somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),
                   np.int32(metric), block=(blk, 1, 1),
                   grid=(somNumberOfRows, 1))
        pycuda.memcpy_dtoh(bmu, bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex, bmuIndexPtr)
        cumDistance += np.min(bmu)
        block = np.argmin(bmu)
        thread = bmuIndex[block]

        # Each entry is [hit count, index of the first sample that hit it].
        node = (block, thread)
        if hitCountDict[node]:
            hitCountDict[node][0] += 1
        else:
            hitCountDict[node] = [1, i]
        hits = hitCountDict[node][0]

        # RGB colour by hit count.
        if hits == 1:
            color = (0.0, 1.0, 0.0)
        elif hits <= 10:
            color = (0.0, 0.0, 1.0)
        elif hits <= 100:
            color = (1.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        nodeHitMatrix[block][thread] = color

    # Guard the empty case, which used to raise ZeroDivisionError.
    avgDistance = cumDistance / sampSize if sampSize else 0.0
    iterationDistance.append(avgDistance)
    iterationCount.append(currentIter)
    return avgDistance
示例15: set_refsmiles
def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
    """Load the reference SMILES set onto the GPU.

    *refsmilesmat* is the Lingo matrix, *refcountsmat* the count matrix and
    *reflengths* the length vector of the reference set.  When *refmags* is
    given it is uploaded as the magnitude vector; otherwise the magnitudes
    are computed on the GPU from the count matrix.

    Because of hardware limitations, the reference matrices (*refsmilesmat*
    and *refcountsmat*) must have no more than 32,768 rows (molecules) and
    65,536 columns (Lingos).  Larger computations must be performed in tiles.

    Raises:
        ValueError: On the manual (non-CUDAArray) path, if the padded Lingo
            matrix exceeds those limits.
    """
    # Lingo and count matrices on the device, bound to textures #{{{
    if not self.usePycudaArray:
        # Manual path: pad, validate, upload asynchronously, bind as 2-D.
        padded_lingos = self._padded_array(refsmilesmat)
        n_rows, n_cols = padded_lingos.shape[0], padded_lingos.shape[1]
        if n_cols > 65536 or n_rows > 32768:
            raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%padded_lingos.shape)
        self.gpu.rsmiles = cuda.mem_alloc(padded_lingos.nbytes)
        cuda.memcpy_htod_async(self.gpu.rsmiles, padded_lingos, stream=self.gpu.stream)

        padded_counts = self._padded_array(refcountsmat)
        self.gpu.rcounts = cuda.mem_alloc(padded_counts.nbytes)
        cuda.memcpy_htod_async(self.gpu.rcounts, padded_counts, stream=self.gpu.stream)

        # A single descriptor, sized from the padded count matrix, is used
        # for both texture bindings.
        descriptor = cuda.ArrayDescriptor()
        descriptor.width = padded_counts.shape[1]
        descriptor.height = padded_counts.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles, descriptor, padded_lingos.strides[0])
        self.gpu.tex2cr.set_address_2d(self.gpu.rcounts, descriptor, padded_counts.strides[0])

        # Wait for the async uploads before dropping the host copies.
        self.gpu.stream.synchronize()
        del padded_lingos
        del padded_counts
    else:
        # PyCUDA CUDAArray path
        self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat, order='C')
        self.gpu.rcounts = cuda.matrix_to_array(refcountsmat, order='C')
        self.gpu.tex2lr.set_array(self.gpu.rsmiles)
        self.gpu.tex2cr.set_array(self.gpu.rcounts)
    #}}}

    self.rlengths = reflengths
    self.rshape = refsmilesmat.shape
    self.nref = refsmilesmat.shape[0]

    # Reference lengths go to the GPU as-is.
    self.gpu.rl_gpu = cuda.to_device(reflengths)

    # Magnitude buffer, sized like the length vector.
    self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
    if refmags is None:
        # No magnitudes supplied: compute them on the GPU.
        magthreads = 256
        self.gpu.refMagKernel(
            self.gpu.rmag_gpu,
            self.gpu.rl_gpu,
            numpy.int32(self.nref),
            block=(magthreads, 1, 1),
            grid=(30, 1),
            shared=magthreads*4,
            texrefs=[self.gpu.tex2cr],
        )
    else:
        cuda.memcpy_htod(self.gpu.rmag_gpu, refmags)
    return