

Python driver.mem_alloc Function Code Examples

This article collects typical usage examples of the pycuda.driver.mem_alloc function in Python. If you are wondering what mem_alloc does, how it is called in practice, or what real-world uses of it look like, the hand-picked code examples below should help.


The following shows 15 code examples of the mem_alloc function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
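All of the examples assume the usual PyCUDA setup (import pycuda.autoinit to create a context, import pycuda.driver as cuda, and import numpy as np). Before the collected examples, here is a minimal sketch, not taken from any of the projects below, of the basic mem_alloc / memcpy round trip:

import numpy as np
import pycuda.autoinit          # creates a CUDA context on the default device
import pycuda.driver as cuda

a = np.random.randn(4, 4).astype(np.float32)
a_gpu = cuda.mem_alloc(a.nbytes)   # raw device allocation of a.nbytes bytes
cuda.memcpy_htod(a_gpu, a)         # host -> device copy
result = np.empty_like(a)
cuda.memcpy_dtoh(result, a_gpu)    # device -> host copy
a_gpu.free()                       # release the allocation explicitly
assert np.allclose(a, result)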

Example 1: diffuse_pycuda

def diffuse_pycuda(u):
    # grid dimensions and physical parameters
    nx, ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5 / (nx - 1))
    dy = np.float32(3.5 / (ny - 1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time / dt))

    # boundary conditions
    u[0, :] = 200
    u[:, 0] = 200

    u = u.astype(np.float32)
    u_prev = u.copy()

    # allocate device buffers and copy the host arrays over
    u_d = cuda.mem_alloc(u.size * u.dtype.itemsize)
    u_prev_d = cuda.mem_alloc(u_prev.size * u_prev.dtype.itemsize)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # note: the launch grid assumes a square domain (nx == ny)
    gridSize = (int(np.ceil(nx / BLOCKSIZE)), int(np.ceil(nx / BLOCKSIZE)), 1)
    blockSize = (BLOCKSIZE, BLOCKSIZE, 1)

    for t in range(nt + 1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)

    # copy the result back to the host
    cuda.memcpy_dtoh(u, u_d)

    return u
Developer: htapia, Project: lania.pd, Lines: 34, Source file: diffuse.py
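A hypothetical call to the function above might look like the following sketch; it assumes the copy_array and update kernels from diffuse.py have already been compiled (for example with pycuda.compiler.SourceModule), which is not shown in the excerpt:

import numpy as np
u0 = np.zeros((64, 64), dtype=np.float32)   # square initial temperature field
u_final = diffuse_pycuda(u0)                # boundaries are set to 200 inside the function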

Example 2: get_spharms_l_eq_2

def get_spharms_l_eq_2(theta, phi, selected_Modes_gpu, rslt_gpu):
    # build the sorted list of m values for the selected (l, m) modes
    modelist = np.array(sorted([mode[1] for mode in selected_Modes_gpu])).astype(np.int32)

    modelist_gpu = cuda.mem_alloc(modelist.nbytes)

    # per-launch sizes are passed as length-1 int32 arrays
    nsamps = np.array(len(theta), ndmin=1).astype(np.int32)
    nmodeslen = np.array(len(modelist), ndmin=1).astype(np.int32)
    nsamps_gpu = cuda.mem_alloc(nsamps.nbytes)
    nmodes_gpu = cuda.mem_alloc(nmodeslen.nbytes)

    cuda.memcpy_htod(nsamps_gpu, nsamps)
    cuda.memcpy_htod(nmodes_gpu, nmodeslen)
    cuda.memcpy_htod(modelist_gpu, modelist)

    # Get the compiled CUDA function and launch it; the kernel writes into rslt_gpu
    sph = mod.get_function("compute_sph_harmonics_l_eq_2")
    blk = (1024, 1, 1)
    grd = (1, 1, 1)
    sph(theta, phi, modelist_gpu, nmodes_gpu, nsamps_gpu, rslt_gpu, block=blk, grid=grd)

    return
Developer: brandonbm00, Project: rapidpe_gpu, Lines: 30, Source file: sph_harmonics_cu.py

Example 3: main

def main():
    (h, w), d = (826, 1169), 3  # image dimensions and channel count
    if LINEAR:
        thread_x, thread_y, thread_z = 128, 1, 1
        block_x, block_y = (w * h * d) // thread_x, 1
        if (w * h * d) % thread_x:
            block_x += 1
    else:
        thread_x, thread_y, thread_z = 16, 8, d
        block_x, block_y = h // thread_x, w // thread_y
        if h % thread_x:
            block_x += 1
        if w % thread_y:
            block_y += 1

    # 826 * 1169 * 3 = 2896782 pixel components, 4 bytes each
    image_data_size = 2896782 * 4
    a_gpu = cuda.mem_alloc(image_data_size)
    b_gpu = cuda.mem_alloc(image_data_size)
    c_gpu = cuda.mem_alloc(image_data_size)

    image_path_pairs = []
    for i in range(50):
        page_num = i + 1
        path1, path2 = 'form1.%d.png' % page_num, 'form2.%d.png' % page_num
        image_path_pairs.append((path1, path2))

    do_work(image_path_pairs, a_gpu, b_gpu, c_gpu, (thread_x, thread_y, thread_z), (block_x, block_y))
Developer: B-Rich, Project: python_scripts, Lines: 28, Source file: accelerated.py

Example 4: __compute_sub_gaussian_gpu

    def __compute_sub_gaussian_gpu(self, sub_partitions):
        if sub_partitions < 1:
            raise Exception("You can't have less than 1 partition")
        elif sub_partitions > self.pts.shape[0]:
            raise Exception("sub partitions need to be smaller than pts size")
        # Points per partition
        d_part = self.pts.shape[0] // sub_partitions

        # Allocate device memory for one partition and normalize the points to the axis
        alloc_size = self.pts.shape[0] // sub_partitions * 2 * self.pts.itemsize
        self.pts_gpu = cuda.mem_alloc(alloc_size)
        self.pts[:, 0] = (self.pts[:, 0] - self.axis[0]) / (self.axis[1] - self.axis[0])
        self.pts[:, 1] = (self.pts[:, 1] - self.axis[2]) / (self.axis[3] - self.axis[2])

        for partition in range(sub_partitions):
            sub_pts = self.pts[partition * d_part:(partition + 1) * d_part, :]
            self.__compute_guassian_on_pts(sub_pts)
        self.pts_gpu.free()

        # Handle any remaining points that did not fit evenly into the partitions
        if self.pts.shape[0] % sub_partitions:
            alloc_size = (self.pts.shape[0] % sub_partitions) * (2 * self.pts.itemsize)
            self.pts_gpu = cuda.mem_alloc(alloc_size)
            self.__compute_guassian_on_pts(self.pts[sub_partitions * d_part:, :])
            self.pts_gpu.free()
Developer: SCIInstitute, Project: MLM, Lines: 25, Source file: gaussian_gpu.py

Example 5: calc_psd

    def calc_psd(self, bitloads, xtalk):
        # Number of expected permutations
        Ncombinations = self.K

        # Check if this is getting hairy and assign grid/block dimensions
        (warpcount, warpperblock, threadCount, blockCount) = self._workload_calc(Ncombinations)

        # How many individual lk's
        memdim = blockCount * threadCount

        threadshare_grid = (blockCount, 1)
        threadshare_block = (threadCount, 1, 1)

        # Device memory (sizing by Ncombinations is enough because calc_psd checks against it)
        d_a = cuda.mem_alloc(np.zeros((Ncombinations * self.N * self.N)).astype(self.type).nbytes)
        d_p = cuda.mem_alloc(np.zeros((Ncombinations * self.N)).astype(self.type).nbytes)
        d_bitload = cuda.mem_alloc(np.zeros((self.K * self.N)).astype(np.int32).nbytes)
        d_XTG = cuda.mem_alloc(np.zeros((self.K * self.N * self.N)).astype(self.type).nbytes)
        h_p = np.zeros((self.K, self.N)).astype(self.type)
        cuda.memcpy_htod(d_bitload, util.mat2arr(bitloads).astype(np.int32))
        cuda.memcpy_htod(d_XTG, xtalk.astype(self.type))

        # Go solve; kernel signature:
        # __global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N)
        self.k_calcpsd(d_a, d_p, d_XTG, d_bitload, np.int32(Ncombinations), block=threadshare_block, grid=threadshare_grid)
        cuda.Context.synchronize()
        cuda.memcpy_dtoh(h_p, d_p)
        d_a.free()
        d_bitload.free()
        d_XTG.free()
        d_p.free()
        return h_p.astype(np.float64)
Developer: andrewbolster, Project: multiuserDSM, Lines: 32, Source file: gpu.py

Example 6: cuda_crossOver

def cuda_crossOver(sola, solb):
    """Perform a crossover of two candidate solutions on the GPU."""

    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)

    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)

    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        print("a:", sola)
        print("b:", solb)
        print("new a:", a_new)
        print("new b:", b_new)

    return a_new, b_new
Developer: adamuas, Project: coevondm, Lines: 27, Source file: cudaInterface.py

Example 7: alloc

    def alloc(self, dim, stream=None):
        """
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
        """
        nbins = dim.ah * dim.astride
        if self.nbins >= nbins:
            return
        if self.nbins is not None:
            self.free()
        try:
            self.d_front = cuda.mem_alloc(16 * nbins)
            self.d_back = cuda.mem_alloc(16 * nbins)
            self.d_side = cuda.mem_alloc(16 * nbins)
            self.nbins = nbins
        except cuda.MemoryError as e:
            # If a frame that's too large sneaks by the task distributor, we
            # don't want to kill the server, but we also don't want to leave
            # it stuck without any free memory to complete the next alloc.
            # TODO: measure free mem and only take tasks that fit (but that
            # should be done elsewhere)
            self.free(stream)
            raise e
Developer: vincentmele, Project: cuburn, Lines: 26, Source file: render.py

Example 8: calc_blob_blob_forces_pycuda

def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
   
  # Determine number of threads and blocks for the GPU
  number_of_blobs = np.int32(len(r_vectors))
  threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

  # Get parameters from arguments
  L = kwargs.get('periodic_length')
  eps = kwargs.get('repulsion_strength')
  b = kwargs.get('debye_length')
  blob_radius = kwargs.get('blob_radius')

  # Reshape arrays
  x = np.reshape(r_vectors, number_of_blobs * 3)
  f = np.empty_like(x)
        
  # Allocate GPU memory
  x_gpu = cuda.mem_alloc(x.nbytes)
  f_gpu = cuda.mem_alloc(f.nbytes)
    
  # Copy data to the GPU (host to device)
  cuda.memcpy_htod(x_gpu, x)
    
  # Get blob-blob force function
  force = mod.get_function("calc_blob_blob_force")

  # Compute blob-blob forces
  force(x_gpu, f_gpu, np.float64(eps), np.float64(b), np.float64(blob_radius), np.float64(L[0]), np.float64(L[1]), np.float64(L[2]), number_of_blobs, block=(threads_per_block, 1, 1), grid=(num_blocks, 1)) 
   
  # Copy data from GPU to CPU (device to host)
  cuda.memcpy_dtoh(f, f_gpu)

  return np.reshape(f, (number_of_blobs, 3))
Developer: stochasticHydroTools, Project: RigidMultiblobsWall, Lines: 33, Source file: forces_pycuda_user_defined.py

Example 9: __init__

    def __init__(self, max_size, offsets=None):
        """
        Create a sorter. The sorter will hold on to internal resources for as
        long as it is alive, including an 'offsets' array of size 4*max_size.
        To share this cost, you may pass in an array of at least this size to
        __init__ (to, for instance, share across different bit-widths in a
        multi-pass sort).
        """
        self.init_mod()
        self.max_size = max_size
        assert max_size % self.group_size == 0
        max_grids = max_size // self.group_size

        if offsets is None:
            self.doffsets = cuda.mem_alloc(self.max_size * 4)
        else:
            self.doffsets = offsets
        self.dpfxs = cuda.mem_alloc(max_grids * self.radix_size * 4)
        self.dlocals = cuda.mem_alloc(max_grids * self.radix_size * 4)

        # There are probably better ways to choose how many condensation
        # groups to launch. TODO: maybe pick one if I care
        self.ncond = 32
        self.dcond = cuda.mem_alloc(self.radix_size * self.ncond * 4)
        self.dglobal = cuda.mem_alloc(self.radix_size * 4)
Developer: gijzelaerr, Project: cuburn, Lines: 25, Source file: sort.py

Example 10: prepare_device_arrays

    def prepare_device_arrays(self):

        self.maxLayers  = self.grid_prop.GetMaxLayers()
        nczbins_fine    = len(self.czcen_fine)
        numLayers       = np.zeros(nczbins_fine,dtype=np.int32)
        densityInLayer  = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE)
        distanceInLayer = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE)

        self.grid_prop.GetNumberOfLayers(numLayers)
        self.grid_prop.GetDensityInLayer(densityInLayer)
        self.grid_prop.GetDistanceInLayer(distanceInLayer)

        # Copy all these earth info arrays to device:
        self.d_numLayers       = cuda.mem_alloc(numLayers.nbytes)
        self.d_densityInLayer  = cuda.mem_alloc(densityInLayer.nbytes)
        self.d_distanceInLayer = cuda.mem_alloc(distanceInLayer.nbytes)
        cuda.memcpy_htod(self.d_numLayers,numLayers)
        cuda.memcpy_htod(self.d_densityInLayer,densityInLayer)
        cuda.memcpy_htod(self.d_distanceInLayer,distanceInLayer)

        self.d_ecen_fine = cuda.mem_alloc(self.ecen_fine.nbytes)
        self.d_czcen_fine = cuda.mem_alloc(self.czcen_fine.nbytes)
        cuda.memcpy_htod(self.d_ecen_fine,self.ecen_fine)
        cuda.memcpy_htod(self.d_czcen_fine,self.czcen_fine)

        return
Developer: gkrueckl, Project: pisa, Lines: 26, Source file: Prob3GPUOscillationService.py

Example 11: poisson_parallel

def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    # create Cheetah template and fill in variables for the Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # allocate memory on the GPU and copy the inputs over
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source, d_destination, d_buffer = cu.mem_alloc(source_im.nbytes), cu.mem_alloc(dest_im.nbytes), cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # run the Poisson blending kernel n times
    for i in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

    # retrieve the final output image and return it
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
Developer: JMTing, Project: cs205, Lines: 27, Source file: parallel_poisson.py

Example 12: __init__

    def __init__(self, init_data, n_generators):

        self.ctx = curr_gpu.make_context()
        self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        ################################################################################################################

        self.width_mat = np.int32(init_data.shape[0])
        #        self.gpu_init_data = ga.to_gpu(init_data)
        self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
        cuda.memcpy_htod(self.gpu_init_data, init_data)

        self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
        print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
        cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)
        #        self.gpu_new_data = ga.to_gpu(self.cpu_new_data)

        self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
        self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
        cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)
        #        self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum)
        ################################################################################################################
        self.init_rng = self.module.get_function("init_rng")
        self.gen_rand_mat = self.module.get_function("gen_rand_mat")
        self.sum_along_axis = self.module.get_function("sum_along_axis")
        self.norm_along_axis = self.module.get_function("norm_along_axis")
        self.init_vect_sum = self.module.get_function("init_vect_sum")
        self.copy_mat = self.module.get_function("copy_mat")
        ################################################################################################################
        self.n_generators = n_generators
        seed = 1
        self.rng_states = cuda.mem_alloc(
            n_generators
            * characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
        )
        self.init_rng(
            np.int32(n_generators),
            self.rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(n_generators // 64 + 1, 1),
        )
        (free, total) = cuda.mem_get_info()

        size_block_x = 32
        size_block_y = 32
        n_blocks_x = int(self.width_mat) // (size_block_x) + 1
        n_blocks_y = int(self.width_mat) // (size_block_y) + 1
        self.grid = (n_blocks_x, n_blocks_y, 1)
        self.block = (size_block_x, size_block_y, 1)
Developer: koszullab, Project: centroID, Lines: 59, Source file: cuda_lib.py

Example 13: confirmInitialization

def confirmInitialization(featuresForSOM,somMatrix):
    #allocate memory for the somcuda on the device
    somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes)
    somBytesPerRow = np.int32(somMatrix.strides[0])
    somNumberOfRows = np.int32(somMatrix.shape[0])
    somNumberOfColumns = np.int32(somMatrix.shape[1])
    pycuda.memcpy_htod(somMatrixPtr,somMatrix)
    #allocate space for the BMU distances
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr,bmu)
    #allocate space for the BMU indices
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr,bmuIndex)
    intraDayOffset = features.columns.get_loc('Ret_121')
    dayOffset = features.columns.get_loc('Ret_PlusOne')
    objVal = 0.0
    objSampSize = 0.0
    r = [[[0.0 for k in range(0,3)] for i in range(somMatrixColumns)] for j in range (somMatrixRows)] 
    nodeHitMatrix = np.array(r).astype(np.float32)
    hitCountDict = defaultdict(list)
    samples = [x for x in range (0, somMatrixRows*somMatrixColumns)]
    if len(samples) >= len(featuresForSOM):
        samples = [x for x in range (0, len(featuresForSOM))]       
    for i in samples:
        feats = featuresForSOM.loc[i].as_matrix().astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr,feats)
        #find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, np.int32(len(featuresForSOM.columns)),  somBytesPerRow, somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),np.int32(metric),block=(blk,1,1),grid=(somNumberOfRows,1))
        pycuda.memcpy_dtoh(bmu,bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex,bmuIndexPtr)
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        val = hitCountDict[(block,thread)]
        if val == None or len(val) == 0:
            hitCountDict[(block,thread)] = [1,i]
        else:
            hitCountDict[(block,thread)][0] += 1
        val = np.int32(hitCountDict[(block,thread)])[0]
        if val == 1:
            val = 0x0000ff00
        elif val <= 10:
            val = 0x000000ff
        elif val <= 100:
            val = 0x00ff0000
        else:
            val = 0x00ffffff
        bval = (val & 0x000000ff)
        gval = ((val & 0x0000ff00) >> 8)
        rval = ((val & 0x00ff0000) >> 16)
        nodeHitMatrix[block][thread] = [rval/255.0,gval/255.0,bval/255.0]
    fig20 = plt.figure(20,figsize=(6*3.13,4*3.13))
    fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20)
    ax = plt.subplot(111)
    somplot = plt.imshow(nodeHitMatrix,interpolation="none")
    plt.show()
    plt.pause(0.1)
Developer: kdkoadd, Project: Self-Organizing-Map, Lines: 58, Source file: sommap.py

Example 14: computeAvgDistancetoBMU

def computeAvgDistancetoBMU(currentIter,iterationDistance, features, nodeHitMatrix, somMatrixPtr, somMatrix, featureStatsMatrix, featuresPtr, featureCount, somBytesPerRow, somNumberOfRows, somNumberOfColumns):
    adjustNodes = {}
    sampSize = 0
    cumDistance = 0.0
    nodeHitMatrix.fill(0)
    hitCountDict.clear()
    if len(featuresForSOM) < 100:
        sampSize = len(featuresForSOM)
    elif currentIter < len(featuresForSOM):
        sampSize = int(currentIter)
        if sampSize == 0:
            sampSize = min(somNumberOfRows*somNumberOfColumns,len(featuresForSOM))
    else:
        sampSize = len(featuresForSOM)
    samples = [x for x in range (0,sampSize)]
    #allocate space for bmu
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr,bmu)
    #allocate space for bmu index
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr,bmuIndex)
    for i in samples:
        feats = featuresForSOM.loc[i].as_matrix().astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr,feats)
        #find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, np.int32(featureCount),  somBytesPerRow, somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),np.int32(metric),block=(blk,1,1),grid=(somNumberOfRows,1))
        pycuda.memcpy_dtoh(bmu,bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex,bmuIndexPtr)
        cumDistance += np.min(bmu)
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        adjustNodes[i]=[block,thread]
        val = hitCountDict[(block,thread)]
        if val == None or len(val) == 0:
            hitCountDict[(block,thread)] = [1,i]
        else:
            hitCountDict[(block,thread)][0] += 1
        val = np.int32(hitCountDict[(block,thread)])[0]
        if val == 1:
            val = 0x0000ff00
        elif val <= 10:
            val = 0x000000ff
        elif val <= 100:
            val = 0x00ff0000
        else:
            val = 0x00ffffff
        bval = (val & 0x000000ff)
        gval = ((val & 0x0000ff00) >> 8)
        rval = ((val & 0x00ff0000) >> 16)
        nodeHitMatrix[block][thread] = [rval/255.0,gval/255.0,bval/255.0]
    iterationDistance.append(cumDistance/sampSize)
    iterationCount.append(currentIter)
    return cumDistance/sampSize
Developer: kdkoadd, Project: Self-Organizing-Map, Lines: 56, Source file: sommap.py

Example 15: set_refsmiles

    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
        and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
        no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
        """

        # Set up lingo and count matrices on device #{{{
        if self.usePycudaArray:
            # Set up using PyCUDA CUDAArray support
            self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
            self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
            self.gpu.tex2lr.set_array(self.gpu.rsmiles)
            self.gpu.tex2cr.set_array(self.gpu.rcounts)
        else:
            # Manually handle setup
            temprlmat = self._padded_array(refsmilesmat)
            if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape)
            self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream)

            temprcmat = self._padded_array(refcountsmat)
            self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream)

            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = temprcmat.shape[1]
            descriptor.height = temprcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0])
            self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0])
            self.gpu.stream.synchronize()
            del temprlmat
            del temprcmat
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Copy reference lengths to GPU
        self.gpu.rl_gpu = cuda.to_device(reflengths)

        # Allocate buffer for reference set magnitudes
        self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is not None:
            cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
        else:
            # Calculate reference set magnitudes on the GPU
            magthreads = 256
            self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
        return
Developer: ihaque, Project: SIML, Lines: 56, Source file: GPULingo.py


注:本文中的pycuda.driver.mem_alloc函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。