当前位置: 首页>>代码示例>>Python>>正文


Python driver.memcpy_dtoh函数代码示例

本文整理汇总了Python中pycuda.driver.memcpy_dtoh函数的典型用法代码示例。如果您正苦于以下问题:Python memcpy_dtoh函数的具体用法?Python memcpy_dtoh怎么用?Python memcpy_dtoh使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了memcpy_dtoh函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: calcV1complex

    def calcV1complex(self, stim, speed):
        """Compute V1 complex cell responses of a frame.

        Runs the processing stages below in order and then copies the
        device buffer ``self.d_respV1c`` back to the host.

        Args:
            stim: Stimulus frame; forwarded unchanged to ``self._loadInput``.
            speed: Forwarded unchanged to ``self._calcV1direction``.

        Returns:
            A flat ``np.float32`` array with
            ``self.nrX * self.nrY * self.nrDirs`` elements.
        """
        # allocate stim on device
        self._loadInput(stim)

        # convolve the stimulus with separate V1 filters
        self._calcV1linear()

        # rectify linear response to get V1 simple cell firing rate
        self._calcV1rect()

        # spatial pooling to get V1 complex
        self._calcV1blur()

        # divisive normalization
        self._calcV1normalize()

        # steer filters in specified directions
        self._calcV1direction(speed)

        # get data from device; allocate float32 directly rather than
        # building a float64 array and converting it afterwards
        res = np.zeros(self.nrX * self.nrY * self.nrDirs, dtype=np.float32)
        cuda.memcpy_dtoh(res, self.d_respV1c)

        return res
开发者ID:UCI-CARL,项目名称:MotionEnergy,代码行数:26,代码来源:motionenergy.py

示例2: scenario_inplace_padded_C2R

def scenario_inplace_padded_C2R(batch,tic,toc):
  """Time a batched in-place C2R FFT on the GPU and compare it with irfft.

  Random complex64 input is uploaded, transformed in place (the same
  device pointer is passed as input and output), downloaded and compared
  against a CPU ``irfft`` of the same data.

  Args:
    batch: Number of transforms in the batch (passed to ``cufftPlanMany``).
    tic: Event recorded right before the transform; must provide
      ``record()`` and ``time_till()``.
    toc: Event recorded right after the transform; must provide
      ``record()`` and ``synchronize()``.

  Returns:
    None. The scenario name, the comparison outcome and the elapsed GPU
    time are printed.
  """
  # Transform length, and row strides padded to a multiple of 16 that is
  # strictly larger than BENG_CHANNELS. The output stride is twice the
  # input stride so the real output fits in the complex input's storage.
  n = array([2*BENG_CHANNELS_],int32)
  inembed = array([16*(BENG_CHANNELS//16+1)],int32)
  onembed = array([2*inembed[0]],int32)
  plan = cufft.cufftPlanMany(1, n.ctypes.data, inembed.ctypes.data, 1, inembed[0],
                             onembed.ctypes.data, 1, onembed[0],
                             cufft.CUFFT_C2R, batch)

  data_shape = (batch,inembed[0])
  cpu_data = standard_normal(data_shape) + 1j * standard_normal(data_shape)
  cpu_data = cpu_data.astype(complex64)
  gpu_data = cuda.mem_alloc(8*batch*inembed[0])  # 8 bytes per complex64
  cuda.memcpy_htod(gpu_data,cpu_data)

  tic.record()
  cufft.cufftExecC2R(plan,int(gpu_data),int(gpu_data))
  toc.record()
  toc.synchronize()

  cpu_result = np.empty(batch*onembed[0],dtype=np.float32)
  cuda.memcpy_dtoh(cpu_result,gpu_data)
  # Drop the padding columns and divide by the transform length before
  # comparing against the CPU reference.
  cpu_result = cpu_result.reshape((batch,onembed[0]))[:,:2*BENG_CHANNELS_]/(2*BENG_CHANNELS_)
  result = irfft(cpu_data[:,:BENG_CHANNELS],axis=-1)

  # Single-argument print(...) behaves the same on Python 2 and 3; joining
  # with ' ' reproduces the spacing of the old comma-separated statements.
  elapsed = tic.time_till(toc)
  print('Batched in-place scenario')
  print(' '.join(['test passed:', str(np.allclose(cpu_result,result))]))
  print(' '.join(['GPU time:', str(elapsed), ' ms =  ',
                  str(elapsed/(batch*0.5*13.128e-3)), ' x real (both SB)']))
开发者ID:sma-wideband,项目名称:sdbe,代码行数:27,代码来源:fft_test.py

示例3: runTest

    def runTest(self):
        """Check that an IncidentDirect source writes the expected field values.

        ``self.args`` is unpacked as
        ``(nx, ny, nz, str_f, pt0, pt1, is_array)``. The expected values
        are built on the host for time step 1 and compared with the
        device buffer of field ``str_f`` after one e/h update.

        Raises:
            AssertionError: If the device values differ from the host
                reference inside the region between ``pt0`` and ``pt1``.
        """
        nx, ny, nz, str_f, pt0, pt1, is_array = self.args
        slice_xyz = common.slices_two_points(pt0, pt1)

        # generate random source
        if is_array:
            shape = common.shape_two_points(pt0, pt1)
            value = np.random.rand(*shape).astype(np.float32)
        else:
            value = np.random.ranf()

        # instance
        fields = Fields(0, nx, ny, nz, '', 'single')
        try:
            tfunc = lambda tstep: np.sin(0.03*tstep)
            # Kept bound to a name so the object stays alive during the
            # updates below (presumably it registers itself with `fields`
            # -- TODO confirm).
            incident = IncidentDirect(fields, str_f, pt0, pt1, tfunc, value)

            # host allocations
            eh = np.zeros(fields.ns_pitch, dtype=fields.dtype)

            # verify
            eh[slice_xyz] = fields.dtype(value) * fields.dtype(tfunc(1))
            fields.update_e()
            fields.update_h()

            copy_eh_buf = fields.get_buf(str_f)
            copy_eh = np.zeros_like(eh)
            cuda.memcpy_dtoh(copy_eh, copy_eh_buf)

            original = eh[slice_xyz]
            copy = copy_eh[slice_xyz]
            norm = np.linalg.norm(original - copy)
            self.assertEqual(norm, 0, '%s, %g' % (self.args, norm))
        finally:
            # Always release the context, even when the assertion fails;
            # otherwise a failing case would skip this call.
            fields.context_pop()
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:35,代码来源:test_incident_direct.py

示例4: calculate

    def calculate (self, data, f_high, f_bins):
        """Run ``culsp_kernel`` over ``data`` on the GPU and return the result.

        Args:
            data: Array exposing ``nbytes`` (e.g. a numpy array); uploaded
                to the device, processed in place by the kernel and
                downloaded again.
            f_high: Currently unused by this method.
            f_bins: Currently unused by this method.

        Returns:
            A new array shaped and typed like ``data`` holding the device
            buffer contents after the kernel ran.
        """
        # Imported here so that merely importing this module does not
        # require PyCUDA; importing pycuda.autoinit is done for its side
        # effect.
        import pycuda.driver as driver
        import pycuda.compiler as compiler
        import pycuda.autoinit

        log = logging.getLogger("astroplpython.function.signal")
        log.debug("CULSP.calculate() called")

        # Lazy %-style argument: the (possibly large) array is only
        # converted to text when DEBUG logging is actually enabled.
        log.debug("Orig Data:%s", data)

        log.debug(" TODO: Calculate blocksize")

        log.debug("set up GPU, allocate memory for working")
        a_gpu = driver.mem_alloc(data.nbytes)

        log.debug("push data into GPU memory")
        driver.memcpy_htod(a_gpu, data)

        log.debug("compile and run the culsp_kernel on data in the GPU")
        culsp_func = compiler.SourceModule(self._kernelStr).get_function("culsp_kernel")
        # Block shape is still hard-coded; see the TODO logged above.
        culsp_func (a_gpu, block=(4,4,1))

        log.debug("pull data from GPU back into main memory")
        result = np.empty_like(data)
        driver.memcpy_dtoh(result, a_gpu)

        log.debug("return result")
        return result
开发者ID:brianthomas,项目名称:astroplpython,代码行数:29,代码来源:LSPeriodogram.py

示例5: poisson_parallel

def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    """Run the Poisson blending kernel ``n`` times and return the blended image.

    Args:
        source_im: Source image array; uploaded to the device.
        dest_im: Destination image array; uploaded to the device. Its
            ``shape[0]``/``shape[1]`` are given to the kernel template as
            HEIGHT/WIDTH.
        b_size: CUDA block shape; its first two entries are also given to
            the kernel template as BLOCK_DIM_X/BLOCK_DIM_Y.
        g_size: CUDA grid shape.
        RGB: Value substituted for ``RGB`` in the kernel template.
        neighbors: Value substituted for ``NEIGHBORS`` in the kernel template.
        interior_buffer: Array uploaded to the device as the kernel's
            third argument.
        n: Number of kernel launches.

    Returns:
        A ``np.uint8`` array shaped like ``dest_im``, filled from the
        device destination buffer after the last launch.
    """
    # create Cheetah template and fill in variables for Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # alloc memory in GPU
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # calls CUDA for Poisson Blending n # of times
    for _ in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

    # retrieves the final output image and returns
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
开发者ID:JMTing,项目名称:cs205,代码行数:27,代码来源:parallel_poisson.py

示例6: diffuse_pycuda

def diffuse_pycuda(u):
    """Run the GPU diffusion time-stepping on a 2-D array ``u``.

    Args:
        u: 2-D array. Its first row and first column are set to 200
            in place before a float32 working copy is made.

    Returns:
        A new ``np.float32`` array holding the device contents of the
        working buffer after all time steps.
    """
    nx, _ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5/(nx-1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time/dt))

    u[0,:]=200
    u[:,0]=200

    u = u.astype(np.float32)

    u_prev = u.copy()

    u_d = cuda.mem_alloc(u.nbytes)
    u_prev_d = cuda.mem_alloc(u_prev.nbytes)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # Integer ceil division. np.ceil(nx/BLOCKSIZE) silently floors under
    # Python 2 integer division, leaving the grid one block short whenever
    # nx is not a multiple of BLOCKSIZE.
    blocks_per_side = int((nx + BLOCKSIZE - 1) // BLOCKSIZE)
    # NOTE(review): both grid dimensions are derived from nx and the
    # kernels only receive nx, so a square domain appears to be assumed
    # -- confirm before using non-square input.
    gridSize = (blocks_per_side, blocks_per_side, 1)
    blockSize = (BLOCKSIZE,BLOCKSIZE,1)
    block_arg = np.int32(BLOCKSIZE)

    for _ in range(nt+1):
        copy_array(u_d, u_prev_d, nx, block_arg, block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, block_arg, block=blockSize, grid=gridSize)

    cuda.memcpy_dtoh(u, u_d)

    return u
开发者ID:htapia,项目名称:lania.pd,代码行数:34,代码来源:diffuse.py

示例7: fromGPU

 def fromGPU(self, shared_mem, buff_dtype=np.float32 ):
     """Copy ``self.gpu_data`` from the device into ``shared_mem``.

     The object returned by ``shared_mem.get_obj()`` is viewed as a flat
     array of ``buff_dtype``, truncated to
     ``buffer_nnets * buffer_nsamples`` elements and reshaped to
     ``(buffer_nnets, buffer_nsamples)`` before being filled from the
     device.

     Returns the reshaped array that was passed to the device copy.
     """
     flat_view = np.frombuffer(shared_mem.get_obj(), dtype=buff_dtype)
     n_rows = self.buffer_nnets
     n_cols = self.buffer_nsamples
     host_view = flat_view[:n_rows * n_cols].reshape((n_rows, n_cols))
     cuda.memcpy_dtoh(host_view, self.gpu_data)
     return host_view
开发者ID:JohnCEarls,项目名称:GPUDirac,代码行数:7,代码来源:data.py

示例8: test_constant_memory

    def test_constant_memory(self):
        """Round-trip data through ``__constant__`` memory and verify it.

        A 32-float host array is written to the module's ``const_array``
        symbol, copied by a kernel into a global buffer and read back;
        the result must equal the original host array.
        """
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function("copy_constant_into_global")
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0,255,(32,)).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(
                global_result_array,
                grid=(1, 1), block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        # .all must be *called*: the bare bound method is always truthy,
        # so the assertion could never fail.
        assert (host_result_array == host_array).all()
开发者ID:davidweichiang,项目名称:pycuda,代码行数:28,代码来源:test_driver.py

示例9: calc_bandwidth_d2h

	def calc_bandwidth_d2h( s ):
		"""Time one device-to-host copy of ``s.dev_a`` into ``s.a``.

		Returns ``s.nbytes`` divided by the elapsed wall-clock seconds and
		by the module-level ``gbytes`` factor.
		"""
		t1 = datetime.now()
		cuda.memcpy_dtoh( s.a, s.dev_a )
		dt = datetime.now() - t1
		# total_seconds() includes the days component and handles negative
		# deltas correctly, unlike seconds + microseconds*1e-6.
		dt_float = dt.total_seconds()

		return s.nbytes/dt_float/gbytes
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:7,代码来源:150-gpus-mpi-range-h5-seperate.py

示例10: fromSourceFile

def fromSourceFile():
    """Compile ``simple.cu``, run its ``doublify`` kernel on random data, print results.

    Prints the array read back from the device, followed by its
    element-wise ratio to twice the input array.
    """
    import numpy as np
    import pycuda.driver as cuda
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

    #random data
    np.random.seed(1)
    a = np.random.randn(4,4)
    a = a.astype(np.float32)

    #read code and get function; the context manager closes the file
    #instead of leaving the handle to the garbage collector
    with open('simple.cu') as source_file:
        kernel_source = source_file.read()
    mod = SourceModule(kernel_source)
    func = mod.get_function("doublify")

    #allocate memory on the GPU
    a_gpu = cuda.mem_alloc(a.nbytes)

    #transfer to the GPU memory
    cuda.memcpy_htod(a_gpu, a)

    #execute
    func(a_gpu, block=(4,4,1))

    #collect results
    a_doubled = np.empty_like(a)
    cuda.memcpy_dtoh(a_doubled, a_gpu)

    # single-argument print(...) works identically on Python 2 and 3
    print(a_doubled)
    print(a_doubled / (a*2))
开发者ID:eddienko,项目名称:EuclidVisibleInstrument,代码行数:30,代码来源:cudaTests.py

示例11: test_prepared_invocation

    def test_prepared_invocation(self):
        """Exercise ``prepare``/``prepared_call``, with and without a pointer offset.

        First launch: 4x4 threads double all 16 floats. Second launch: 15
        threads are started on the buffer advanced by one element, so flat
        elements 1..15 are doubled again while element 0 is left alone.
        """
        a = np.random.randn(4,4).astype(np.float32)
        a_gpu = drv.mem_alloc(a.nbytes)

        drv.memcpy_htod(a_gpu, a)

        mod = SourceModule("""
            __global__ void doublify(float *a)
            {
              int idx = threadIdx.x + threadIdx.y*blockDim.x;
              a[idx] *= 2;
            }
            """)

        func = mod.get_function("doublify")
        func.prepare("P")
        func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20)
        a_doubled = np.empty_like(a)
        drv.memcpy_dtoh(a_doubled, a_gpu)
        print (a)
        print (a_doubled)
        assert la.norm(a_doubled-2*a) == 0

        # now with offsets
        func.prepare("P")
        a_quadrupled = np.empty_like(a)
        func.prepared_call((1, 1), (15,1,1), int(a_gpu)+a.dtype.itemsize)
        drv.memcpy_dtoh(a_quadrupled, a_gpu)
        # Compare on the flattened arrays: slicing the 4x4 array with [1:]
        # drops the whole first row, leaving flat elements 1..3 unchecked.
        flat_in = a.ravel()
        flat_out = a_quadrupled.ravel()
        assert la.norm(flat_out[1:]-4*flat_in[1:]) == 0
        # the element before the offset must only have been doubled once
        assert flat_out[0] == 2*flat_in[0]
开发者ID:davidweichiang,项目名称:pycuda,代码行数:29,代码来源:test_driver.py

示例12: loop

def loop(iterations):
    """Advance the simulation by ``iterations`` time steps.

    Works on module-level state: the host arrays ``F`` and ``T``, the
    ``*_gpu`` device buffers, the launch dimensions and the compiled
    kernels ``prop``, ``density``, ``eq`` and ``bounceback``. ``F`` is
    refreshed from the device at the end of every step.

    Args:
        iterations: Number of steps; the body runs while the step counter
            is below this value.
    """
    # Launch configuration is the same for every kernel and every step.
    block_dims = (blockDimX,blockDimY,1)
    grid_dims = (gridDimX,gridDimY)

    ts = 0
    while ts < iterations:
        # To avoid overwrites a temporary copy is made of F
        T[:] = F
        cuda.memcpy_htod(T_gpu, T)

        # Propagate
        prop(F_gpu, T_gpu, block=block_dims, grid=grid_dims)

        # Calculate density and get bounceback from obstacle nodes
        density(F_gpu, BOUND_gpu, BOUNCEBACK_gpu, DENSITY_gpu, UX_gpu, UY_gpu,
                block=block_dims, grid=grid_dims)

        # Calculate equilibrium
        eq(F_gpu, FEQ_gpu, DENSITY_gpu, UX_gpu, UY_gpu, U_SQU_gpu, U_C2_gpu,
           U_C4_gpu, U_C6_gpu, U_C8_gpu, block=block_dims, grid=grid_dims)

        # Transfer bounceback to obstacle nodes
        bounceback(F_gpu, BOUNCEBACK_gpu, BOUND_gpu,
                   block=block_dims, grid=grid_dims)

        # Copy F to host for copy to T in beginning of loop
        cuda.memcpy_dtoh(F, F_gpu)

        ts += 1
开发者ID:hohiroki,项目名称:Lattice-Boltzmann,代码行数:28,代码来源:lbm2dcu.py

示例13: cuda_crossOver

def cuda_crossOver(sola, solb):
    """Run the ``crossOver`` kernel on two solutions and return the results.

    Args:
        sola: First solution array; ``len(sola)`` is used as the thread
            count of the single launched block.
        solb: Second solution array.

    Returns:
        Tuple ``(a_new, b_new)`` of new arrays, shaped and typed like the
        inputs, holding the device buffers after the kernel ran.
    """
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)

    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len,1,1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)

    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        # Print the actual inputs: the names `a` and `b` used here before
        # are not defined in this function. Joining with ' ' keeps the
        # output format of the former comma-separated print statements.
        print(' '.join(["a:", str(sola)]))
        print(' '.join(["b:", str(solb)]))
        print(' '.join(["new a:", str(a_new)]))
        print(' '.join(["new b:", str(b_new)]))

    return a_new, b_new
开发者ID:adamuas,项目名称:coevondm,代码行数:27,代码来源:cudaInterface.py

示例14: _debug_print

	def _debug_print( self ) :
		"""Copy ``self.df1`` from the device into ``self.f`` and print it.

		Note that this changes numpy's global print options and does not
		restore them afterwards.
		"""
		cuda_driver.memcpy_dtoh( self.f , self.df1 )

		# keyword form of the former positional arguments (3, 10000)
		np.set_printoptions( precision = 3 , threshold = 10000 , linewidth = 200 , suppress = True )

		# single-argument print(...) works identically on Python 2 and 3
		print( '#'*80 )
		print( self.f )
开发者ID:jkotur,项目名称:particles,代码行数:7,代码来源:lbm.py

示例15: convolution_cuda

def convolution_cuda(sourceImage,  filterx,  filtery):
    """Perform separable convolution on sourceImage using CUDA.

    Operates on floating point images with row-major storage.

    Args:
        sourceImage: 2-D ``float32`` image array.
        filterx: 1-D filter of shape ``(KERNEL_W,)``; uploaded to
            ``d_Kernel_rows``.
        filtery: 1-D filter of shape ``(KERNEL_W,)``; uploaded to
            ``d_Kernel_columns``.

    Returns:
        A new array shaped like ``sourceImage`` holding the output of the
        column pass, which runs on the output of the row pass.

    Raises:
        AssertionError: If ``sourceImage`` is not float32 or the filters
            do not have shape ``(KERNEL_W,)``.
    """
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32',  'source image must be float32'
    (imageHeight,  imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ) ,  'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    # NOTE(review): DATA_W is rounded up to a multiple of 16 while the
    # device buffers below are sized from sourceImage itself -- confirm the
    # kernels never touch the padded columns when imageWidth % 16 != 0.
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(d_Kernel_rows,  filterx) # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_columns,  filtery)
    # Call the kernels for convolution in each direction.
    # grid/block are passed as tuples of plain ints, the form PyCUDA
    # documents for kernel launches.
    blockGridRows = (int(iDivUp(DATA_W, ROW_TILE_W)), int(DATA_H))
    blockGridColumns = (int(iDivUp(DATA_W, COLUMN_TILE_W)), int(iDivUp(DATA_H, COLUMN_TILE_H)))
    threadBlockRows = (int(KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS), 1, 1)
    threadBlockColumns = (int(COLUMN_TILE_W), 8, 1)
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)
    convolutionRowGPU(intermediateImage_gpu,  sourceImage_gpu,  DATA_W,  DATA_H,
                      grid=blockGridRows,  block=threadBlockRows)
    convolutionColumnGPU(destImage_gpu,  intermediateImage_gpu,  DATA_W,  DATA_H,
                         numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),
                         numpy.int32(DATA_W * threadBlockColumns[1]),
                         grid=blockGridColumns,  block=threadBlockColumns)

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage,  destImage_gpu)
    return destImage
开发者ID:eddienko,项目名称:EuclidVisibleInstrument,代码行数:34,代码来源:Convolution.py


注:本文中的pycuda.driver.memcpy_dtoh函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。