当前位置: 首页>>代码示例>>Python>>正文


Python driver.to_device函数代码示例

本文整理汇总了Python中pycuda.driver.to_device函数的典型用法代码示例。如果您正苦于以下问题：Python to_device函数的具体用法？Python to_device怎么用？Python to_device使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了to_device函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_compare_order

def test_compare_order():
    '''
    compare_order between C(row-major), F(column-major)
    '''
    compare_order = mod_cu.get_function('compare_order')


    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx,ny), order='C')
    f_2d_F = f_1d.reshape((nx,ny), order='F')

    print ''
    print 'f_1d_C\n\n', f_1d
    print 'f_2d_C\n', f_2d_C
    print 'f_2d_F\n', f_2d_F

    print ''
    print 'after cuda'
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_C\n', ret_f_1d

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_F\n', ret_f_1d
开发者ID:wbkifun,项目名称:my_stuff,代码行数:31,代码来源:compare_order_C_F.py

示例2: multiply_csr

def multiply_csr(matrix, vector, block_size, repeat=1):
    '''
    Multiply a matrix by a vector on the GPU using the CSR CUDA kernel.

    Parameters
    ==========
    matrix : Scipy matrix or numpy array
        Matrix to multiply. It is converted to CSR form before upload.
    vector : numpy array
        Vector to multiply. Its length must equal the number of columns
        of the matrix.
    block_size : int (recommended 128 or 256)
        Number of threads per CUDA block.
    repeat : int > 0
        How many times the kernel is launched. It has no effect on the
        result; it only sets the length of the returned list of
        execution times.

    Returns
    =======
    Tuple of (result vector, list of execution times), one time per
    launch.

    Raises
    ======
    ArithmeticError
        If the vector length differs from the number of matrix columns.
    '''
    if len(vector) != matrix.shape[1]:
        # The two literals are concatenated, so the first one must end
        # with a space.
        raise ArithmeticError('Length of the vector is not equal to the '
                              'number of columns of the matrix.')
    matrix = mf.convert_to_scipy_csr(matrix)
    num_rows = matrix.shape[0]

    # Upload the three CSR arrays with the fixed dtypes used here.
    data = cuda.to_device(numpy.array(matrix.data, dtype=numpy.float32))
    indices = cuda.to_device(numpy.array(matrix.indices, dtype=numpy.int32))
    indptr = cuda.to_device(numpy.array(matrix.indptr, dtype=numpy.int32))

    result = numpy.zeros(num_rows, dtype=numpy.float32)
    time_list = []

    # Ceil-division: enough blocks to give every row a thread.
    grid_size = (num_rows + block_size - 1) // block_size
    block = (block_size, 1, 1)
    grid = (grid_size, 1)

    # The vector is read by the kernel through a texture reference.
    # NOTE(review): the vector is uploaded with whatever dtype the caller
    # supplied -- confirm it matches the texture's element type.
    g_vector = cuda.to_device(vector)
    kernel, texref = cudacodes.get_cuda_csr(block_size=block_size)
    texref.set_address(g_vector, vector.nbytes)
    tex = [texref]

    num_rows = numpy.int32(num_rows)
    for _ in range(repeat):
        # NOTE(review): `start` and `end` are not defined in this function;
        # presumably module-level CUDA events -- confirm.
        start.record()
        kernel(data,
               indices,
               indptr,
               cuda.Out(result),
               num_rows,
               block=block,
               grid=grid,
               texrefs=tex)
        end.record()
        end.synchronize()
        time_list.append(start.time_till(end))
    return (result, time_list)
开发者ID:fivitti,项目名称:SMDV,代码行数:60,代码来源:matrixmultiplication.py

示例3: allocation

 def allocation(self):
     """Run the parent allocation, then create the device-side buffers.

     ``ul_gpu``, ``ul_prev_gpu``, ``ul_tmp_gpu`` and ``kl_gpu`` are each
     created by a separate upload of ``self.ul``; ``el_sum_gpu`` is created
     from a zero vector of length ``self.ne``.
     """
     super(DGModalGpu, self).allocation()
     for attr in ('ul_gpu', 'ul_prev_gpu', 'ul_tmp_gpu', 'kl_gpu'):
         # One cuda.to_device call per attribute, as in a hand-written list.
         setattr(self, attr, cuda.to_device(self.ul))
     self.el_sum_gpu = cuda.to_device(np.zeros(self.ne))
开发者ID:wbkifun,项目名称:my_research,代码行数:7,代码来源:dg_modal_gpu.py

示例4: test_simple_kernel_2

    def test_simple_kernel_2(self):
        """Check an element-wise multiply kernel with device-side inputs.

        The inputs are uploaded with ``drv.to_device``. The kernel is run
        once on the full arrays, then once more with the first input
        pointer advanced by one element, and each result is compared with
        the same product computed on the host.
        """
        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x;
          dest[i] = a[i] * b[i];
        }
        """)
        kernel = mod.get_function("multiply_them")

        n = 400
        lhs = np.random.randn(n).astype(np.float32)
        rhs = np.random.randn(n).astype(np.float32)
        lhs_gpu = drv.to_device(lhs)
        rhs_gpu = drv.to_device(rhs)

        # Full-length run: one thread per element.
        product = np.zeros_like(lhs)
        kernel(drv.Out(product), lhs_gpu, rhs_gpu, block=(n, 1, 1))
        assert la.norm(product-lhs*rhs) == 0

        drv.Context.synchronize()

        # Offset run: skip the first element of lhs by adding one item size
        # to its device address, so only n-1 elements are processed.
        product = np.zeros_like(lhs)
        shifted_lhs = np.intp(lhs_gpu)+lhs.itemsize
        kernel(drv.Out(product), shifted_lhs, rhs_gpu, block=(n-1, 1, 1))

        assert la.norm((product[:-1]-lhs[1:]*rhs[:-1])) == 0
开发者ID:davidweichiang,项目名称:pycuda,代码行数:30,代码来源:test_driver.py

示例5: train_gpu

    def train_gpu(self, num_iter, model_file_path):
        """Run ``num_iter`` launches of the 'train_sg' kernel and save the result.

        When ``self.batch`` is 0, ``self.syn1`` is uploaded and one
        ``WordRep`` record per dictionary entry is created in a single
        device allocation.  Every call then uploads ``self.sent_reps``,
        samples one word index per sentence for each iteration, launches
        the kernel, copies the sentence representations back into
        ``self.sent_reps`` and writes them to ``model_file_path`` with
        ``pickle_dump``.
        """
        if self.batch == 0:
            # One-off uploads: the syn1 array and the per-word records.
            self.syn1_gpu = cuda.to_device(self.syn1)
            self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
            self.word_reps = {}
            # Records are laid out back to back, WordRep.memsize bytes apart.
            slot_addr = int(self.base_word_rep)
            for w_idx, word in sorted(self.dictionary.items()):
                rep = self.words_rep[word]
                code = 1-2*rep[0].astype(dtype=np.int32)
                point = rep[1].astype(dtype=np.int32)
                self.word_reps[w_idx] = WordRep(code, point, slot_addr)
                slot_addr += WordRep.memsize
            print("GPU transfers done.")

        self.sent_reps_gpu = cuda.to_device(self.sent_reps)

        # Each sentence becomes a list of token ids, keeping only the words
        # that pass the `in self.dictionary` test.
        token2id = self.dictionary.token2id
        idx_sentences = []
        for sentence in self.sentences:
            idx_sentences.append(
                [token2id[word] for word in sentence if word in self.dictionary])

        train_sg = self.kernel_str.get_function("train_sg")
        sampled = np.empty(self.num_sents, dtype=np.int32)
        launch = dict(block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
        for _ in range(num_iter):
            # Draw one random word per sentence, then upload the sample.
            for sent_no in range(self.num_sents):
                sampled[sent_no] = random.choice(idx_sentences[sent_no])
            sampled_gpu = cuda.to_device(sampled)
            train_sg(self.sent_reps_gpu, np.float32(self.alpha), sampled_gpu,
                     self.base_word_rep, self.syn1_gpu, **launch)

        self.sent_reps = cuda.from_device(
            self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
        pickle_dump(self.sent_reps, model_file_path)
开发者ID:ustbliubo2014,项目名称:DeepLearn,代码行数:35,代码来源:paragraph_vector.py

示例6: get_phir_gpu

def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """Launch the 'get_phir' CUDA kernel and return its output.

    Parameters
    ----------
    XK, XV : sequences with one value per surface element
        (N = len(XK) elements are used).
    surface : object providing Area, normal, sortSource, xk, wk, xj and
        the device arrays passed to the kernel (xjDev, yjDev, ...).
    field : object providing xq and the device arrays xq_gpu, yq_gpu,
        zq_gpu.
    par_reac : object providing REAL (the scalar type used for uploads),
        K, BSZ and threshold.
    kernel : compiled module exposing get_function("get_phir").

    Returns
    -------
    (phir_cpu, AI_int) : the kernel output read back as an array of
        len(field.xq) REAL values, and the sum of the per-point int32
        counters the kernel fills in.
    """
    REAL = par_reac.REAL
    K = par_reac.K
    Nq = len(field.xq)
    N = len(XK)

    w = numpy.asarray(getWeights(K))
    XK_arr = numpy.asarray(XK)
    XV_arr = numpy.asarray(XV)
    area = numpy.asarray(surface.Area)
    normal = numpy.asarray(surface.normal)

    # Every element is expanded into K consecutive entries: entry i belongs
    # to element i // K and uses weight i % K.  Floor division is explicit so
    # the indices stay integers under true division as well.
    flat = numpy.arange(N*K)
    elem = flat // K
    gauss = flat % K

    weight_area_V = XV_arr[elem]*w[gauss]*area[elem]
    weight_area_K = XK_arr[elem]*w[gauss]*area[elem]
    X_V = weight_area_V
    X_Kx = weight_area_K*normal[elem, 0]
    X_Ky = weight_area_K*normal[elem, 1]
    X_Kz = weight_area_K*normal[elem, 2]
    X_Kc = XK_arr[elem]
    X_Vc = XV_arr[elem]

    # Reorder with surface.sortSource and upload as REAL.
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    # Enough blocks of BSZ threads to cover all Nq points.
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu,
             surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    # from_device allocates its own host array, so no preallocation needed.
    AI_int = numpy.sum(cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32))
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
开发者ID:cdcooper84,项目名称:pygbe,代码行数:59,代码来源:projection.py

示例7: __init__

 def __init__(self, code, point, struct_ptr):
     """Upload ``code`` and ``point`` and fill the device record at ``struct_ptr``.

     Three values are written into the record: ``code.size`` as an int32
     at offset 0, the device address of the uploaded code buffer at
     offset 8, and the device address of the uploaded point buffer one
     ``np.intp`` width after that.
     """
     self.code = cuda.to_device(code)
     self.point = cuda.to_device(point)
     self.code_shape, self.code_dtype = code.shape, code.dtype
     self.point_shape, self.point_dtype = point.shape, point.dtype

     record = int(struct_ptr)
     # NOTE(review): the fixed offset 8 presumably mirrors the padded layout
     # of the struct read by the kernel -- confirm against the CUDA source.
     code_slot = record + 8
     point_slot = code_slot + np.intp(0).nbytes
     cuda.memcpy_htod(record, np.int32(code.size))
     cuda.memcpy_htod(code_slot, np.intp(int(self.code)))
     cuda.memcpy_htod(point_slot, np.intp(int(self.point)))
开发者ID:Huskyeder,项目名称:ParagraphVec,代码行数:8,代码来源:paragraph_vector.py

示例8: sync_to_device

 def sync_to_device(self):
     """Upload the object list and its count, and return a device pointer pair.

     Each entry of ``self.object_list`` is converted with ``as_array()``
     and the stacked result is uploaded, followed by a one-element int32
     array holding ``self.object_count``.  A two-element ``np.intp`` array
     built from those two device allocations is uploaded last; its
     allocation is stored in ``self.device_ptr`` and returned.
     """
     host_objects = np.array([f.as_array() for f in self.object_list])
     self.object_array = host_objects
     self.d_object_array = cuda.to_device(host_objects)

     host_count = np.array([self.object_count], dtype=np.int32)
     self.d_object_count = cuda.to_device(host_count)

     pointer_pair = np.array([self.d_object_array, self.d_object_count],
                             dtype=np.intp)
     self.device_ptr = cuda.to_device(pointer_pair)
     return self.device_ptr
开发者ID:cfobel,项目名称:pycuda_helpers,代码行数:10,代码来源:struct_container.py

示例9: nlargest

  def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.

    The search runs in several kernel passes, each handling at most
    self.maxHeapFloats outputs, so that the pass sizes add up to n + 1.

    @param n: number of outputs which should be above the threshold
    @type n: int

    @return list of thresholds, in order of individuals, which delimit the top
            n output values
    """
    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    # Split the request into chunks of at most maxHeapFloats outputs.
    passSizes = []
    while n > 0:
      nextSize = min(self.maxHeapFloats, n)
      passSizes.append(nextSize)
      n -= nextSize

    log.debug("pass sizes: %r", passSizes)

    # Device state shared by all passes: one threshold (starting at +inf)
    # and one counter (starting at 0) per individual.
    thresholdsMat = np.ones(shape=(self.popSize,),
                            dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)

    thresholdCounts = np.zeros(shape=(self.popSize,),
                               dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
      log.debug("begin pass size %d", passSize)
      self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                        self.outputs,
                                        self.trainSet.size,
                                        self.popSize,
                                        passSize,
                                        self.thresholds,
                                        self.thresholdCounts)

      driver.Context.synchronize()

      # Only pay for the device-to-host copies when they will be logged.
      if log.isEnabledFor(logging.DEBUG):
        thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
        log.debug("thresholds: %s", str(thresholdsMat))

        thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
        log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
开发者ID:cpatulea,项目名称:evolution,代码行数:53,代码来源:ann.py

示例10: P2PKt_gpu

def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu,
            surf, LorY, w, param, timing, kernel):
    """Launch the 'P2PKt' CUDA kernel and record how long each stage took.

    ``m`` and ``mKtc`` are uploaded as ``param.REAL`` arrays, the kernel is
    launched with the device arrays held by ``surfSrc`` / ``surfTar``, and
    the per-target int32 counters filled by the kernel are read back and
    added to ``timing.AI_int``.  Elapsed times (``time_till`` scaled by
    1e-3) are accumulated in ``timing.time_trans`` for the two transfer
    stages and in ``timing.time_P2P`` for the kernel stage.

    ``w`` is accepted but not used by this function.

    Returns the three device arrays ``Ktx_gpu, Kty_gpu, Ktz_gpu`` that were
    passed in (they are handed to the kernel as its first arguments).
    """
    if param.GPU==1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    # Stage 1: host-to-device transfer of the input vectors.
    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    # Stage 2: kernel launch.
    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays

    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu,
                surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mKtcDev,
                surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev,
                surfSrc.vertexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list),
                numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int,
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3

    # Stage 3: read the counters back.  from_device returns a new host
    # array, so nothing is preallocated inside the timed region, and
    # numpy.sum reduces it without a Python-level loop.
    tic.record()
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += numpy.sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
开发者ID:cdcooper84,项目名称:pygbe,代码行数:52,代码来源:FMMutils.py

示例11: K

 def K(self, Q, P, angles, quadratures):
     """Evaluate the K kernel on the GPU for the rows of ``Q`` and return K / L.

     The cosines and sines of ``angles`` are copied into the module globals
     "cos_phi" and "sin_phi".  ``self.K_gpu`` writes partial results into a
     scratch buffer, ``self.reduction_gpu`` reduces them into one float32
     per row of ``Q``, and that array divided by ``self.L`` is returned.
     """
     threads = 1024
     # NOTE(review): 4 is presumably sizeof(float32) -- confirm.
     item_bytes = 4

     cos_global = self.mod_K.get_global("cos_phi")[0]
     drv.memcpy_htod(cos_global, cos(angles).astype(scipy.float32))
     sin_global = self.mod_K.get_global("sin_phi")[0]
     drv.memcpy_htod(sin_global, sin(angles).astype(scipy.float32))

     Nx = Q.shape[0]
     # Number of complete 1024-sized chunks of quadratures.
     Ny = quadratures.size // threads
     result = scipy.empty((Nx,), dtype=scipy.float32)
     partial = drv.mem_alloc(item_bytes*Ny*Nx)
     Q_gpu = drv.to_device(Q)
     P_gpu = drv.to_device(P)

     self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, partial,
                block=(1, threads, 1), grid=(Nx, Ny),
                shared=threads*item_bytes)
     self.reduction_gpu(partial, drv.Out(result),
                        block=(1, Ny, 1), grid=(Nx, 1),
                        shared=Ny*item_bytes)
     return result/self.L
开发者ID:martina88esposito,项目名称:tomohowk,代码行数:13,代码来源:tomography_cuda.py

示例12: __init__

    def __init__(self):
        """Create the stream, the page-locked pool and the shared device buffers."""
        self.stream = cuda.Stream()
        self.pool = pycuda.tools.PageLockedMemoryPool()
        self._clear()

        # The buffers below are shared through the slots/ringbuffer
        # mechanism, so one set can serve any number of launches, genomes
        # and render kernels.  Seeds are self-synchronizing, which is why
        # they are not tied to either stream object.
        ringbuffer_init = np.array([0, 0], dtype=u32)
        self.d_rb = cuda.to_device(ringbuffer_init)

        seed_count = util.DEFAULT_RB_SIZE * 256
        self.d_seeds = cuda.to_device(mwc.make_seeds(seed_count))

        # The points buffer holds 16 bytes for each of the seed_count slots.
        self._len_d_points = seed_count * 16
        self.d_points = cuda.mem_alloc(self._len_d_points)
开发者ID:stevenrobertson,项目名称:cuburn,代码行数:14,代码来源:render.py

示例13: M2P_gpu

def M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, surf, ind0, param, LorY, timing, kernel):
    """Launch the 'M2P' CUDA kernel and add the elapsed time to timing.time_M2P.

    The ``M`` and ``Md`` arrays of every tree cell listed in
    ``surfTar.M2P_list[surf, :M2P_size]`` are packed back to back
    (``param.Nm`` entries per cell), uploaded as ``param.REAL`` arrays and
    passed to the kernel together with the device arrays held by
    ``surfTar`` and ``ind0``.  Packing, upload and launch are all inside
    the timed region.

    Returns the device arrays ``K_gpu, V_gpu`` that were passed in (they
    are handed to the kernel as its first arguments).
    """
    if param.GPU==1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL

    tic.record()
    M2P_size = surfTar.offsetMlt[surf,len(surfTar.twig)]
    MSort  = numpy.zeros(param.Nm*M2P_size)
    MdSort = numpy.zeros(param.Nm*M2P_size)

    # Cell number i of the list fills the slice [i*Nm, (i+1)*Nm).
    for i, C in enumerate(surfTar.M2P_list[surf,0:M2P_size]):
        first = i*param.Nm
        MSort[first:first+param.Nm] = surfSrc.tree[C].M
        MdSort[first:first+param.Nm] = surfSrc.tree[C].Md

    MDev = cuda.to_device(MSort.astype(REAL))
    MdDev = cuda.to_device(MdSort.astype(REAL))

    # GPU arrays are flattened, need to point to first element
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays
    # NOTE(review): the list offset is derived from P2P_list although this
    # is the M2P stage -- confirm this is intended.
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays

    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    multipole_gpu = kernel.get_function("M2P")

    multipole_gpu(K_gpu, V_gpu, surfTar.offMltDev, surfTar.sizeTarDev,
                    surfTar.xcDev, surfTar.ycDev, surfTar.zcDev,
                    MDev, MdDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
                    ind0.indexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list), REAL(param.kappa),
                    numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), numpy.int32(LorY),
                    block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_M2P += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
开发者ID:cdcooper84,项目名称:pygbe,代码行数:48,代码来源:FMMutils.py

示例14: batch_indexing

    def batch_indexing(self, planes, data_points):
        """Run the indexing kernel over ``data_points`` and return its output.

        Every entry of ``planes`` is uploaded separately; the device
        addresses of those uploads are collected into a uint64 array that
        is itself copied to the device and passed to the kernel as its
        first argument.  ``data_points`` is passed as an input buffer and a
        zero-initialised uint64 array of ``data_points.shape[0] // 128``
        entries as the output buffer.

        The 'preparing' and 'cudaing' phases are bracketed with
        ``benchmark_begin`` / ``benchmark_end``.

        Returns the uint64 output array filled in by the kernel.
        """
        # Floor division: the quotient is used as an array length below.
        # NOTE(review): presumably each data point spans 128 entries of
        # data_points -- confirm against the kernel.
        data_size = data_points.shape[0] // 128

        self.benchmark_begin('preparing')

        # Holds a reference to every per-plane allocation for the duration
        # of the call, since only their integer addresses go to the kernel.
        gpu_alloc_objs = []
        planes_addresses = []
        for plane in planes:
            plane_addr = drv.to_device(plane)
            gpu_alloc_objs.append(plane_addr)
            planes_addresses.append(int(plane_addr))

        planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64)

        # 64 bit addressing space. each address costs 8 bytes
        planes_arrays_gpu = drv.mem_alloc(planes_np_addresses.shape[0] * 8)
        drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses)

        projections = numpy.zeros(data_size).astype(numpy.uint64)
        length = numpy.array([data_size]).astype(numpy.uint64)

        # Single parenthesised argument: prints the same text on Python 2
        # and Python 3.
        print("total: " + str(data_size) + " data points to indexing.")

        self.benchmark_end('preparing')
        self.benchmark_begin('cudaing')

        self.indexing_kernel(
            planes_arrays_gpu, drv.In(data_points), drv.Out(projections), drv.In(length),
            block = self.block, grid = self.grid)

        self.benchmark_end('cudaing')

        return projections
开发者ID:viirya,项目名称:fastdict,代码行数:60,代码来源:cuda_indexing.py

示例15: index_list_backend

    def index_list_backend(self, ilists):
        """Flatten ``ilists`` and upload them as the narrowest fitting index type.

        All lists must have the same length, equal to
        ``self.plan.dofs_per_face``.  The flattened indices are stored as
        uint16 when the largest one is 256 or more and as uint8 otherwise;
        an assertion checks that the cast did not change any value.

        Returns a ``GPUIndexLists`` carrying the chosen type, a typedef of
        that type named "index_list_entry_t", the device allocation and
        its size in bytes.
        """
        from pytools import flatten, single_valued

        common_length = single_valued(len(il) for il in ilists)
        assert common_length == self.plan.dofs_per_face

        from cgen import POD, Typedef

        raw = numpy.array(list(flatten(ilists)))
        tp = numpy.uint16 if numpy.max(raw) >= 256 else numpy.uint8

        flat_ilists = numpy.asarray(raw, dtype=tp)
        # Guards against indices that do not survive the narrowing cast.
        assert (flat_ilists == raw).all()

        entry_typedef = Typedef(POD(tp, "index_list_entry_t"))
        return GPUIndexLists(
            type=tp,
            code=[entry_typedef],
            device_memory=cuda.to_device(flat_ilists),
            bytes=flat_ilists.nbytes,
        )
开发者ID:gimac,项目名称:hedge,代码行数:26,代码来源:fluxgather.py


注：本文中的pycuda.driver.to_device函数示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。