This article collects typical usage examples of the Python function numba.cuda.stream. If you are unsure what the stream function does or how to call it, the curated code samples below should help.
The following shows 15 code examples of the stream function, ordered by popularity by default.
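Before the examples, here is a minimal sketch of the pattern they all share: create a stream, queue the host-to-device copy, the kernel launch, and the device-to-host copy on that stream, then synchronize. The kernel double_elements and the helper run_on_stream are illustrative names, not part of any example below.

import numpy as np
from numba import cuda

@cuda.jit
def double_elements(arr):
    # toy kernel: multiply every element by 2 in place
    i = cuda.grid(1)
    if i < arr.shape[0]:
        arr[i] *= 2.0

def run_on_stream(host_data):
    stream = cuda.stream()
    d_data = cuda.to_device(host_data, stream=stream)    # H2D copy queued on the stream
    nthreads = 128
    nblocks = (host_data.size + nthreads - 1) // nthreads
    double_elements[nblocks, nthreads, stream](d_data)    # launch on the same stream
    d_data.copy_to_host(host_data, stream=stream)         # D2H copy queued on the stream
    stream.synchronize()                                  # block until the queue has drained
    return host_data

# usage: run_on_stream(np.arange(1024, dtype=np.float64))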
Example 1: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example 2: test_gufunc_stream
def test_gufunc_stream(self):
    #cuda.driver.flush_pending_free()
    matrix_ct = 1001  # an odd number to test thread/block division in CUDA
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5)

    ts = time()
    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)

    dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
    dC = gufunc(dA, dB, out=dC, stream=stream)
    C = dC.copy_to_host(stream=stream)
    stream.synchronize()
    tcuda = time() - ts

    ts = time()
    Gold = ut.matrix_multiply(A, B)
    tcpu = time() - ts

    stream_speedups.append(tcpu / tcuda)
    self.assertTrue(np.allclose(C, Gold))
Example 3: setup
def setup(self):
    self.stream = cuda.stream()
    self.f32 = np.zeros(self.n, dtype=np.float32)
    self.d_f32 = cuda.to_device(self.f32, self.stream)
    self.f64 = np.zeros(self.n, dtype=np.float64)
    self.d_f64 = cuda.to_device(self.f64, self.stream)
    self.stream.synchronize()
Example 4: test_func
def test_func(self):
    np.random.seed(42)
    A = np.array(np.random.random((n, n)), dtype=np.float32)
    B = np.array(np.random.random((n, n)), dtype=np.float32)
    C = np.empty_like(A)

    s = time()
    stream = cuda.stream()
    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
        dC.copy_to_host(C, stream)
    e = time()
    tcuda = e - s

    # Host compute
    s = time()
    Cans = np.dot(A, B)
    e = time()
    tcpu = e - s

    # Check result
    np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example 5: test_laplace_small
def test_laplace_small(self):
    NN = 256
    NM = 256

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM

    iter_max = 1000
    tol = 1.0e-6
    error = 1.0

    for j in range(n):
        A[j, 0] = 1.0
        Anew[j, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    timer = time.time()
    iter = 0

    blockdim = (tpb, tpb)
    griddim = (NN // blockdim[0], NM // blockdim[1])

    error_grid = np.zeros(griddim)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)        # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)  # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iter < iter_max:
        self.assertTrue(error_grid.dtype == np.float64)

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
        derror_grid.copy_to_host(error_grid, stream=stream)

        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
        tmp = dA
        dA = dAnew
        dAnew = tmp

        if iter % 100 == 0:
            print("%5d, %0.6f (elapsed: %f s)" %
                  (iter, error, time.time() - timer))

        iter += 1

    runtime = time.time() - timer
    print(" total: %f s" % runtime)
Example 6: test
def test(ty):
    print("Test %s" % ty)
    # size must be an integer; int() keeps this working on current NumPy
    data = np.array(np.random.random(int(1e6 + 1)), dtype=ty)

    ts = time()
    stream = cuda.stream()
    device_data = cuda.to_device(data, stream)
    dresult = cuda_ufunc(device_data, device_data, stream=stream)
    result = dresult.copy_to_host()
    stream.synchronize()
    tnumba = time() - ts

    ts = time()
    gold = np_ufunc(data, data)
    tnumpy = time() - ts

    print("Numpy time: %fs" % tnumpy)
    print("Numba time: %fs" % tnumba)

    if tnumba < tnumpy:
        print("Numba is FASTER by %fx" % (tnumpy / tnumba))
    else:
        print("Numba is SLOWER by %fx" % (tnumba / tnumpy))

    self.assertTrue(np.allclose(gold, result), (gold, result))
Example 7: reduce
def reduce(self, arg, stream=0):
    assert len(list(self.functions.keys())[0]) == 2, "must be a binary ufunc"
    assert arg.ndim == 1, "must use 1d array"

    n = arg.shape[0]
    gpu_mems = []

    if n == 0:
        raise TypeError("Reduction on an empty array.")
    elif n == 1:  # nothing to do
        return arg[0]

    # always use a stream
    stream = stream or cuda.stream()
    with stream.auto_synchronize():
        # transfer memory to device if necessary
        if devicearray.is_cuda_ndarray(arg):
            mem = arg
        else:
            mem = cuda.to_device(arg, stream)
        # do reduction
        out = self.__reduce(mem, gpu_mems, stream)
        # use a small buffer to store the result element
        buf = np.array((1,), dtype=arg.dtype)
        out.copy_to_host(buf, stream=stream)

    return buf[0]
Example 8: test_func
def test_func(self):
    A = np.array(np.random.random((n, n)), dtype=np.float32)
    B = np.array(np.random.random((n, n)), dtype=np.float32)
    C = np.empty_like(A)

    print("N = %d x %d" % (n, n))

    s = time()
    stream = cuda.stream()
    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
        dC.copy_to_host(C, stream)
    e = time()
    tcuda = e - s

    # Host compute
    Amat = np.matrix(A)
    Bmat = np.matrix(B)

    s = time()
    Cans = Amat * Bmat
    e = time()
    tcpu = e - s

    print('cpu: %f' % tcpu)
    print('cuda: %f' % tcuda)
    print('cuda speedup: %.2fx' % (tcpu / tcuda))

    # Check result
    self.assertTrue(np.allclose(C, Cans))
Example 9: test_func
def test_func(self):
    @cuda.jit(argtypes=[float32[:, ::1], float32[:, ::1], float32[:, ::1]])
    def cu_square_matrix_mul(A, B, C):
        sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
        sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x
        by = cuda.blockIdx.y
        bw = cuda.blockDim.x
        bh = cuda.blockDim.y

        x = tx + bx * bw
        y = ty + by * bh

        acc = float32(0)  # forces all the math to be f32

        for i in range(bpg):
            if x < n and y < n:
                sA[ty, tx] = A[y, tx + i * tpb]
                sB[ty, tx] = B[ty + i * tpb, x]

            cuda.syncthreads()

            if x < n and y < n:
                for j in range(tpb):
                    acc += sA[ty, j] * sB[j, tx]

            cuda.syncthreads()

        if x < n and y < n:
            C[y, x] = acc

    np.random.seed(42)
    A = np.array(np.random.random((n, n)), dtype=np.float32)
    B = np.array(np.random.random((n, n)), dtype=np.float32)
    C = np.empty_like(A)

    s = time()
    stream = cuda.stream()
    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
        dC.copy_to_host(C, stream)
    e = time()
    tcuda = e - s

    # Host compute
    s = time()
    Cans = np.dot(A, B)
    e = time()
    tcpu = e - s

    # Check result
    np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example 10: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)

        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example 11: newthread
def newthread():
    cuda.select_device(0)
    stream = cuda.stream()
    A = np.arange(100)
    dA = cuda.to_device(A, stream=stream)
    stream.synchronize()
    del dA
    del stream
    cuda.close()
Example 12: test_stream_bind
def test_stream_bind(self):
    stream = cuda.stream()
    with stream.auto_synchronize():
        arr = cuda.device_array(
            (3, 3),
            dtype=np.float64,
            stream=stream)

    self.assertEqual(arr.bind(stream).stream, stream)
    self.assertEqual(arr.stream, stream)
Example 13: getGraphFromEdges_gpu
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges=None,
                          MAX_TPB=512, stream=None):
    """
    All input (except MAX_TPB and stream) are device arrays.
    edges   : array with the IDs of the edges that will be part of the new graph
    n_edges : array of 1 element with the number of valid edges in the edges
              array; if n_edges < size of edges, the last elements of the
              edges array are not considered
    """

    # check if number of valid edges was received
    if n_edges is None:
        edges_size = edges.size
        n_edges = cuda.to_device(np.array([edges_size], dtype=np.int32))
    else:
        edges_size = int(n_edges.getitem(0))

    # check if a stream was received, if not create one
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    new_n_edges = edges_size * 2

    # allocate memory for new graph
    ndest = cuda.device_array(new_n_edges, dtype=dest.dtype, stream=myStream)
    nweight = cuda.device_array(new_n_edges, dtype=weight.dtype, stream=myStream)
    nfe = cuda.device_array_like(fe, stream=myStream)
    nod = cuda.device_array_like(od, stream=myStream)

    # fill new outdegree with zeros
    vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB)
    memSet[vertexGrid, MAX_TPB, myStream](nod, 0)

    # count all edges of new array and who they belong to
    edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB)
    countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod)

    # get new first_edge array from new outdegree
    nfe.copy_to_device(nod, stream=myStream)
    ex_prefix_sum_gpu(nfe, MAX_TPB=MAX_TPB, stream=myStream)

    # copy new first_edge to top_edge to serve as pointer in adding edges
    top_edge = cuda.device_array_like(nfe, stream=myStream)
    top_edge.copy_to_device(nfe, stream=myStream)

    addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od,
                                          top_edge, ndest, nweight)

    del top_edge
    #del dest, weight, fe, od

    return ndest, nweight, nfe, nod
Example 14: test_laplace_small
def test_laplace_small(self):
    if config.ENABLE_CUDASIM:
        NN, NM = 4, 4
        iter_max = 20
    else:
        NN, NM = 256, 256
        iter_max = 1000

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM

    tol = 1.0e-6
    error = 1.0

    for j in range(n):
        A[j, 0] = 1.0
        Anew[j, 0] = 1.0

    timer = time.time()
    iter = 0

    blockdim = (tpb, tpb)
    griddim = (NN // blockdim[0], NM // blockdim[1])

    error_grid = np.zeros(griddim)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)        # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)  # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iter < iter_max:
        self.assertTrue(error_grid.dtype == np.float64)

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
        derror_grid.copy_to_host(error_grid, stream=stream)

        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
        tmp = dA
        dA = dAnew
        dAnew = tmp

        iter += 1

    runtime = time.time() - timer
Example 15: _run_copies
def _run_copies(self, A):
    A0 = np.copy(A)

    stream = cuda.stream()
    ptr = cuda.to_device(A, copy=False, stream=stream)
    ptr.copy_to_device(A, stream=stream)
    ptr.copy_to_host(A, stream=stream)
    stream.synchronize()

    self.assertTrue(np.allclose(A, A0))