

C++ Func::gpu_tile Method Code Examples

This article collects typical usage examples of the C++ method Func::gpu_tile. If you are wondering how to use Func::gpu_tile in C++, what it does, or what calling it looks like in practice, the curated code examples below may help. You can also explore further usage examples for the Func class that this method belongs to.


Below are 13 code examples of the Func::gpu_tile method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C++ code examples.
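Before the individual examples, here is a minimal sketch of the most common gpu_tile pattern. It is not taken from any of the examples below; it assumes a Halide version whose gpu_tile signature takes explicit inner Vars (the form used in Examples 1, 7, 11, and 13), while older releases also accept the shorter f.gpu_tile(x, y, 16, 16) form seen in Examples 2 through 6.

// Minimal sketch (assumption: a Halide build with the explicit-inner-Var
// gpu_tile signature, matching Examples 1, 7, 11 and 13 below).
#include "Halide.h"
using namespace Halide;

int main() {
    Var x("x"), y("y"), xi("xi"), yi("yi");
    Func f("f");
    f(x, y) = x + y;

    Target target = get_jit_target_from_environment();
    if (target.has_gpu_feature()) {
        // Split the domain into 16x16 tiles; the outer tile indices are
        // mapped to GPU blocks and the inner indices (xi, yi) to GPU
        // threads. Roughly equivalent to:
        //   f.tile(x, y, xo, yo, xi, yi, 16, 16)
        //    .gpu_blocks(xo, yo)
        //    .gpu_threads(xi, yi);
        f.gpu_tile(x, y, xi, yi, 16, 16);
    } else {
        // CPU fallback when no GPU target feature is enabled.
        f.vectorize(x, 8);
    }

    Buffer<int> out = f.realize(64, 64, target);
    return 0;
}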

Example 1: main

int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (!target.has_feature(Target::OpenCL)) {
        printf("This test requires opencl.\n");
        return 0;
    }

    // These calls are only available for AOT-compiled code:
    //
    //   halide_set_custom_get_symbol(my_get_symbol_impl);
    //   halide_set_custom_load_library(my_load_library_impl);
    //   halide_set_custom_get_library_symbol(my_get_library_symbol_impl);
    //
    // For JIT code, we must use JITSharedRuntime::set_default_handlers().

    Internal::JITHandlers handlers;
    handlers.custom_get_symbol = my_get_symbol_impl;
    handlers.custom_load_library = my_load_library_impl;
    handlers.custom_get_library_symbol = my_get_library_symbol_impl;
    Internal::JITSharedRuntime::set_default_handlers(handlers);

    Var x, y, xi, yi;
    Func f;
    f(x, y) = cast<int32_t>(x + y);
    f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::OpenCL);
    f.set_error_handler(my_error_handler);

    Buffer<int32_t> out = f.realize(64, 64, target);

    fprintf(stderr, "Should not get here.\n");
    return -1;
}
Author: jiapei100, Project: Halide, Lines: 32, Source: load_library.cpp

Example 2: main

int main(int argc, char **argv) {
    // Make sure that freeing GPU buffers doesn't occur before the
    // computation that is filling them completes.
    Func f;
    Var x, y;
    RDom r(0, 100);
    f(x, y) = sum(sqrt(sqrt(sqrt(sqrt(x+y+r)))));

    Target t = get_jit_target_from_environment();

    if (t.has_feature(Target::OpenCL) ||
        t.has_feature(Target::CUDA)) {
        f.gpu_tile(x, y, 16, 16);

        // This allocates a buffer, does gpu compute into it, and then
        // frees it (calling dev_free) possibly before the compute is
        // done.
        for (int i = 0; i < 10; i++) {
            f.realize(1024, 1024, t);
        }
    } else {
        // Skip this test if gpu target not enabled (it's pretty slow on a cpu).
    }

    printf("Success!\n");
    return 0;
}
Author: DoDNet, Project: Halide, Lines: 27, Source: gpu_free_sync.cpp

Example 3: schedule_for_gpu

    // Now a schedule that uses CUDA or OpenCL.
    void schedule_for_gpu() {
        // We make the decision about whether to use the GPU for each
        // Func independently. If you have one Func computed on the
        // CPU, and the next computed on the GPU, Halide will do the
        // copy-to-gpu under the hood. For this pipeline, there's no
        // reason to use the CPU for any of the stages. Halide will
        // copy the input image to the GPU the first time we run the
        // pipeline, and leave it there to reuse on subsequent runs.

        // As before, we'll compute the LUT once at the start of the
        // pipeline.
        lut.compute_root();

        // Let's compute the look-up-table using the GPU in 16-wide
        // one-dimensional thread blocks. First we split the index
        // into blocks of size 16:
        Var block, thread;
        lut.split(i, block, thread, 16);
        // Then we tell cuda that our Vars 'block' and 'thread'
        // correspond to CUDA's notions of blocks and threads, or
        // OpenCL's notions of thread groups and threads.
        lut.gpu_blocks(block)
           .gpu_threads(thread);

        // This is a very common scheduling pattern on the GPU, so
        // there's a shorthand for it:

        // lut.gpu_tile(i, 16);

        // Func::gpu_tile method is similar to Func::tile, except that
        // it also specifies that the tile coordinates correspond to
        // GPU blocks, and the coordinates within each tile correspond
        // to GPU threads.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Compute curved in 2D 8x8 tiles using the GPU.
        curved.gpu_tile(x, y, 8, 8);

        // This is equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);

        // We'll leave sharpen as inlined into curved.

        // Compute the padded input as needed per GPU block, storing the
        // intermediate result in shared memory. Var::gpu_blocks, and
        // Var::gpu_threads exist to help you schedule producers within
        // GPU threads and blocks.
        padded.compute_at(curved, Var::gpu_blocks());

        // Use the GPU threads for the x and y coordinates of the
        // padded input.
        padded.gpu_threads(x, y);

        // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
        // not enabled by default. We have to construct a Target
        // object, enable one of them, and then pass that target
        // object to compile_jit. Otherwise your CPU will very slowly
        // pretend it's a GPU, and use one thread per output pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or CUDA.

        // We'll enable OpenCL here, because it tends to give better
        // performance than CUDA, even with NVidia's drivers, because
        // NVidia's open source LLVM backend doesn't seem to do all
        // the same optimizations their proprietary compiler does.
        target.features |= Target::OpenCL;

        // Uncomment the next line and comment out the line above to
        // try CUDA instead.
        // target.features |= Target::CUDA;

        // If you want to see all of the OpenCL or CUDA API calls done
        // by the pipeline, you can also enable the GPUDebug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        //target.features |= Target::GPUDebug;

        curved.compile_jit(target);
    }
Author: kree-colemcalughlin, Project: Halide, Lines: 92, Source: lesson_12_using_the_gpu.cpp

Example 4: main

int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    {
        Func f;
        Var x, y, z;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu.

        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        g.gpu_tile(x, y, 16, 8);
        f.compute_at(g, Var::gpu_blocks());

        for (int i = 0; i < passes; i++) {
            f.update(i*4 + 0).gpu_threads(y);
            f.update(i*4 + 1).gpu_threads(y);
            f.update(i*4 + 2).gpu_threads(x);
            f.update(i*4 + 3).gpu_threads(x);
        }

        Image<int> out = g.realize(100, 100);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 7*100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return -1;
                }
            }
        }

    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.

        Func f;
        Var x, y, z;
        f(x, y) = undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);

        g.gpu_tile(x, y, 8, 8);
        f.compute_at(g, Var::gpu_blocks());

        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be two thread barriers: one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(2));

        Image<int> out = g.realize(100, 100);
    }

    printf("Success!\n");
    return 0;
}
Author: josephsieh, Project: Halide, Lines: 92, Source: gpu_thread_barrier.cpp

Example 5: main

int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();

    if (1) {
        // Test a tuple reduction on the gpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);

        // Updates to a reduction are atomic.
        f(x, y) = Tuple(f(x, y)[1]*2, f(x, y)[0]*2);
        // now equals ((x - y)*2, (x + y)*2)

        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
            f.update().gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
            f.update().hexagon(y).vectorize(x, 32);
        }

        Realization result = f.realize(1024, 1024);

        Image<int> a = result[0], b = result[1];

        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x - y)*2;
                int correct_b = (x + y)*2;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // Now test one that alternates between cpu and gpu per update step
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);

        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the pure step and the odd update steps on the gpu
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
        }
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            } else {
                f.update(i);
            }
        }

        Realization result = f.realize(1024, 1024);

        Image<int> a = result[0], b = result[1];

        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x + y) + 10;
                int correct_b = (x - y) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }

    }

    if (1) {
        // Same as above, but switches which steps are gpu and cpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);

        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the even update steps on the gpu
//......... (part of the code is omitted here) .........
Author: Mengke-Yuan, Project: Halide, Lines: 101, Source: tuple_reduction.cpp

Example 6: main

int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("No gpu target enabled. Skipping test.\n");
        return 0;
    }

    Var x, y, z, w;
    Image<int> full(80, 60, 10, 10);

    const int x_off = 4, y_off = 8, z_off = 2, w_off = 4;
    const int x_size = 16, y_size = 16, z_size = 3, w_size = 3;

    buffer_t cropped = *full.raw_buffer();
    cropped.host = (uint8_t *)&(full(x_off, y_off, z_off, w_off));
    cropped.min[0] = 0;
    cropped.min[1] = 0;
    cropped.min[2] = 0;
    cropped.min[3] = 0;
    cropped.extent[0] = x_size;
    cropped.extent[1] = y_size;
    cropped.extent[2] = z_size;
    cropped.extent[3] = w_size;
    cropped.stride[0] *= 2;
    cropped.stride[1] *= 2;
    cropped.stride[2] *= 2;
    cropped.stride[3] *= 2;
    Buffer out(Int(32), &cropped);

    // Make a bitmask representing the region inside the crop.
    Image<bool> in_subregion(80, 60, 10, 10);
    Expr test = ((x >= x_off) && (x < x_off + x_size*2) &&
                 (y >= y_off) && (y < y_off + y_size*2) &&
                 (z >= z_off) && (z < z_off + z_size*2) &&
                 (w >= w_off) && (w < w_off + w_size*2) &&
                 (x % 2 == 0) &&
                 (y % 2 == 0) &&
                 (z % 2 == 0) &&
                 (w % 2 == 0));
    Func test_func;
    test_func(x, y, z, w) = test;
    test_func.realize(in_subregion);

    Func f;
    f(x, y, z, w) = 3*x + 2*y + z + 4*w;
    f.gpu_tile(x, y, 16, 16);
    f.output_buffer().set_stride(0, Expr());
    f.realize(out);

    // Put some data in the full host buffer, avoiding the region
    // being evaluated above.
    Expr change_out_of_subregion = select(test, undef<int>(), 4*x + 3*y + 2*z + w);
    lambda(x, y, z, w, change_out_of_subregion).realize(full);

    // Copy back the output subset from the GPU.
    out.copy_to_host();

    for (int w = 0; w < full.extent(3); ++w) {
        for (int z = 0; z < full.extent(2); ++z) {
            for (int y = 0; y < full.extent(1); ++y) {
                for (int x = 0; x < full.extent(0); ++x) {
                    int correct;
                    if (in_subregion(x, y, z, w)) {
                        int x_ = (x - x_off)/2;
                        int y_ = (y - y_off)/2;
                        int z_ = (z - z_off)/2;
                        int w_ = (w - w_off)/2;
                        correct = 3*x_ + 2*y_ + z_ + 4*w_;
                    } else {
                        correct = 4*x + 3*y + 2*z + w;
                    }
                    if (full(x, y, z, w) != correct) {
                        printf("Error! Incorrect value %i != %i at %i, %i, %i, %i\n", full(x, y, z, w), correct, x, y, z, w);
                        return -1;
                    }
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
Author: DoDNet, Project: Halide, Lines: 82, Source: gpu_non_contiguous_copy.cpp

Example 7: main

int main(int argc, char **argv) {

    Buffer<uint8_t> input(128, 64);

    for (int y = 0; y < input.height(); y++) {
        for (int x = 0; x < input.width(); x++) {
            input(x, y) = y*input.width() + x;
        }
    }

    Var x, y, xi, yi;
    {
        Func f;
        f(x, y) = select(((input(x, y) > 10) && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!(input(x, y) > 50))),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = ((input(x, y) > 10) && (input(x, y) < 20)) ||
                    ((input(x, y) > 40) && (!(input(x, y) > 50)));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition that uses a let resulting from common
    // subexpression elimination.
    {
        Func f;
        Expr common_cond = input(x, y) > 10;
        f(x, y) = select((common_cond && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!common_cond)),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool common_cond = input(x, y) > 10;
                bool cond = (common_cond && (input(x, y) < 20)) ||
                    ((input(x, y) > 40) && (!common_cond));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition which has vector and scalar inputs.
    {
        Func f("f");
        f(x, y) = select(x < 10 || x > 20 || y < 10 || y > 20, 0, input(x, y));

        Target target = get_jit_target_from_environment();

        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 128);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = x < 10 || x > 20 || y < 10 || y > 20;
                uint8_t correct = cond ? 0 : input(x,y);
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct);
                    return -1;
                }
//......... (part of the code is omitted here) .........
Author: adityaatluri, Project: Halide, Lines: 101, Source: logical.cpp

Example 8: main

int main(int argc, char **argv) {
    Target t(get_jit_target_from_environment());
    if (!t.has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    const int n_types = 9;

    Type types[] = {Int(8), Int(16), Int(32), Int(64),
                    UInt(8), UInt(16), UInt(32), UInt(64),
                    Float(32)};
    Func funcs[n_types];

    Var x;

    Func out;

    Type result_type;
    if (t.has_feature(Target::Metal)) {
        result_type = UInt(32);
    } else {
        result_type = UInt(64);
    }
    Expr e = cast(result_type, 0);
    int offset = 0;
    for (int i = 0; i < n_types; i++) {
        int off = 0;
        if ((types[i].is_int() || types[i].is_uint())) {
            // Metal does not support 64-bit integers.
            if (t.has_feature(Target::Metal) &&
                types[i].bits() >= 64) {
                continue;
            }

            if (types[i].bits() <= 64) {
                off = (1 << (types[i].bits() - 4)) + 17;
            }
        }
        offset += off;

        funcs[i](x) = cast(types[i], x/16 + off);
        e += cast(result_type, funcs[i](x));
        funcs[i].compute_at(out, Var::gpu_blocks()).gpu_threads(x);
    }


    out(x) = e;
    out.gpu_tile(x, 23);

    Buffer output = out.realize(23*5);

    int result;
    if (t.has_feature(Target::Metal)) {
        result = check_result<uint32_t>(output, n_types - 2, offset);
    } else {
        result = check_result<uint64_t>(output, n_types, offset);
    }
    if (result != 0) {
        return result;
    }

    printf("Success!\n");
    return 0;
}
Author: DoDNet, Project: Halide, Lines: 65, Source: gpu_mixed_shared_mem_types.cpp

Example 9: main


//......... (part of the code is omitted here) .........

    Expr maskOutHelp = 0;

    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            maskOutHelp = select((polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + 
                polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + 
                polynomial5(x, y)*kernel5(i, j)) == 0.0f, maskOutHelp, maskOutHelp | mask_bounded(x + i, y + j));
//            maskOutHelp = maskOutHelp | mask_bounded(x + i, y + j);    
        }
    }
    maskOut(x, y) = maskOutHelp;



    //Schedule
  //  blur.reorder(i_v, x, y);


//    kernel1.compute_at(blurImage, x);
//    kernel1.vectorize(x, 8);
//    kernel1.split(y, y0, yi, 4);
//    kernel1.parallel(y0);

/*    kernel1.compute_root();
    kernel2.compute_root();
    kernel3.compute_root();
    kernel4.compute_root();
    kernel5.compute_root();
*/
    //best schedule found:

#ifdef TESTING_GPU
        blurImage.gpu_tile(x, y, 16, 16);

        // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
        // not enabled by default. We have to construct a Target
        // object, enable one of them, and then pass that target
        // object to compile_jit. Otherwise your CPU will very slowly
        // pretend it's a GPU, and use one thread per output pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or CUDA.

        // We'll enable OpenCL here, because it tends to give better
        // performance than CUDA, even with NVidia's drivers, because
        // NVidia's open source LLVM backend doesn't seem to do all
        // the same optimizations their proprietary compiler does.
        target.set_feature(Target::OpenCL);

        // Uncomment the next line and comment out the line above to
        // try CUDA instead.
        // target.set_feature(Target::CUDA);

        // If you want to see all of the OpenCL or CUDA API calls done
        // by the pipeline, you can also enable the Debug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        // target.set_feature(Target::Debug);

        blurImage.compile_jit(target);
#else
Author: stanford-gfx, Project: astro, Lines: 67, Source: linearCombinationKernel.cpp

Example 10: schedule_for_gpu

    // Now a schedule that uses CUDA or OpenCL.
    void schedule_for_gpu() {
        // We make the decision about whether to use the GPU for each
        // Func independently. If you have one Func computed on the
        // CPU, and the next computed on the GPU, Halide will do the
        // copy-to-gpu under the hood. For this pipeline, there's no
        // reason to use the CPU for any of the stages. Halide will
        // copy the input image to the GPU the first time we run the
        // pipeline, and leave it there to reuse on subsequent runs.

        // As before, we'll compute the LUT once at the start of the
        // pipeline.
        lut.compute_root();

        // Let's compute the look-up-table using the GPU in 16-wide
        // one-dimensional thread blocks. First we split the index
        // into blocks of size 16:
        Var block, thread;
        lut.split(i, block, thread, 16);
        // Then we tell cuda that our Vars 'block' and 'thread'
        // correspond to CUDA's notions of blocks and threads, or
        // OpenCL's notions of thread groups and threads.
        lut.gpu_blocks(block)
           .gpu_threads(thread);

        // This is a very common scheduling pattern on the GPU, so
        // there's a shorthand for it:

        // lut.gpu_tile(i, block, thread, 16);

        // Func::gpu_tile behaves the same as Func::tile, except that
        // it also specifies that the tile coordinates correspond to
        // GPU blocks, and the coordinates within each tile correspond
        // to GPU threads.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Compute curved in 2D 8x8 tiles using the GPU.
        curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

        // This is equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);

        // We'll leave sharpen as inlined into curved.

        // Compute the padded input as needed per GPU block, storing
        // the intermediate result in shared memory. In the schedule
        // above xo corresponds to GPU blocks.
        padded.compute_at(curved, xo);

        // Use the GPU threads for the x and y coordinates of the
        // padded input.
        padded.gpu_threads(x, y);

        // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
        // Metal are not enabled by default. We have to construct a
        // Target object, enable one of them, and then pass that
        // target object to compile_jit. Otherwise your CPU will very
        // slowly pretend it's a GPU, and use one thread per output
        // pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or Metal, depending on which platform
        // we're on. OS X doesn't update its OpenCL drivers, so they
        // tend to be broken. CUDA would also be a fine choice on
        // machines with NVidia GPUs.
        if (target.os == Target::OSX) {
            target.set_feature(Target::Metal);
        } else {
            target.set_feature(Target::OpenCL);
        }

        // Uncomment the next line and comment out the lines above to
        // try CUDA instead.
        // target.set_feature(Target::CUDA);

        // If you want to see all of the OpenCL, Metal, or CUDA API
        // calls done by the pipeline, you can also enable the Debug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        // target.set_feature(Target::Debug);

        curved.compile_jit(target);
    }
Author: darkbuck, Project: Halide, Lines: 94, Source: lesson_12_using_the_gpu.cpp

Example 11: main

int main(int argc, char **argv) {
    Target t = get_jit_target_from_environment();

    if (!t.features_any_of({Target::CUDACapability50,
                            Target::CUDACapability61})) {
        printf("This test requires cuda enabled with cuda capability 5.0 or greater\n");
        return 0;
    }

    {
        // Shuffle test to do a small convolution
        Func f, g;
        Var x, y;

        f(x, y) = x + y;
        g(x, y) = f(x-1, y) + f(x+1, y);

        Var xo, xi, yi, yo;
        g.gpu_tile(x, y, xi, yi, 32, 2, TailStrategy::RoundUp).gpu_lanes(xi);
        f.compute_root();
        f.in(g).compute_at(g, yi).split(x, xo, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(xo);

        Buffer<int> out = g.realize(32, 4);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 2*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Broadcast test - an outer product access pattern
        Func a, b, c;
        Var x, y;
        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;

        c.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .gpu_lanes(xi);
        // We're going to be computing 'a' and 'b' at block level, but
        // we want them in register, not shared, so we explicitly call
        // store_in.
        a.in(c).compute_at(c, x)
            .gpu_lanes(x)
            .store_in(MemoryType::Register);
        b.in(c).compute_at(c, x)
            .gpu_lanes(y)
            .store_in(MemoryType::Register);

        Buffer<float> out = c.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Vectorized broadcast test. Each lane is responsible for a
        // 2-vector from 'a' and a 2-vector from 'b' instead of a single
        // value.
        Func a, b, c;
        Var x, y;
        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;

        c.tile(x, y, xi, yi, 64, 64, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .split(yi, yi, yii, 64).unroll(yii, 2).gpu_threads(yi)
            .vectorize(xi, 2).gpu_lanes(xi);
        a.in(c).compute_at(c, yi).vectorize(x, 2).gpu_lanes(x);
        b.in(c).compute_at(c, yi).vectorize(y, 2).gpu_lanes(y);

        Buffer<float> out = c.realize(64, 64);
//......... (part of the code is omitted here) .........
Author: jiapei100, Project: Halide, Lines: 101, Source: register_shuffle.cpp

Example 12: test

bool test(int vec_width, const Target &target) {
    if (!is_type_supported<A>(vec_width, target) || !is_type_supported<B>(vec_width, target)) {
        // Type not supported, return pass.
        return true;
    }

    int W = 1024;
    int H = 1;

    Buffer<A> input(W, H);
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            input(x, y) = (A)((rand()&0xffff)*0.1);
        }
    }

    Var x, y;
    Func f;

    f(x, y) = cast<B>(input(x, y));

    if (target.has_gpu_feature()) {
        Var xo, xi;
        f.gpu_tile(x, xo, xi, 64);
    } else {
        if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            // TODO: Non-native vector widths hang the compiler here.
            //f.hexagon();
        }
        if (vec_width > 1) {
            f.vectorize(x, vec_width);
        }
    }

    Buffer<B> output = f.realize(W, H);

    /*
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            printf("%d %d -> %d %d\n", x, y, (int)(input(x, y)), (int)(output(x, y)));
        }
    }
    */

    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {

            bool ok = ((B)(input(x, y)) == output(x, y));
            if (!ok) {
                fprintf(stderr, "%s x %d -> %s x %d failed\n",
                       string_of_type<A>(), vec_width,
                       string_of_type<B>(), vec_width);
                fprintf(stderr, "At %d %d, %f -> %f instead of %f\n",
                       x, y,
                       (double)(input(x, y)),
                       (double)(output(x, y)),
                       (double)((B)(input(x, y))));
                return false;
            }
        }
    }

    return true;
}
Author: kgnk, Project: Halide, Lines: 64, Source: vector_cast.cpp

Example 13: main

int main(int argc, char **argv) {
    const int W = 256, H = 256;

    Buffer<uint8_t> in(W, H);
    // Set up the input.
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            in(x, y) = rand() & 0xff;
        }
    }

    // Define a convolution kernel, and its sum.
    Buffer<int8_t> kernel(3, 3);
    kernel.set_min(-1, -1);
    for (int y = -1; y <= 1; y++) {
        for (int x = -1; x <= 1; x++) {
            kernel(x, y) = rand() % 8 - 4;
        }
    }

    Var x("x"), y("y"), xi("xi"), yi("yi");
    RDom r(-1, 3, -1, 3);

    // Boundary condition.
    Func input = BoundaryConditions::repeat_edge(in);
    input.compute_root();

    // Test a widening reduction, followed by a narrowing.
    {
        Func f;
        f(x, y) = u8_sat(sum(i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y)) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = f.realize(W, H, target);

        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y) << " instead of " << correct << "\n";
                    return -1;
                }
            }
        }
    }

    // Test a tuple reduction with widening, followed by narrowing the result.
    {
        Func f;
        f(x, y) = { i16(0), i8(0) };
        f(x, y) = {
            f(x, y)[0] + i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y),
            f(x, y)[1] + kernel(r.x, r.y),
        };

        Func g;
        g(x, y) = u8_sat((f(x, y)[0] + f(x, y)[1]) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            g.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            g.hexagon().vectorize(x, 128);
        } else {
            g.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = g.realize(W, H, target);

        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                        correct += kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y) << " instead of " << correct << "\n";
                    return -1;
//......... (part of the code is omitted here) .........
Author: adityaatluri, Project: Halide, Lines: 101, Source: widening_reduction.cpp


Note: The Func::gpu_tile method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the license of the corresponding project. Do not reproduce without permission.