This page collects typical usage examples of the C++ method Func::gpu_tile (from the Halide library). If you are wondering what Func::gpu_tile does, how to use it, or what real uses of it look like, the curated examples below may help. You can also explore further usage examples of the Func class that the method belongs to.
The following presents 13 code examples of Func::gpu_tile, sorted by popularity by default.
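Before diving into the examples, here is a minimal self-contained sketch of the most common pattern, assuming a recent Halide release with the explicit-Var gpu_tile overload (the Func, the Var names, and the sizes are illustrative, not taken from the examples below):

#include "Halide.h"
using namespace Halide;

int main() {
    // A trivial pipeline that we want to run on the GPU.
    Func f;
    Var x, y, xo, yo, xi, yi;
    f(x, y) = x + y;

    // Tile the output into 16x16 tiles: (xo, yo) index the tiles and
    // become GPU blocks, (xi, yi) index within a tile and become GPU
    // threads.
    f.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);

    // Only realize on the device if a GPU target is actually enabled;
    // otherwise the GPU schedule cannot run.
    Target t = get_jit_target_from_environment();
    if (t.has_gpu_feature()) {
        Buffer<int> out = f.realize(64, 64, t);
    }
    return 0;
}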
Example 1: main
int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (!target.has_feature(Target::OpenCL)) {
        printf("This test requires opencl.\n");
        return 0;
    }

    // These calls are only available for AOT-compiled code:
    //
    // halide_set_custom_get_symbol(my_get_symbol_impl);
    // halide_set_custom_load_library(my_load_library_impl);
    // halide_set_custom_get_library_symbol(my_get_library_symbol_impl);
    //
    // For JIT code, we must use JITSharedRuntime::set_default_handlers().

    Internal::JITHandlers handlers;
    handlers.custom_get_symbol = my_get_symbol_impl;
    handlers.custom_load_library = my_load_library_impl;
    handlers.custom_get_library_symbol = my_get_library_symbol_impl;
    Internal::JITSharedRuntime::set_default_handlers(handlers);

    Var x, y, xi, yi;
    Func f;
    f(x, y) = cast<int32_t>(x + y);
    f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::OpenCL);
    f.set_error_handler(my_error_handler);

    Buffer<int32_t> out = f.realize(64, 64, target);

    fprintf(stderr, "Should not get here.\n");
    return -1;
}
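The snippet above assumes my_get_symbol_impl, my_load_library_impl, my_get_library_symbol_impl, and my_error_handler are defined earlier in the same file. They are not shown, so here is a hedged reconstruction: the signatures follow Internal::JITHandlers and Func::set_error_handler, and the guess is that the hooks deliberately fail so that loading the OpenCL runtime errors out, the error handler fires inside realize(), and the "Should not get here" line is never reached (requires <cstdio> and <cstdlib>):

void *my_get_symbol_impl(const char *name) {
    printf("custom_get_symbol called: %s\n", name);
    return nullptr;  // deliberately fail the lookup
}

void *my_load_library_impl(const char *name) {
    printf("custom_load_library called: %s\n", name);
    return nullptr;  // deliberately fail the load
}

void *my_get_library_symbol_impl(void *lib, const char *name) {
    printf("custom_get_library_symbol called: %s\n", name);
    return nullptr;  // deliberately fail the lookup
}

void my_error_handler(void *user_context, const char *msg) {
    printf("Saw expected error: %s\n", msg);
    exit(0);  // erroring out before "Should not get here" means the test passed
}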
Example 2: main
int main(int argc, char **argv) {
    // Make sure that freeing GPU buffers doesn't occur before the
    // computation that is filling them completes.
    Func f;
    Var x, y;
    RDom r(0, 100);
    f(x, y) = sum(sqrt(sqrt(sqrt(sqrt(x + y + r)))));

    Target t = get_jit_target_from_environment();
    if (t.has_feature(Target::OpenCL) ||
        t.has_feature(Target::CUDA)) {
        f.gpu_tile(x, y, 16, 16);

        // This allocates a buffer, does gpu compute into it, and then
        // frees it (calling dev_free) possibly before the compute is
        // done.
        for (int i = 0; i < 10; i++) {
            f.realize(1024, 1024, t);
        }
    } else {
        // Skip this test if gpu target not enabled (it's pretty slow on a cpu).
    }

    printf("Success!\n");
    return 0;
}
Example 3: schedule_for_gpu
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:

    // lut.gpu_tile(i, 16);

    // The Func::gpu_tile method is similar to Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing the
    // intermediate result in shared memory. Var::gpu_blocks and
    // Var::gpu_threads exist to help you schedule producers within
    // GPU threads and blocks.
    padded.compute_at(curved, Var::gpu_blocks());

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.

    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.features |= Target::OpenCL;

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.features |= Target::CUDA;

    // If you want to see all of the OpenCL or CUDA API calls done
    // by the pipeline, you can also enable the GPUDebug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.features |= Target::GPUDebug;

    curved.compile_jit(target);
}
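Note that this example was written against an older Halide API: gpu_tile invents the inner block/thread Vars for you, and Target::features is a writable bitmask. In newer releases the Vars must be named explicitly and features are set via set_feature, as Example 10 below shows. A sketch of the same 8x8 schedule in the newer style (a fragment, reusing curved, x, and y from above):

Var xo, yo, xi, yi;
curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);
// ...which is still equivalent to:
// curved.tile(x, y, xo, yo, xi, yi, 8, 8)
//       .gpu_blocks(xo, yo)
//       .gpu_threads(xi, yi);

Target target = get_host_target();
target.set_feature(Target::OpenCL);  // replaces target.features |= Target::OpenCL
curved.compile_jit(target);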
Example 4: main
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    {
        Func f;
        Var x, y, z;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu.

        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        g.gpu_tile(x, y, 16, 8);
        f.compute_at(g, Var::gpu_blocks());

        for (int i = 0; i < passes; i++) {
            f.update(i*4 + 0).gpu_threads(y);
            f.update(i*4 + 1).gpu_threads(y);
            f.update(i*4 + 2).gpu_threads(x);
            f.update(i*4 + 3).gpu_threads(x);
        }

        Image<int> out = g.realize(100, 100);

        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 7*100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return -1;
                }
            }
        }
    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.
        Func f;
        Var x, y, z;
        f(x, y) = undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);

        g.gpu_tile(x, y, 8, 8);
        f.compute_at(g, Var::gpu_blocks());
        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be two thread barriers: one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(2));

        Image<int> out = g.realize(100, 100);
    }

    printf("Success!\n");
    return 0;
}
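CheckBarrierCount is not defined in this snippet; it is a custom lowering pass handed to Func::add_custom_lowering_pass, which takes ownership of an Internal::IRMutator. A hedged sketch of what such a pass might look like, assuming the old-style IRMutator interface of this Halide vintage and that GPU barriers appear in the IR as calls named "halide_gpu_thread_barrier":

class CheckBarrierCount : public Internal::IRMutator {
    int correct;           // how many barriers we expect to find
    int barriers_seen = 0;
    int depth = 0;         // mutate() recurses; only check at the root

    using Internal::IRMutator::visit;
    void visit(const Internal::Call *op) {
        if (op->name == "halide_gpu_thread_barrier") {
            barriers_seen++;
        }
        Internal::IRMutator::visit(op);
    }

public:
    using Internal::IRMutator::mutate;
    Stmt mutate(Stmt s) {
        depth++;
        s = Internal::IRMutator::mutate(s);
        depth--;
        // The pass is invoked once on the whole lowered Stmt, so the
        // count is final when the outermost call unwinds.
        if (depth == 0 && barriers_seen != correct) {
            printf("There were %d barriers instead of %d\n",
                   barriers_seen, correct);
            exit(-1);
        }
        return s;
    }

    CheckBarrierCount(int correct) : correct(correct) {}
};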
Example 5: main
int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();

    if (1) {
        // Test a tuple reduction on the gpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);

        // Updates to a reduction are atomic.
        f(x, y) = Tuple(f(x, y)[1]*2, f(x, y)[0]*2);
        // now equals ((x - y)*2, (x + y)*2)

        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
            f.update().gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
            f.update().hexagon(y).vectorize(x, 32);
        }

        Realization result = f.realize(1024, 1024);

        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x - y)*2;
                int correct_b = (x + y)*2;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // Now test one that alternates between cpu and gpu per update step
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);
        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the pure step and the odd update steps on the gpu
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
        }
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            } else {
                f.update(i);
            }
        }

        Realization result = f.realize(1024, 1024);

        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x + y) + 10;
                int correct_b = (x - y) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // Same as above, but switches which steps are gpu and cpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);
        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the even update steps on the gpu
//......... (some code omitted here) .........
Example 6: main
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("No gpu target enabled. Skipping test.\n");
        return 0;
    }

    Var x, y, z, w;

    Image<int> full(80, 60, 10, 10);
    const int x_off = 4, y_off = 8, z_off = 2, w_off = 4;
    const int x_size = 16, y_size = 16, z_size = 3, w_size = 3;

    buffer_t cropped = *full.raw_buffer();
    cropped.host = (uint8_t *)&(full(x_off, y_off, z_off, w_off));
    cropped.min[0] = 0;
    cropped.min[1] = 0;
    cropped.min[2] = 0;
    cropped.min[3] = 0;
    cropped.extent[0] = x_size;
    cropped.extent[1] = y_size;
    cropped.extent[2] = z_size;
    cropped.extent[3] = w_size;
    cropped.stride[0] *= 2;
    cropped.stride[1] *= 2;
    cropped.stride[2] *= 2;
    cropped.stride[3] *= 2;
    Buffer out(Int(32), &cropped);

    // Make a bitmask representing the region inside the crop.
    Image<bool> in_subregion(80, 60, 10, 10);
    Expr test = ((x >= x_off) && (x < x_off + x_size*2) &&
                 (y >= y_off) && (y < y_off + y_size*2) &&
                 (z >= z_off) && (z < z_off + z_size*2) &&
                 (w >= w_off) && (w < w_off + w_size*2) &&
                 (x % 2 == 0) &&
                 (y % 2 == 0) &&
                 (z % 2 == 0) &&
                 (w % 2 == 0));
    Func test_func;
    test_func(x, y, z, w) = test;
    test_func.realize(in_subregion);

    Func f;
    f(x, y, z, w) = 3*x + 2*y + z + 4*w;
    f.gpu_tile(x, y, 16, 16);
    f.output_buffer().set_stride(0, Expr());
    f.realize(out);

    // Put some data in the full host buffer, avoiding the region
    // being evaluated above.
    Expr change_out_of_subregion = select(test, undef<int>(), 4*x + 3*y + 2*z + w);
    lambda(x, y, z, w, change_out_of_subregion).realize(full);

    // Copy back the output subset from the GPU.
    out.copy_to_host();

    for (int w = 0; w < full.extent(3); ++w) {
        for (int z = 0; z < full.extent(2); ++z) {
            for (int y = 0; y < full.extent(1); ++y) {
                for (int x = 0; x < full.extent(0); ++x) {
                    int correct;
                    if (in_subregion(x, y, z, w)) {
                        int x_ = (x - x_off)/2;
                        int y_ = (y - y_off)/2;
                        int z_ = (z - z_off)/2;
                        int w_ = (w - w_off)/2;
                        correct = 3*x_ + 2*y_ + z_ + 4*w_;
                    } else {
                        correct = 4*x + 3*y + 2*z + w;
                    }
                    if (full(x, y, z, w) != correct) {
                        printf("Error! Incorrect value %i != %i at %i, %i, %i, %i\n",
                               full(x, y, z, w), correct, x, y, z, w);
                        return -1;
                    }
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
Example 7: main
int main(int argc, char **argv) {
    Buffer<uint8_t> input(128, 64);
    for (int y = 0; y < input.height(); y++) {
        for (int x = 0; x < input.width(); x++) {
            input(x, y) = y*input.width() + x;
        }
    }

    Var x, y, xi, yi;

    {
        Func f;
        f(x, y) = select(((input(x, y) > 10) && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!(input(x, y) > 50))),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = ((input(x, y) > 10) && (input(x, y) < 20)) ||
                            ((input(x, y) > 40) && (!(input(x, y) > 50)));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition that uses a let resulting from common
    // subexpression elimination.
    {
        Func f;
        Expr common_cond = input(x, y) > 10;
        f(x, y) = select((common_cond && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!common_cond)),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool common_cond = input(x, y) > 10;
                bool cond = (common_cond && (input(x, y) < 20)) ||
                            ((input(x, y) > 40) && (!common_cond));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition which has vector and scalar inputs.
    {
        Func f("f");
        f(x, y) = select(x < 10 || x > 20 || y < 10 || y > 20, 0, input(x, y));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 128);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = x < 10 || x > 20 || y < 10 || y > 20;
                uint8_t correct = cond ? 0 : input(x, y);
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct);
                    return -1;
                }
//......... (some code omitted here) .........
Example 8: main
int main(int argc, char **argv) {
    Target t(get_jit_target_from_environment());
    if (!t.has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    const int n_types = 9;
    Type types[] = {Int(8), Int(16), Int(32), Int(64),
                    UInt(8), UInt(16), UInt(32), UInt(64),
                    Float(32)};
    Func funcs[n_types];

    Var x;
    Func out;

    Type result_type;
    if (t.has_feature(Target::Metal)) {
        result_type = UInt(32);
    } else {
        result_type = UInt(64);
    }

    Expr e = cast(result_type, 0);
    int offset = 0;
    for (int i = 0; i < n_types; i++) {
        int off = 0;
        if ((types[i].is_int() || types[i].is_uint())) {
            // Metal does not support 64-bit integers.
            if (t.has_feature(Target::Metal) &&
                types[i].bits() >= 64) {
                continue;
            }
            if (types[i].bits() <= 32) {
                // Shifting a 32-bit int by bits()-4 would be undefined
                // behavior for the 64-bit types, so they keep off == 0.
                // The check stays consistent because 'offset' below
                // accumulates the same 'off' values.
                off = (1 << (types[i].bits() - 4)) + 17;
            }
        }
        offset += off;
        funcs[i](x) = cast(types[i], x/16 + off);
        e += cast(result_type, funcs[i](x));
        funcs[i].compute_at(out, Var::gpu_blocks()).gpu_threads(x);
    }

    out(x) = e;
    out.gpu_tile(x, 23);

    Buffer output = out.realize(23*5);

    int result;
    if (t.has_feature(Target::Metal)) {
        result = check_result<uint32_t>(output, n_types - 2, offset);
    } else {
        result = check_result<uint64_t>(output, n_types, offset);
    }
    if (result != 0) {
        return result;
    }

    printf("Success!\n");
    return 0;
}
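check_result is not shown. From the loop above, each Func that was actually included contributes x/16 + off to lane x, so lane x of the output should equal n_types_summed * (x/16) + offset, computed with the (possibly wrapping) semantics of the result type. A hedged sketch, assuming this era's untyped Buffer converts to a typed Image<T>:

template<typename T>
int check_result(Buffer buf, int n_types_summed, int offset) {
    Image<T> output = buf;  // old-API conversion from untyped Buffer
    for (int x = 0; x < output.width(); x++) {
        T correct = (T)(n_types_summed * (x / 16) + offset);
        if (output(x) != correct) {
            printf("output(%d) = %lld instead of %lld\n", x,
                   (long long)output(x), (long long)correct);
            return -1;
        }
    }
    return 0;
}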
Example 9: main
//......... (some code omitted here) .........
Expr maskOutHelp = 0;
for (int i = -boundingBox; i <= boundingBox; i++) {
    for (int j = -boundingBox; j <= boundingBox; j++) {
        maskOutHelp = select((polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) +
                              polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) +
                              polynomial5(x, y)*kernel5(i, j)) == 0.0f,
                             maskOutHelp, maskOutHelp | mask_bounded(x + i, y + j));
        // maskOutHelp = maskOutHelp | mask_bounded(x + i, y + j);
    }
}
maskOut(x, y) = maskOutHelp;

// Schedule
// blur.reorder(i_v, x, y);

// kernel1.compute_at(blurImage, x);
// kernel1.vectorize(x, 8);
// kernel1.split(y, y0, yi, 4);
// kernel1.parallel(y0);

/* kernel1.compute_root();
   kernel2.compute_root();
   kernel3.compute_root();
   kernel4.compute_root();
   kernel5.compute_root();
*/

// best schedule found:
#ifdef TESTING_GPU
blurImage.gpu_tile(x, y, 16, 16);

// JIT-compile the pipeline for the GPU. CUDA or OpenCL are
// not enabled by default. We have to construct a Target
// object, enable one of them, and then pass that target
// object to compile_jit. Otherwise your CPU will very slowly
// pretend it's a GPU, and use one thread per output pixel.

// Start with a target suitable for the machine you're running
// this on.
Target target = get_host_target();

// Then enable OpenCL or CUDA.

// We'll enable OpenCL here, because it tends to give better
// performance than CUDA, even with NVidia's drivers, because
// NVidia's open source LLVM backend doesn't seem to do all
// the same optimizations their proprietary compiler does.
target.set_feature(Target::OpenCL);

// Uncomment the next line and comment out the line above to
// try CUDA instead.
// target.set_feature(Target::CUDA);

// If you want to see all of the OpenCL or CUDA API calls done
// by the pipeline, you can also enable the Debug
// flag. This is helpful for figuring out which stages are
// slow, or when CPU -> GPU copies happen. It hurts
// performance though, so we'll leave it commented out.
// target.set_feature(Target::Debug);

blurImage.compile_jit(target);
#else
Example 10: schedule_for_gpu
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:

    // lut.gpu_tile(i, block, thread, 16);

    // Func::gpu_tile behaves the same as Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing
    // the intermediate result in shared memory. In the schedule
    // above xo corresponds to GPU blocks.
    padded.compute_at(curved, xo);

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
    // Metal are not enabled by default. We have to construct a
    // Target object, enable one of them, and then pass that
    // target object to compile_jit. Otherwise your CPU will very
    // slowly pretend it's a GPU, and use one thread per output
    // pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or Metal, depending on which platform
    // we're on. OS X doesn't update its OpenCL drivers, so they
    // tend to be broken. CUDA would also be a fine choice on
    // machines with NVidia GPUs.
    if (target.os == Target::OSX) {
        target.set_feature(Target::Metal);
    } else {
        target.set_feature(Target::OpenCL);
    }

    // Uncomment the next line and comment out the lines above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL, Metal, or CUDA API
    // calls done by the pipeline, you can also enable the Debug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.set_feature(Target::Debug);

    curved.compile_jit(target);
}
Example 11: main
int main(int argc, char **argv) {
    Target t = get_jit_target_from_environment();
    if (!t.features_any_of({Target::CUDACapability50,
                            Target::CUDACapability61})) {
        printf("This test requires cuda enabled with cuda capability 5.0 or greater\n");
        return 0;
    }

    {
        // Shuffle test to do a small convolution
        Func f, g;
        Var x, y;

        f(x, y) = x + y;
        g(x, y) = f(x-1, y) + f(x+1, y);

        Var xo, xi, yi, yo;
        g.gpu_tile(x, y, xi, yi, 32, 2, TailStrategy::RoundUp).gpu_lanes(xi);
        f.compute_root();
        f.in(g).compute_at(g, yi).split(x, xo, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(xo);

        Buffer<int> out = g.realize(32, 4);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 2*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Broadcast test - an outer product access pattern
        Func a, b, c;
        Var x, y;

        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
         .gpu_blocks(x, y)
         .gpu_lanes(xi);

        // We're going to be computing 'a' and 'b' at block level, but
        // we want them in register, not shared, so we explicitly call
        // store_in.
        a.in(c).compute_at(c, x)
         .gpu_lanes(x)
         .store_in(MemoryType::Register);
        b.in(c).compute_at(c, x)
         .gpu_lanes(y)
         .store_in(MemoryType::Register);

        Buffer<float> out = c.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Vectorized broadcast test. Each lane is responsible for a
        // 2-vector from 'a' and a 2-vector from 'b' instead of a single
        // value.
        Func a, b, c;
        Var x, y;

        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 64, 64, TailStrategy::RoundUp)
         .gpu_blocks(x, y)
         .split(yi, yi, yii, 64).unroll(yii, 2).gpu_threads(yi)
         .vectorize(xi, 2).gpu_lanes(xi);

        a.in(c).compute_at(c, yi).vectorize(x, 2).gpu_lanes(x);
        b.in(c).compute_at(c, yi).vectorize(y, 2).gpu_lanes(y);

        Buffer<float> out = c.realize(64, 64);
//......... (some code omitted here) .........
Example 12: test
template<typename A, typename B>
bool test(int vec_width, const Target &target) {
    if (!is_type_supported<A>(vec_width, target) || !is_type_supported<B>(vec_width, target)) {
        // Type not supported, return pass.
        return true;
    }

    int W = 1024;
    int H = 1;

    Buffer<A> input(W, H);
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            input(x, y) = (A)((rand() & 0xffff)*0.1);
        }
    }

    Var x, y;
    Func f;
    f(x, y) = cast<B>(input(x, y));

    if (target.has_gpu_feature()) {
        Var xo, xi;
        f.gpu_tile(x, xo, xi, 64);
    } else {
        if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            // TODO: Non-native vector widths hang the compiler here.
            //f.hexagon();
        }
        if (vec_width > 1) {
            f.vectorize(x, vec_width);
        }
    }

    Buffer<B> output = f.realize(W, H);

    /*
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            printf("%d %d -> %d %d\n", x, y, (int)(input(x, y)), (int)(output(x, y)));
        }
    }
    */

    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            bool ok = ((B)(input(x, y)) == output(x, y));
            if (!ok) {
                fprintf(stderr, "%s x %d -> %s x %d failed\n",
                        string_of_type<A>(), vec_width,
                        string_of_type<B>(), vec_width);
                fprintf(stderr, "At %d %d, %f -> %f instead of %f\n",
                        x, y,
                        (double)(input(x, y)),
                        (double)(output(x, y)),
                        (double)((B)(input(x, y))));
                return false;
            }
        }
    }

    return true;
}
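A harness like this is meant to be instantiated over many type pairs and vector widths. A hedged sketch of a driver (the specific pairs and widths are illustrative; string_of_type and is_type_supported are assumed defined elsewhere, as above):

int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    bool ok = true;
    // Exercise a few representative widening/narrowing casts at
    // several vector widths.
    for (int w : {1, 2, 4, 8}) {
        ok = ok && test<uint8_t, uint16_t>(w, target);
        ok = ok && test<uint16_t, uint8_t>(w, target);
        ok = ok && test<int32_t, float>(w, target);
        ok = ok && test<float, int32_t>(w, target);
    }
    if (!ok) return -1;
    printf("Success!\n");
    return 0;
}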
Example 13: main
int main(int argc, char **argv) {
    const int W = 256, H = 256;

    Buffer<uint8_t> in(W, H);
    // Set up the input.
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            in(x, y) = rand() & 0xff;
        }
    }

    // Define a convolution kernel, and its sum.
    Buffer<int8_t> kernel(3, 3);
    kernel.set_min(-1, -1);
    for (int y = -1; y <= 1; y++) {
        for (int x = -1; x <= 1; x++) {
            kernel(x, y) = rand() % 8 - 4;
        }
    }

    Var x("x"), y("y"), xi("xi"), yi("yi");
    RDom r(-1, 3, -1, 3);

    // Boundary condition.
    Func input = BoundaryConditions::repeat_edge(in);
    input.compute_root();

    // Test a widening reduction, followed by a narrowing.
    {
        Func f;
        f(x, y) = u8_sat(sum(i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y)) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = f.realize(W, H, target);
        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y) << " instead of " << correct << "\n";
                    return -1;
                }
            }
        }
    }

    // Test a tuple reduction with widening, followed by narrowing the result.
    {
        Func f;
        f(x, y) = { i16(0), i8(0) };
        f(x, y) = {
            f(x, y)[0] + i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y),
            f(x, y)[1] + kernel(r.x, r.y),
        };

        Func g;
        g(x, y) = u8_sat((f(x, y)[0] + f(x, y)[1]) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            g.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            g.hexagon().vectorize(x, 128);
        } else {
            g.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = g.realize(W, H, target);
        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                        correct += kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y) << " instead of " << correct << "\n";
                    return -1;
//......... (some code omitted here) .........