This article collects typical usage examples of the C++ method Func::gpu_threads. If you are wondering what Func::gpu_threads does in C++, how to call it, or how it is used in practice, the curated code samples below may help. You can also explore further uses of the Func class to which this method belongs.
A total of 3 code examples of the Func::gpu_threads method are shown below, sorted by popularity by default.
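Before the examples, here is a minimal orientation sketch of the pattern gpu_threads participates in: split a loop, map the outer variable to GPU blocks and the inner one to GPU threads, then compile for a GPU API. The Func name, split factor, and choice of OpenCL are illustrative assumptions, and the Target::set_feature call follows the newer-style API used in Example 3; none of these details are taken from the examples themselves.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f;
    Var x, xo, xi;
    f(x) = x * 2;

    // Split the loop, then map the outer variable to GPU blocks and
    // the inner variable to GPU threads (the pattern that
    // Func::gpu_tile abbreviates).
    f.split(x, xo, xi, 16)
     .gpu_blocks(xo)
     .gpu_threads(xi);

    // Enable a GPU API on the host target and JIT-compile.
    Target t = get_host_target();
    t.set_feature(Target::OpenCL);  // or Target::CUDA / Target::Metal
    f.compile_jit(t);
    return 0;
}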
Example 1: main
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    {
        Func f;
        Var x, y, z;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu.

        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        g.gpu_tile(x, y, 16, 8);
        f.compute_at(g, Var::gpu_blocks());

        for (int i = 0; i < passes; i++) {
            f.update(i*4 + 0).gpu_threads(y);
            f.update(i*4 + 1).gpu_threads(y);
            f.update(i*4 + 2).gpu_threads(x);
            f.update(i*4 + 3).gpu_threads(x);
        }

        Image<int> out = g.realize(100, 100);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 7*100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return -1;
                }
            }
        }
    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.
        Func f;
        Var x, y, z;
        f(x, y) = undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);

        g.gpu_tile(x, y, 8, 8);
        f.compute_at(g, Var::gpu_blocks());
        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be two thread barriers: one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(2));

        Image<int> out = g.realize(100, 100);
    }

    printf("Success!\n");
    return 0;
}
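Distilled from Example 1, the core pattern is: compute a multi-stage producer once per GPU block, then map each of its definitions onto the block's threads so Halide can insert thread barriers between the stages. The sketch below assumes the same Halide API version as the example above (old-style gpu_tile and Var::gpu_blocks()); the Func names and the trivial update definition are illustrative.

Func producer, consumer;
Var x, y;

// A pure definition plus one update definition.
producer(x, y) = x + 100 * y;
producer(x, y) += 1;

consumer(x, y) = producer(x, y) * 2;

// One GPU block per 8x8 tile of the consumer.
consumer.gpu_tile(x, y, 8, 8);

// Compute the producer once per block and run both of its stages
// across the block's threads; Halide inserts thread barriers between
// the stages as needed.
producer.compute_at(consumer, Var::gpu_blocks());
producer.gpu_threads(x, y);
producer.update(0).gpu_threads(x, y);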
Example 2: schedule_for_gpu
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:
    // lut.gpu_tile(i, 16);

    // The Func::gpu_tile method is similar to Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing the
    // intermediate result in shared memory. Var::gpu_blocks and
    // Var::gpu_threads exist to help you schedule producers within
    // GPU threads and blocks.
    padded.compute_at(curved, Var::gpu_blocks());

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.
    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.features |= Target::OpenCL;

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.features |= Target::CUDA;

    // If you want to see all of the OpenCL or CUDA API calls done
    // by the pipeline, you can also enable the GPUDebug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.features |= Target::GPUDebug;

    curved.compile_jit(target);
}
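For reference, the shorthand forms mentioned in the comments let the whole schedule above be written more compactly. This is just a sketch under the same older Halide API used in the example (where gpu_tile takes only the split factors):

lut.compute_root()
   .gpu_tile(i, 16);

curved.reorder(c, x, y)
      .bound(c, 0, 3)
      .unroll(c)
      .gpu_tile(x, y, 8, 8);

padded.compute_at(curved, Var::gpu_blocks())
      .gpu_threads(x, y);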
Example 3: schedule_for_gpu
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:
    // lut.gpu_tile(i, block, thread, 16);

    // Func::gpu_tile behaves the same as Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing
    // the intermediate result in shared memory. In the schedule
    // above xo corresponds to GPU blocks.
    padded.compute_at(curved, xo);

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
    // Metal are not enabled by default. We have to construct a
    // Target object, enable one of them, and then pass that
    // target object to compile_jit. Otherwise your CPU will very
    // slowly pretend it's a GPU, and use one thread per output
    // pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or Metal, depending on which platform
    // we're on. OS X doesn't update its OpenCL drivers, so they
    // tend to be broken. CUDA would also be a fine choice on
    // machines with NVidia GPUs.
    if (target.os == Target::OSX) {
        target.set_feature(Target::Metal);
    } else {
        target.set_feature(Target::OpenCL);
    }

    // Uncomment the next line and comment out the lines above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL, Metal, or CUDA API
    // calls done by the pipeline, you can also enable the Debug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.set_feature(Target::Debug);

    curved.compile_jit(target);
}
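The same condensed form applies to this newer API version, where gpu_tile names the block and thread Vars explicitly and the producer is computed at the block variable xo. This is only a sketch and assumes the Var declarations (xo, yo, xi, yi, and so on) from the surrounding example:

Var block, thread;
lut.compute_root()
   .gpu_tile(i, block, thread, 16);

curved.reorder(c, x, y)
      .bound(c, 0, 3)
      .unroll(c)
      .gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

padded.compute_at(curved, xo)
      .gpu_threads(x, y);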