This page collects typical usage examples of the C++ method Func::compile_jit (from the Halide library). If you are wondering what Func::compile_jit does, how to call it, or what real uses of it look like, the curated examples below may help. You can also explore further usage examples of the Func class that this method belongs to.
Fifteen code examples of Func::compile_jit are shown below, ordered by popularity.
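Before the examples, here is a minimal, self-contained sketch of compile_jit in the simplest case. Calling it is optional: realize() will trigger JIT compilation on first use anyway, but calling compile_jit() up front moves the one-time compilation cost out of the first realize() call.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f;
    Var x, y;
    f(x, y) = x + y;

    // Eagerly JIT-compile for the host target. Without this,
    // the first realize() below would compile the pipeline itself.
    f.compile_jit();

    Buffer<int> out = f.realize(16, 16);
    return 0;
}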
Example 1: main
int main(int argc, char **argv) {
    Var x, y;
    Func f;
    f(x, y) = x + y;

    // Dig out the raw function pointer so we can use it as if we were
    // compiling statically.
    void (*function)(buffer_t *) = (void (*)(buffer_t *))(f.compile_jit());

    buffer_t out;
    memset(&out, 0, sizeof(out));
    out.host = (uint8_t *)malloc(10 * 10);
    out.elem_size = 1; // deliberately wrong: should be 4 for an int32 output
    out.extent[0] = 10;
    out.extent[1] = 10;
    out.stride[0] = 1;
    out.stride[1] = 10;

    f.set_error_handler(&halide_error);
    error_occurred = false;
    function(&out);

    if (error_occurred) {
        printf("Success!\n");
        return 0;
    } else {
        printf("There should have been a runtime error\n");
        return -1;
    }
}
Example 2: schedule_for_cpu
// Now we define methods that give our pipeline several different
// schedules.
void schedule_for_cpu() {
    // Compute the look-up-table ahead of time.
    lut.compute_root();

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Look-up-tables don't vectorize well, so just parallelize
    // curved in slices of 16 scanlines.
    Var yo, yi;
    curved.split(y, yo, yi, 16)
          .parallel(yo);

    // Compute sharpen as needed per scanline of curved.
    sharpen.compute_at(curved, yi);

    // Vectorize the sharpen. It's 16-bit so we'll vectorize it 8-wide.
    sharpen.vectorize(x, 8);

    // Compute the padded input as needed per scanline of curved,
    // reusing previous values computed within the same strip of
    // 16 scanlines.
    padded.store_at(curved, yo)
          .compute_at(curved, yi);

    // Also vectorize the padding. It's 8-bit, so we'll vectorize it
    // 16-wide.
    padded.vectorize(x, 16);

    // JIT-compile the pipeline for the CPU.
    curved.compile_jit();
}
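For context, schedule_for_cpu() refers to Funcs (lut, padded, sharpen, curved) defined elsewhere in the pipeline class. Below is a rough sketch of definitions consistent with this schedule, modeled on the Halide GPU tutorial this excerpt resembles; the exact expressions are assumptions, not part of the original listing.

ImageParam input(UInt(8), 3);
Var x, y, c, i;
Func lut, padded, padded16, sharpen, curved;

// A gamma curve as a look-up table.
lut(i) = cast<uint8_t>(clamp(pow(i / 255.0f, 1.2f) * 255.0f, 0, 255));

// Pad the input by clamping the coordinates at the edges.
padded(x, y, c) = input(clamp(x, 0, input.width() - 1),
                        clamp(y, 0, input.height() - 1), c);

// Widen to 16-bit so the sharpen below can't overflow.
padded16(x, y, c) = cast<uint16_t>(padded(x, y, c));

// A simple sharpening filter (16-bit, matching the 8-wide vectorize).
sharpen(x, y, c) = (padded16(x, y, c) * 2 -
                    (padded16(x - 1, y, c) + padded16(x, y - 1, c) +
                     padded16(x + 1, y, c) + padded16(x, y + 1, c)) / 4);

// Map the sharpened result through the look-up table.
curved(x, y, c) = lut(sharpen(x, y, c));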
Example 3: test
double test(Func f, bool test_correctness = true) {
    f.compile_to_assembly(f.name() + ".s", Internal::vec<Argument>(input), f.name());
    f.compile_jit();
    f.realize(output);

    if (test_correctness) {
        for (int y = 0; y < output.height(); y++) {
            for (int x = 0; x < output.width(); x++) {
                int ix1 = std::max(std::min(x, MAX), MIN);
                int ix2 = std::max(std::min(x + 1, MAX), MIN);
                uint16_t correct = input(ix1, y) * 3 + input(ix2, y);
                if (output(x, y) != correct) {
                    printf("output(%d, %d) = %d instead of %d\n",
                           x, y, output(x, y), correct);
                    exit(-1);
                }
            }
        }
    }

    double t1 = currentTime();
    for (int i = 0; i < 10; i++) {
        f.realize(output);
    }
    return currentTime() - t1;
}
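A hypothetical caller for test(), assuming the globals input, output, MIN, and MAX it references are initialized elsewhere. The Func passed in must compute the clamped expression that the correctness check expects:

Var x, y;
Func g;
// Must match the reference computation inside test().
g(x, y) = cast<uint16_t>(input(clamp(x, MIN, MAX), y) * 3 +
                         input(clamp(x + 1, MIN, MAX), y));
g.vectorize(x, 8);
double ms = test(g);
printf("10 realizations took %f ms\n", ms);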
Example 4: main
int main(int argc, char **argv) {
    Buffer<int> input(100, 50);
    // This image represents the range [100, 199] x [50, 99].
    input.set_min(100, 50);
    input(100, 50) = 123;
    input(198, 99) = 234;

    Func f;
    Var x, y;
    f(x, y) = input(2 * x, y / 2);
    f.compile_jit();

    // The output will represent the range [50, 99] x [100, 199].
    Buffer<int> result(50, 100);
    result.set_min(50, 100);
    f.realize(result);

    if (result(50, 100) != 123 || result(99, 199) != 234) {
        fprintf(stderr, "Err: f(50, 100) = %d (supposed to be 123)\n"
                "f(99, 199) = %d (supposed to be 234)\n",
                result(50, 100), result(99, 199));
        return -1;
    }

    printf("Success!\n");
    return 0;
}
Example 5: main
int main(int argc, char **argv) {
    ImageParam input(UInt(8), 1);
    input.dim(0).set_bounds(0, size);

    {
        Func f;
        Var x;
        f(x) = input(x);
        // Output must have the same size as the input.
        f.output_buffer().dim(0).set_bounds(input.dim(0).min(), input.dim(0).extent());
        f.add_custom_lowering_pass(new Validator);
        f.compile_jit();

        Buffer<uint8_t> dummy(size);
        dummy.fill(42);
        input.set(dummy);
        Buffer<uint8_t> out = f.realize(size);
        if (!out.all_equal(42)) {
            std::cerr << "wrong output" << std::endl;
            exit(-1);
        }
    }

    {
        Func f;
        Var x;
        f(x) = undef(UInt(8));
        RDom r(input);
        f(r.x) = cast<uint8_t>(42);
        f.add_custom_lowering_pass(new Validator);
        f.compile_jit();

        Buffer<uint8_t> dummy(size);
        input.set(dummy);
        Buffer<uint8_t> out = f.realize(size);
        if (!out.all_equal(42)) {
            std::cerr << "wrong output" << std::endl;
            exit(-1);
        }
    }

    std::cout << "Success!" << std::endl;
    return 0;
}
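The Validator passed to add_custom_lowering_pass above is defined elsewhere in the test. As a hypothetical sketch: a custom lowering pass is an Internal::IRMutator that Halide runs over the lowered statement just before code generation (the exact IRMutator interface has varied across Halide versions, so treat this as a shape, not a definitive implementation).

class Validator : public Halide::Internal::IRMutator {
public:
    using Halide::Internal::IRMutator::mutate;
    Halide::Internal::Stmt mutate(const Halide::Internal::Stmt &s) override {
        // Inspect or rewrite the lowered IR here; this sketch just
        // recurses without changing anything.
        return Halide::Internal::IRMutator::mutate(s);
    }
};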
Example 6: main
int main(int argc, char **argv) {
    ImageParam src(UInt(8), 1);
    Func dst;
    Var x;
    dst(x) = src(x);

    Var xo;
    dst.split(x, xo, x, 8 * 4096);
    // dst.parallel(xo); speeds up Halide's memcpy considerably, but doesn't seem sporting
    dst.vectorize(x, 16);

    dst.compile_to_assembly("memcpy.s", {src}, "memcpy");
    dst.compile_jit();

    const int32_t buffer_size = 12345678;
    const int iterations = 50;

    Image<uint8_t> input(buffer_size);
    Image<uint8_t> output(buffer_size);
    src.set(input);

    // Get past one-time set-up issues for the ptx backend.
    dst.realize(output);

    double halide = 0, system = 0;
    for (int i = 0; i < iterations; i++) {
        double t1 = current_time();
        dst.realize(output);
        dst.realize(output);
        dst.realize(output);
        double t2 = current_time();
        memcpy(output.data(), input.data(), input.width());
        memcpy(output.data(), input.data(), input.width());
        memcpy(output.data(), input.data(), input.width());
        double t3 = current_time();
        system += t3 - t2;
        halide += t2 - t1;
    }

    printf("system memcpy: %.3e byte/s\n", (buffer_size / system) * 3 * 1000 * iterations);
    printf("halide memcpy: %.3e byte/s\n", (buffer_size / halide) * 3 * 1000 * iterations);

    // memcpy will win by a little bit for large inputs because it uses
    // streaming stores.
    if (halide > system * 2) {
        printf("Halide memcpy is slower than it should be.\n");
        return -1;
    }

    printf("Success!\n");
    return 0;
}
Example 7: main
int main(int argc, char **argv) {
    // Define a pipeline that dumps some squares to a file using an
    // external consumer stage.
    Func source;
    Var x;
    source(x) = x * x;

    Param<int> min, extent;
    Param<const char *> filename;

    Func sink;
    std::vector<ExternFuncArgument> args;
    args.push_back(source);
    args.push_back(filename);
    args.push_back(min);
    args.push_back(extent);
    sink.define_extern("dump_to_file", args, Int(32), 0);

    source.compute_root();
    sink.compile_jit();

    // Dump the first 10 squares to a file.
    filename.set("halide_test_extern_consumer.txt");
    min.set(0);
    extent.set(10);
    sink.realize();

    if (!check_result())
        return -1;

    // Test ImageParam ExternFuncArgument via a passed-in image.
    Image<int32_t> buf = source.realize(10);
    ImageParam passed_in(Int(32), 1);
    passed_in.set(buf);

    Func sink2;
    std::vector<ExternFuncArgument> args2;
    args2.push_back(passed_in);
    args2.push_back(filename);
    args2.push_back(min);
    args2.push_back(extent);
    sink2.define_extern("dump_to_file", args2, Int(32), 0);
    sink2.realize();

    if (!check_result())
        return -1;

    printf("Success!\n");
    return 0;
}
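The extern stage "dump_to_file" is a plain C function linked into the process; it is not shown in this listing. Here is a sketch of its likely shape, assuming Halide's usual extern-function convention (one buffer_t* per Func/ImageParam argument, scalar values for Params, the output buffer last; a null host pointer on the input signals a bounds query):

extern "C" int dump_to_file(buffer_t *input, const char *filename,
                            int desired_min, int desired_extent,
                            buffer_t *) {
    if (input->host == nullptr) {
        // Bounds query: declare which range of the producer we need.
        input->min[0] = desired_min;
        input->extent[0] = desired_extent;
    } else {
        // Real call: write the requested values to the file.
        FILE *f = fopen(filename, "w");
        const int32_t *vals = (const int32_t *)input->host;
        for (int i = 0; i < desired_extent; i++) {
            fprintf(f, "%d\n", vals[i]);
        }
        fclose(f);
    }
    return 0;
}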
Example 8: main
int main(int argc, char **argv) {
    Func f;
    Var x, y;
    ImageParam in(Float(32), 2);
    ImageParam x_coord(Int(32), 2);
    ImageParam y_coord(Int(32), 2);

    f(x, y) = 0.0f;
    RDom r(0, 100, 0, 100);
    f(x_coord(r.x, r.y), y_coord(r.x, r.y)) += in(r.x, r.y);

    // This is an error test: compile_jit() is expected to fail before
    // control reaches the line below.
    f.compile_jit();

    printf("I should not have reached here\n");
    return 0;
}
Example 9: main
int main(int argc, char **argv) {
    Func f;
    Var x, y;
    f(x, y) = x + y;
    f.parallel(x);

    // Having more threads than tasks shouldn't hurt performance too much.
    double correct_time = 0;

    for (int t = 2; t <= 64; t *= 2) {
        std::ostringstream ss;
        ss << "HL_NUM_THREADS=" << t;
        std::string str = ss.str();
        char buf[32] = {0};
        memcpy(buf, str.c_str(), str.size());
        putenv(buf);
        Halide::Internal::JITSharedRuntime::release_all();
        f.compile_jit();

        // Start the thread pool without giving any hints as to the
        // number of tasks we'll be using.
        f.realize(t, 1);

        double min_time = 1e20;
        for (int i = 0; i < 3; i++) {
            double t1 = current_time();
            f.realize(2, 1000000);
            double t2 = current_time() - t1;
            if (t2 < min_time) min_time = t2;
        }

        printf("%d: %f ms\n", t, min_time);
        if (t == 2) {
            correct_time = min_time;
        } else if (min_time > correct_time * 5) {
            printf("Unacceptable overhead when using %d threads for 2 tasks: %f ms vs %f ms\n",
                   t, min_time, correct_time);
            return -1;
        }
    }

    printf("Success!\n");
    return 0;
}
Example 10: main
int main(int argc, char **argv) {
    ImageParam input(Float(32), 2);
    Var x, y, z;
    RDom dom(0, input.width() * 8);
    Func f;
    Expr hard_to_reason_about = cast<int>(hypot(input.width(), input.height()));
    f(x, y, z) = 1;
    f(x, y, dom / hard_to_reason_about) += 1;
    f.compile_jit();

    Image<float> im(32, 32);
    input.set(im);
    f.realize(100, 100, 16);

    printf("Success!\n");
    return 0;
}
Example 11: schedule_for_gpu
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell CUDA that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:
    // lut.gpu_tile(i, 16);

    // Func::gpu_tile is similar to Func::tile, except that it also
    // specifies that the tile coordinates correspond to GPU blocks,
    // and the coordinates within each tile correspond to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing the
    // intermediate result in shared memory. Var::gpu_blocks and
    // Var::gpu_threads exist to help you schedule producers within
    // GPU threads and blocks.
    padded.compute_at(curved, Var::gpu_blocks());

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA and OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.
    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open-source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.features |= Target::OpenCL;

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.features |= Target::CUDA;

    // If you want to see all of the OpenCL or CUDA API calls made
    // by the pipeline, you can also enable the GPUDebug flag. This
    // is helpful for figuring out which stages are slow, or when
    // CPU -> GPU copies happen. It hurts performance though, so
    // we'll leave it commented out.
    // target.features |= Target::GPUDebug;

    curved.compile_jit(target);
}
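A hypothetical driver for these schedule methods, assuming they are members of a pipeline class (called MyPipeline here) that owns the Funcs, as in the Halide tutorial this excerpt resembles:

MyPipeline p(input_image);   // assumed wrapper holding lut, padded, sharpen, curved
p.schedule_for_gpu();        // ends by calling curved.compile_jit(target)
Image<uint8_t> out(input_image.width(), input_image.height(), 3);
p.curved.realize(out);       // runs on the GPU; results are copied back when accessed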
Example 12: main
int main(int argc, char **argv) {
    const int N = 1 << 10;

    Image<int> data(N);
    for (int i = 0; i < N; i++) {
        data(i) = rand() & 0xfffff;
    }
    Func input = lambda(x, data(x));

    printf("Bitonic sort...\n");
    Func f = bitonic_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Image<int> bitonic_sorted(N);
    f.realize(bitonic_sorted);
    double t1 = current_time();
    for (int i = 0; i < 10; i++) {
        f.realize(bitonic_sorted);
    }
    double t2 = current_time();

    printf("Merge sort...\n");
    f = merge_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Image<int> merge_sorted(N);
    f.realize(merge_sorted);
    double t3 = current_time();
    for (int i = 0; i < 10; i++) {
        f.realize(merge_sorted);
    }
    double t4 = current_time();

    Image<int> correct(N);
    for (int i = 0; i < N; i++) {
        correct(i) = data(i);
    }
    printf("std::sort...\n");
    double t5 = current_time();
    std::sort(&correct(0), &correct(N));
    double t6 = current_time();

    printf("Times:\n"
           "bitonic sort: %f\n"
           "merge sort: %f\n"
           "std::sort: %f\n",
           (t2 - t1) / 10, (t4 - t3) / 10, t6 - t5);

    if (N <= 100) {
        for (int i = 0; i < N; i++) {
            printf("%8d %8d %8d\n",
                   correct(i), bitonic_sorted(i), merge_sorted(i));
        }
    }

    for (int i = 0; i < N; i++) {
        if (bitonic_sorted(i) != correct(i)) {
            printf("bitonic sort failed: %d -> %d instead of %d\n", i, bitonic_sorted(i), correct(i));
            return -1;
        }
        if (merge_sorted(i) != correct(i)) {
            printf("merge sort failed: %d -> %d instead of %d\n", i, merge_sorted(i), correct(i));
            return -1;
        }
    }

    return 0;
}
Example 13: main
int main(int argc, char **argv) {
    const int N = 1 << 10;

    Buffer<int> data(N);
    for (int i = 0; i < N; i++) {
        data(i) = rand() & 0xfffff;
    }
    Func input = lambda(x, data(x));

    printf("Bitonic sort...\n");
    Func f = bitonic_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Buffer<int> bitonic_sorted(N);
    f.realize(bitonic_sorted);
    double t_bitonic = benchmark(1, 10, [&]() {
        f.realize(bitonic_sorted);
    });

    printf("Merge sort...\n");
    f = merge_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Buffer<int> merge_sorted(N);
    f.realize(merge_sorted);
    double t_merge = benchmark(1, 10, [&]() {
        f.realize(merge_sorted);
    });

    Buffer<int> correct(N);
    for (int i = 0; i < N; i++) {
        correct(i) = data(i);
    }
    printf("std::sort...\n");
    double t_std = benchmark(1, 1, [&]() {
        std::sort(&correct(0), &correct(N));
    });

    printf("Times:\n"
           "bitonic sort: %fms\n"
           "merge sort: %fms\n"
           "std::sort: %fms\n",
           t_bitonic * 1e3, t_merge * 1e3, t_std * 1e3);

    if (N <= 100) {
        for (int i = 0; i < N; i++) {
            printf("%8d %8d %8d\n",
                   correct(i), bitonic_sorted(i), merge_sorted(i));
        }
    }

    for (int i = 0; i < N; i++) {
        if (bitonic_sorted(i) != correct(i)) {
            printf("bitonic sort failed: %d -> %d instead of %d\n", i, bitonic_sorted(i), correct(i));
            return -1;
        }
        if (merge_sorted(i) != correct(i)) {
            printf("merge sort failed: %d -> %d instead of %d\n", i, merge_sorted(i), correct(i));
            return -1;
        }
    }

    return 0;
}
Example 14: main
//......... code omitted here .........

#ifdef TESTING_GPU
    blurImage.gpu_tile(x, y, 16, 16);
    // JIT-compile the pipeline for the GPU. CUDA and OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.
    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open-source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.set_feature(Target::OpenCL);

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL or CUDA API calls made
    // by the pipeline, you can also enable the Debug flag. This is
    // helpful for figuring out which stages are slow, or when
    // CPU -> GPU copies happen. It hurts performance though, so
    // we'll leave it commented out.
    // target.set_feature(Target::Debug);

    blurImage.compile_jit(target);
#else
    blurImage.split(y, y0, yi, 4);
    blurImage.parallel(y0);
    blurImage.vectorize(x, 8);
#endif

    // Split the y coordinate of the consumer into strips:
    blurVariance.split(y, y0, yi, 4);
    // Compute the strips using a thread pool and a task queue.
    blurVariance.parallel(y0);
    // Vectorize across x.
    blurVariance.vectorize(x, 8);

    // polynomial1.compute_at(blurImage, x).vectorize(x, 8);
    // kernel1.compute_at(blurImage, x).vectorize(x, 8);

    // Split the y coordinate of the consumer into strips of 30 scanlines:
    maskOut.split(y, y0, yi, 30);
    // Compute the strips using a thread pool and a task queue.
    maskOut.parallel(y0);
    // Vectorize across x by a factor of eight.
    maskOut.vectorize(x, 8);

    // kernel1.trace_stores();
    // blurImage.trace_stores();

    // Check out what is happening.
    blurImage.print_loop_nest();
Example 15: schedule_for_gpu
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell CUDA that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:
    // lut.gpu_tile(i, block, thread, 16);

    // Func::gpu_tile behaves the same as Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    Var xo, yo, xi, yi;
    curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing
    // the intermediate result in shared memory. In the schedule
    // above, xo corresponds to GPU blocks.
    padded.compute_at(curved, xo);

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA, OpenCL, and
    // Metal are not enabled by default. We have to construct a
    // Target object, enable one of them, and then pass that
    // target object to compile_jit. Otherwise your CPU will very
    // slowly pretend it's a GPU, and use one thread per output
    // pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or Metal, depending on which platform
    // we're on. OS X doesn't update its OpenCL drivers, so they
    // tend to be broken. CUDA would also be a fine choice on
    // machines with NVidia GPUs.
    if (target.os == Target::OSX) {
        target.set_feature(Target::Metal);
    } else {
        target.set_feature(Target::OpenCL);
    }

    // Uncomment the next line and comment out the lines above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL, Metal, or CUDA API
    // calls made by the pipeline, you can also enable the Debug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.set_feature(Target::Debug);

    curved.compile_jit(target);
}