This article collects typical usage examples of the C++ method Func::compileToFile. If you are unsure what Func::compileToFile does or how to use it, the curated examples below may help; you can also explore further usage examples of the containing class Func.
Three code examples of the Func::compileToFile method are shown below, sorted by popularity by default. Rating the examples you find useful helps the system recommend better C++ code samples.
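All three examples follow the same pattern: build a Func pipeline, declare its runtime parameters as Uniform or UniformImage objects, and call compileToFile with an output name and an argument list to compile the pipeline ahead of time. Below is a minimal sketch of that pattern using the same older Halide API (UniformImage, Uniform, compileToFile) that the examples use; the brighten pipeline, its parameter names, and the Halide.h include are illustrative assumptions rather than code taken from the examples.

#include <Halide.h>
using namespace Halide;

int main() {
    // Runtime parameters of the compiled pipeline (names are illustrative).
    UniformImage input(Float(32), 2);
    Uniform<float> gain;

    // A trivial pipeline: scale every pixel by the gain.
    Var x, y;
    Func brighten;
    brighten(x, y) = input(x, y) * gain;

    // Ahead-of-time compilation: the first argument names the generated output,
    // the second fixes the order of arguments expected by the generated function.
    brighten.compileToFile("brighten", {gain, input});
    return 0;
}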
Example 1: main
int main(int argc, char **argv) {
    // The camera pipe is specialized on the 2592x1968 images that
    // come in, so we'll just use an image instead of a uniform image.
    Image<int16_t> input(2592, 1968);
    UniformImage matrix_3200(Float(32), 2, "m3200"), matrix_7000(Float(32), 2, "m7000");
    Uniform<float> color_temp("color_temp", 3200.0f);
    Uniform<float> gamma("gamma", 1.8f);
    Uniform<float> contrast("contrast", 10.0f);

    // Shift things inwards to give us enough padding on the
    // boundaries so that we don't need to check bounds. We're going
    // to make a 2560x1920 output image, just like the FCam pipe, so
    // shift by 16, 12.
    Func shifted;
    shifted(x, y) = input(clamp(x+16, 0, input.width()-1), clamp(y+12, 0, input.height()-1));

    // Parameterized output type, because the LLVM PTX (GPU) backend does not
    // currently allow 8-bit computations.
    int bit_width = atoi(argv[1]);
    Type result_type = UInt(bit_width);

    // Pick a schedule.
    schedule = atoi(argv[2]);

    // Build the pipeline.
    Func processed = process(shifted, result_type, matrix_3200, matrix_7000, color_temp, gamma, contrast);

    //string s = processed.serialize();
    //printf("%s\n", s.c_str());

    // In C++11, this can be done as a simple initializer_list {color_temp, gamma, etc.} in place.
    Arg args[] = {color_temp, gamma, contrast, input, matrix_3200, matrix_7000};
    processed.compileToFile("curved", std::vector<Arg>(args, args+6));

    return 0;
}
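As the comment in the listing notes, the Arg array plus std::vector construction is only needed for pre-C++11 compilers; with C++11 the argument list can be passed as an initializer list directly, which is what examples 2 and 3 below do. The equivalent call for this example would look like the following line (same arguments, same order):

    processed.compileToFile("curved", {color_temp, gamma, contrast, input, matrix_3200, matrix_7000});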
Example 2: main
int main(int argc, char **argv) {
    if (argc < 2) {
        printf("Spatial sigma is a compile-time parameter, please provide it as an argument.\n"
               "(llvm's ptx backend doesn't handle integer mods by non-consts yet)\n");
        return 0;
    }

    UniformImage input(Float(32), 2);
    Uniform<float> r_sigma;
    int s_sigma = atoi(argv[1]);
    Var x, y, z, c;

    // Add a boundary condition.
    Func clamped;
    clamped(x, y) = input(clamp(x, 0, input.width()-1),
                          clamp(y, 0, input.height()-1));

    // Construct the bilateral grid.
    RDom r(0, s_sigma, 0, s_sigma);
    Expr val = clamped(x * s_sigma + r.x - s_sigma/2, y * s_sigma + r.y - s_sigma/2);
    val = clamp(val, 0.0f, 1.0f);
    Expr zi = cast<int>(val * (1.0f/r_sigma) + 0.5f);
    Func grid;
    grid(x, y, zi, c) += select(c == 0, val, 1.0f);

    // Blur the grid using a five-tap filter.
    Func blurx, blury, blurz;
    blurx(x, y, z) = grid(x-2, y, z) + grid(x-1, y, z)*4 + grid(x, y, z)*6 + grid(x+1, y, z)*4 + grid(x+2, y, z);
    blury(x, y, z) = blurx(x, y-2, z) + blurx(x, y-1, z)*4 + blurx(x, y, z)*6 + blurx(x, y+1, z)*4 + blurx(x, y+2, z);
    blurz(x, y, z) = blury(x, y, z-2) + blury(x, y, z-1)*4 + blury(x, y, z)*6 + blury(x, y, z+1)*4 + blury(x, y, z+2);

    // Take trilinear samples to compute the output.
    val = clamp(clamped(x, y), 0.0f, 1.0f);
    Expr zv = val * (1.0f/r_sigma);
    zi = cast<int>(zv);
    Expr zf = zv - zi;
    Expr xf = cast<float>(x % s_sigma) / s_sigma;
    Expr yf = cast<float>(y % s_sigma) / s_sigma;
    Expr xi = x/s_sigma;
    Expr yi = y/s_sigma;
    Func interpolated;
    interpolated(x, y) =
        lerp(lerp(lerp(blurz(xi, yi, zi), blurz(xi+1, yi, zi), xf),
                  lerp(blurz(xi, yi+1, zi), blurz(xi+1, yi+1, zi), xf), yf),
             lerp(lerp(blurz(xi, yi, zi+1), blurz(xi+1, yi, zi+1), xf),
                  lerp(blurz(xi, yi+1, zi+1), blurz(xi+1, yi+1, zi+1), xf), yf), zf);

    // Normalize.
    Func smoothed;
    smoothed(x, y) = interpolated(x, y, 0)/interpolated(x, y, 1);
#ifndef USE_GPU
    // Best schedule for CPU.
    printf("Compiling for CPU\n");
    grid.root().parallel(z);
    grid.update().transpose(y, c).transpose(x, c).parallel(y);
    blurx.root().parallel(z).vectorize(x, 4);
    blury.root().parallel(z).vectorize(x, 4);
    blurz.root().parallel(z).vectorize(x, 4);
    smoothed.root().parallel(y).vectorize(x, 4);
#else
    printf("Compiling for GPU\n");
    Var gridz = grid.arg(2);
    grid.transpose(y, gridz).transpose(x, gridz).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 16, 16);
    grid.update().transpose(y, c).transpose(x, c).transpose(i, c).transpose(j, c)
        .root().cudaTile(x, y, 16, 16);
    c = blurx.arg(3);
    blurx.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
         .root().cudaTile(x, y, 8, 8);
    c = blury.arg(3);
    blury.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
         .root().cudaTile(x, y, 8, 8);
    c = blurz.arg(3);
    blurz.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
         .root().cudaTile(x, y, 8, 8);
    smoothed.root().cudaTile(x, y, s_sigma, s_sigma);
#endif

    smoothed.compileToFile("bilateral_grid", {r_sigma, input});
    // Compared to Sylvain Paris' implementation from his webpage (on
    // which this is based), with r_sigma 0.1, on a 4-megapixel
    // input, on a four-core x86 (2-socket Core 2 Mac Pro):
    //
    // Filter s_sigma:  2     4     8     16    32
    // Paris (ms):      5350  1345  472   245   184
    // Us (ms):         383   142   77    62    65
    // Speedup:         14    9.5   6.1   3.9   2.8
    //
    // Our schedule and inlining are roughly the same as his, so the
    // gain is all down to vectorizing and parallelizing. In general,
    // for larger blurs our win shrinks to roughly the number of
    // cores, as the stages we don't vectorize as well dominate (we
    // don't vectorize them well because they do gathers and scatters,
    // which don't work well on x86). For smaller blurs, our win
    // grows, because the stages that we vectorize take up all the
//......... (part of the code is omitted here) .........
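Before moving on, it may help to see the trilinear-sampling index arithmetic from example 2 with concrete numbers. The snippet below is a plain-C++ sketch (not Halide code) that mirrors the zi/zf/xi/xf/yi/yf expressions above; the specific values of r_sigma, s_sigma, val, x, and y are illustrative assumptions.

#include <cstdio>

int main() {
    // Example values, chosen only for illustration.
    float r_sigma = 0.1f;   // range sigma (grid resolution in intensity)
    int   s_sigma = 8;      // spatial sigma (grid resolution in x/y)
    float val = 0.23f;      // a clamped pixel value in [0, 1]
    int   x = 13, y = 21;   // an output pixel coordinate

    // Range (z) coordinate: which intensity slice, and how far between slices.
    float zv = val * (1.0f / r_sigma);   // ~2.3
    int   zi = (int)zv;                  // slice 2
    float zf = zv - zi;                  // ~0.3 -> blend slices 2 and 3

    // Spatial coordinates: which grid cell, and the fractional position inside it.
    int   xi = x / s_sigma;                      // cell 1
    float xf = (float)(x % s_sigma) / s_sigma;   // 0.625
    int   yi = y / s_sigma;                      // cell 2
    float yf = (float)(y % s_sigma) / s_sigma;   // 0.625

    printf("zi=%d zf=%.2f xi=%d xf=%.3f yi=%d yf=%.3f\n", zi, zf, xi, xf, yi, yf);
    return 0;
}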
Example 3: main
//......... (part of the code is omitted here) .........
            if (j > 0) gPyramid[j].root().parallel(k);
            outGPyramid[j].root().parallel(y);
        }
        break;
    case 100:
        // Output stage only on GPU.
        output.root().split(y, by, ty, 32).split(x, bx, tx, 32)
              .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root();
            gPyramid[j].root();
            outGPyramid[j].root();
            if (j == J-1) break;
            lPyramid[j].root();
            outLPyramid[j].root();
        }
        break;
    case 101:
        // All root on GPU, tiny blocks to prevent accidental bounds explosion.
        output.root().split(y, by, ty, 2).split(x, bx, tx, 2)
              .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            gPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outGPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            if (j == J-1) break;
            lPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outLPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        }
        break;
    case 102:
        // All root on GPU.
        output.root().split(y, by, ty, 32).split(x, bx, tx, 32)
              .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            int blockw = 32, blockh = 32;
            if (j > 3) {
                blockw = 2;
                blockh = 2;
            }
            inGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            gPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            if (j == J-1) break;
            lPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outLPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        }
        break;
    case 103:
        // Most root, but inline Laplacian pyramid levels - 49ms on Tesla.
        output.root().split(y, by, ty, 32).split(x, bx, tx, 32)
              .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            int blockw = 32, blockh = 32;
            if (j > 3) {
                blockw = 2;
                blockh = 2;
            }
            inGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            gPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        }
        break;
    default:
        break;
    }

    output.compileToFile("local_laplacian", {levels, alpha, beta, input});

    return 0;
}