本文整理汇总了C++中 Output::compute_root 方法的典型用法代码示例。如果您正苦于以下问题:C++ Output::compute_root 方法的具体用法?C++ Output::compute_root 怎么用?C++ Output::compute_root 使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 Output 的用法示例。
在下文中一共展示了Output::compute_root方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: generate
// Halide generator body for a quantized (uint8) depthwise convolution:
// input/filter/output carry zero-point offsets, accumulation is done in
// 32 bits, and the result is requantized back to uint8. The schedule
// targets Hexagon HVX when available, falling back to the host's native
// vectors otherwise.
// NOTE(review): the tail of this function was elided by the source this
// was copied from (see the omission marker below) — the block as shown
// is incomplete and does not close its braces.
void generate() {
// The algorithm.
// Some free variables, where x and y represent the spatial dimensions.
Var x("x"), y("y"), depth("depth"), batch("batch");
// Pad x and y with the value that produces zero after the input offset is
// added. The input offset is bounded to the range of a uint8, so this is
// safe.
// Dimensions 0 and 3 (depth, batch) are left unbounded via
// { Expr(), Expr() }; only the spatial dims (1 and 2) get the boundary
// condition.
Func input_bounded =
constant_exterior(input_, cast<uint8_t>(-input_offset_),
{ { Expr(), Expr() },
{ 0, input_.dim(1).extent() },
{ 0, input_.dim(2).extent() },
{ Expr(), Expr() } });
// For the filter, add the offset and upcast to 16-bit.
Func filter_with_offset("filter_with_offset");
filter_with_offset(depth, x, y) =
cast<int16_t>(filter_(depth, x, y)) + filter_offset_;
// Shift the input spatially in [x, y] by -[pad_width, pad_height].
Func shifted_input_with_offset("shifted_input_with_offset");
shifted_input_with_offset(depth, x, y, batch) = input_bounded(
depth, x - pad_width_, y - pad_height_, batch);
// Apply the depth multiplier: each group of depth_multiplier_ consecutive
// output channels reads the same input channel (the "depthwise" part).
Func resampled_input("resampled_input");
resampled_input(depth, x, y, batch) =
shifted_input_with_offset(depth / depth_multiplier_, x, y, batch);
// For the input, add the offset and upcast to 16-bit. This happens after
// resampling so we don't need to store/load as much data in the inner loop
// (at the cost of one add in the inner loop instead).
Func resampled_input_with_offset("resampled_input_with_offset");
resampled_input_with_offset(depth, x, y, batch) =
cast<int16_t>(resampled_input(depth, x, y, batch)) + input_offset_;
// Do the convolution in 32-bit. Apply the input stride. As before, the
// case stride == 1 is written separately for performance reasons.
Func convolved("convolved");
// Reduction domain over the filter's spatial extent (dims 1 and 2 of
// filter_); filter_dom.x / filter_dom.y are the filter taps.
RDom filter_dom(0, filter_.dim(1).extent(), 0, filter_.dim(2).extent());
convolved(depth, x, y, batch) +=
(cast<int32_t>(filter_with_offset(depth, filter_dom.x, filter_dom.y)) *
cast<int32_t>(
resampled_input_with_offset(depth, x * stride_ + filter_dom.x,
y * stride_ + filter_dom.y, batch)));
// Requantize: add the per-channel bias, scale by the fixed-point output
// multiplier/shift, then add the output zero point.
Func scaled_plus_offset("scaled_plus_offset");
scaled_plus_offset(depth, x, y, batch) =
multiply_quantized_multiplier(
convolved(depth, x, y, batch) + bias_(depth), output_multiplier_,
output_shift_) +
output_offset_;
// Saturate and narrow the output, then clamp to the caller-provided
// activation range [output_min_, output_max_].
output_(depth, x, y, batch) =
clamp(u8_sat(scaled_plus_offset(depth, x, y, batch)),
output_min_, output_max_);
// The schedule.
int vector_size_u8 = get_target().natural_vector_size<uint8_t>();
// HVX vector width is fixed by the target feature, not natural_vector_size.
if (get_target().has_feature(Target::HVX_64)) {
vector_size_u8 = 64;
} else if (get_target().has_feature(Target::HVX_128)) {
vector_size_u8 = 128;
}
const bool use_hexagon =
get_target().features_any_of({ Target::HVX_64, Target::HVX_128 });
// Specifying .hexagon() on a Func will generate an RPC to run this stage
// on Hexagon. If Hexagon is the host (that is, the architecture is
// Hexagon), we have to omit the .hexagon() directive as we are already
// running on Hexagon.
if (use_hexagon && get_target().arch != Target::Hexagon) {
output_.hexagon();
}
output_.compute_root();
// We can't parallelize batches, as we often have just a single batch to
// process. Also, x and y dimensions are often fairly small (8x8, 16x16).
// For now, we parallelize along y, but may need to adapt when benchmarking
// real models.
Var yi("yi");
// For small tensors, make sure the split factor is not larger than the
// output y extent.
Expr y_split_factor = min(input_.dim(2).extent() / stride_, 4);
output_.split(y, y, yi, y_split_factor).parallel(y);
// Vectorize along depth; TailStrategy::RoundUp rounds the loop extent up
// to a multiple of the vector size instead of emitting a scalar tail.
output_.vectorize(depth, vector_size_u8, TailStrategy::RoundUp);
if (use_hexagon) {
// Scheduling specifics for Hexagon.
if (depth_multiplier_ > 1) {
ScheduleResampledInput(output_, depth, y, depth_multiplier_,
vector_size_u8, &resampled_input);
}
// Prefetch upcoming input rows one yi iteration ahead.
output_.prefetch(input_, yi);
//.........这里部分代码省略.........
示例2: generate
void generate() {
Var x{"x"}, y{"y"}, c{"c"};
// We need a wrapper for the output so we can schedule the
// multiply update in tiles.
Func copy("copy");
copy(x, y, c) = input(x, y, c);
output(x, y, c) = copy(x, y, c) * 2;
input.dim(0).set_stride(4);
output.dim(0).set_stride(4);
Var tx("tx"), ty("ty");
Var ta("ta"), tb("tb");
// Break the output into tiles.
const int tile_width = 128;
const int tile_height = 32;
switch ((Schedule)schedule) {
case Schedule::Basic:
default:
output.compute_root()
.reorder(c, x, y)
.bound(c, 0, 4)
.tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);
copy.compute_at(output, tx)
.store_at(output, tx)
.bound(c, 0, 4)
.copy_to_host()
.reorder_storage(c, x, y);
break;
case Schedule::Fold:
output.compute_root()
.reorder(c, x, y)
.bound(c, 0, 4)
.tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);
copy.compute_at(output, tx)
.store_at(output, tx)
.bound(c, 0, 4)
.copy_to_host()
.reorder_storage(c, x, y)
.fold_storage(x, tile_width * 2);
break;
case Schedule::Async:
output.compute_root()
.reorder(c, x, y)
.bound(c, 0, 4)
.tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);
copy.compute_at(output, tx)
.store_at(output, tx)
.bound(c, 0, 4)
.copy_to_host()
.async()
.reorder_storage(c, x, y)
.fold_storage(x, tile_width * 2);
break;
case Schedule::Split: {
Expr fac = output.dim(1).extent()/2;
Var yo, yi;
output.split(y, yo, yi, fac);
output.compute_root()
.reorder(c, x, yo)
.bound(c, 0, 4)
.tile(x, yi, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp)
.parallel(yo);
copy.compute_at(output, tx)
.store_at(output, ty)
.bound(c, 0, 4)
.copy_to_host()
.reorder_storage(c, x, y);
}
break;
case Schedule::Split_Fold: {
Expr fac = output.dim(1).extent()/2;
Var yo, yi;
output.split(y, yo, yi, fac);
output.compute_root()
.reorder(c, x, yo)
.bound(c, 0, 4)
.tile(x, yi, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp)
.parallel(yo);
copy.compute_at(output, tx)
.store_at(output, ty)
.bound(c, 0, 4)
.copy_to_host()
.async()
.reorder_storage(c, x, y)
.fold_storage(x, tile_width * 2);
}
break;
}
}