当前位置: 首页>>代码示例>>C++>>正文


C++ Output::compute_root方法代码示例

本文整理汇总了C++中Output::compute_root方法的典型用法代码示例。如果您正苦于以下问题：C++ Output::compute_root方法的具体用法？C++ Output::compute_root怎么用？C++ Output::compute_root使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在Output的用法示例。


在下文中一共展示了Output::compute_root方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: generate

    void generate() {
        // The algorithm.

        // Some free variables, where x and y represent the spatial dimensions.
        Var x("x"), y("y"), depth("depth"), batch("batch");

        // Pad x and y with the value that produces zero after the input offset is
        // added. The input offset is bounded to the range of a uint8, so this is
        // safe.
        Func input_bounded =
            constant_exterior(input_, cast<uint8_t>(-input_offset_),
                              { { Expr(), Expr() },
                                { 0, input_.dim(1).extent() },
                                { 0, input_.dim(2).extent() },
                                { Expr(), Expr() } });

        // For the filter, add the offset and upcast to 16-bit.
        Func filter_with_offset("filter_with_offset");
        filter_with_offset(depth, x, y) =
            cast<int16_t>(filter_(depth, x, y)) + filter_offset_;

        // Shift the input spatially in [x, y] by -[pad_width, pad_height].
        Func shifted_input_with_offset("shifted_input_with_offset");
        shifted_input_with_offset(depth, x, y, batch) = input_bounded(
            depth, x - pad_width_, y - pad_height_, batch);

        // Apply the depth multiplier.
        Func resampled_input("resampled_input");
        resampled_input(depth, x, y, batch) =
            shifted_input_with_offset(depth / depth_multiplier_, x, y, batch);

        // For the input, add the offset and upcast to 16-bit. This happens after
        // resampling so we don't need to store/load as much data in the inner loop
        // (at the cost of one add in the inner loop instead).
        Func resampled_input_with_offset("resampled_input_with_offset");
        resampled_input_with_offset(depth, x, y, batch) =
            cast<int16_t>(resampled_input(depth, x, y, batch)) + input_offset_;

        // Do the convolution in 32-bit. Apply the input stride. As before, the
        // case stride == 1 is written separately for performance reasons.
        Func convolved("convolved");
        RDom filter_dom(0, filter_.dim(1).extent(), 0, filter_.dim(2).extent());
        convolved(depth, x, y, batch) +=
            (cast<int32_t>(filter_with_offset(depth, filter_dom.x, filter_dom.y)) *
             cast<int32_t>(
                 resampled_input_with_offset(depth, x * stride_ + filter_dom.x,
                                             y * stride_ + filter_dom.y, batch)));

        Func scaled_plus_offset("scaled_plus_offset");
        scaled_plus_offset(depth, x, y, batch) =
            multiply_quantized_multiplier(
                convolved(depth, x, y, batch) + bias_(depth), output_multiplier_,
                output_shift_) +
            output_offset_;

        // Saturate and narrow the output.
        output_(depth, x, y, batch) =
            clamp(u8_sat(scaled_plus_offset(depth, x, y, batch)),
                  output_min_, output_max_);

        // The schedule.
        int vector_size_u8 = get_target().natural_vector_size<uint8_t>();
        if (get_target().has_feature(Target::HVX_64)) {
            vector_size_u8 = 64;
        } else if (get_target().has_feature(Target::HVX_128)) {
            vector_size_u8 = 128;
        }
        const bool use_hexagon =
            get_target().features_any_of({ Target::HVX_64, Target::HVX_128 });

        // Specifying .hexagon() on a Func will generate an RPC to run this stage
        // on Hexagon. If Hexagon is the host (that is, the architecture is
        // Hexagon), we have to omit the .hexagon() directive as we are already
        // running on Hexagon.
        if (use_hexagon && get_target().arch != Target::Hexagon) {
            output_.hexagon();
        }

        output_.compute_root();

        // We can't parallelize batches, as we often have just a single batch to
        // process. Also, x and y dimensions are often fairly small (8x8, 16x16).
        // For now, we parallelize along y, but may need to adapt when benchmarking
        // real models.
        Var yi("yi");
        // For small tensors, make sure the split factor is not larger than the
        // output y extent.
        Expr y_split_factor = min(input_.dim(2).extent() / stride_, 4);

        output_.split(y, y, yi, y_split_factor).parallel(y);
        output_.vectorize(depth, vector_size_u8, TailStrategy::RoundUp);

        if (use_hexagon) {
            // Scheduling specifics for Hexagon.

            if (depth_multiplier_ > 1) {
                ScheduleResampledInput(output_, depth, y, depth_multiplier_,
                                       vector_size_u8, &resampled_input);
            }
            output_.prefetch(input_, yi);
//.........这里部分代码省略.........
开发者ID:adityaatluri,项目名称:Halide,代码行数:101,代码来源:DepthwiseConvolution_generator.cpp

示例2: generate

    void generate() {
        Var x{"x"}, y{"y"}, c{"c"};

        // We need a wrapper for the output so we can schedule the
        // multiply update in tiles.
        Func copy("copy");

        copy(x, y, c) = input(x, y, c);

        output(x, y, c) = copy(x, y, c) * 2;

        input.dim(0).set_stride(4);
        output.dim(0).set_stride(4);  

        Var tx("tx"), ty("ty");
        Var ta("ta"), tb("tb");

        // Break the output into tiles.
        const int tile_width = 128;
        const int tile_height = 32;

        switch ((Schedule)schedule) {
            case Schedule::Basic:
            default:
                output.compute_root()
                      .reorder(c, x, y)
                      .bound(c, 0, 4)
                      .tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);

                copy.compute_at(output, tx)
                    .store_at(output, tx)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .reorder_storage(c, x, y);
            break;
            case Schedule::Fold:
                output.compute_root()
                      .reorder(c, x, y)
                      .bound(c, 0, 4)
                      .tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);

                copy.compute_at(output, tx)
                    .store_at(output, tx)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .reorder_storage(c, x, y)
                    .fold_storage(x, tile_width * 2);
            break;
            case Schedule::Async:
                output.compute_root()
                      .reorder(c, x, y)
                      .bound(c, 0, 4)
                      .tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);

                copy.compute_at(output, tx)
                    .store_at(output, tx)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .async()
                    .reorder_storage(c, x, y)
                    .fold_storage(x, tile_width * 2);
            break;
            case Schedule::Split: {
                Expr fac = output.dim(1).extent()/2;
                Var yo, yi;
                output.split(y, yo, yi, fac);
                output.compute_root()
                      .reorder(c, x, yo)
                      .bound(c, 0, 4)
                      .tile(x, yi, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp)
                      .parallel(yo);

                copy.compute_at(output, tx)
                    .store_at(output, ty)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .reorder_storage(c, x, y);
            }
            break;
            case Schedule::Split_Fold: {
                Expr fac = output.dim(1).extent()/2;
                Var yo, yi;
                output.split(y, yo, yi, fac);
                output.compute_root()
                      .reorder(c, x, yo)
                      .bound(c, 0, 4)
                      .tile(x, yi, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp)
                      .parallel(yo);

                copy.compute_at(output, tx)
                    .store_at(output, ty)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .async()
                    .reorder_storage(c, x, y)
                    .fold_storage(x, tile_width * 2);
            }
            break;
        }
    }
开发者ID:white-pony,项目名称:Halide,代码行数:100,代码来源:pipeline_raw_linear_ro_basic_interleaved.cpp


注:本文中的Output::compute_root方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。