当前位置: 首页>>代码示例>>C++>>正文


C++ command_queue::enqueue_1d_range_kernel方法代码示例

本文整理汇总了C++中command_queue::enqueue_1d_range_kernel方法的典型用法代码示例。如果您正苦于以下问题:C++ command_queue::enqueue_1d_range_kernel方法的具体用法?C++ command_queue::enqueue_1d_range_kernel怎么用?C++ command_queue::enqueue_1d_range_kernel使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在command_queue的用法示例。


在下文中一共展示了command_queue::enqueue_1d_range_kernel方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: generate

    void generate(OutputIterator first_ctr, OutputIterator last_ctr, command_queue &queue) {
        const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
        if(!size_ctr) {
            return;
        }
        boost::compute::vector<uint_> vector_key(size_ctr, m_context);
        vector_key.assign(size_ctr, 0, queue);
        kernel rng_kernel = m_program.create_kernel("generate_rng");

        rng_kernel.set_arg(0, first_ctr.get_buffer());
        rng_kernel.set_arg(1, vector_key);
        size_t offset = 0;

        for(;;){
            size_t count = 0;
            size_t size = size_ctr/2;
            if(size > threads){
                count = threads;
            }
            else {
                count = size;
            }
            rng_kernel.set_arg(2, static_cast<const uint_>(offset));
            queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

            offset += count;

            if(offset >= size){
                break;
            }

        }
    }
开发者ID:jpola,项目名称:compute,代码行数:33,代码来源:threefry_engine.hpp

示例2: fill

    void fill(OutputIterator first, OutputIterator last, command_queue &queue)
    {
        const buffer &buffer = first.get_buffer();
        const size_t size = detail::iterator_range_size(first, last);

        kernel fill_kernel(m_program, "fill");
        fill_kernel.set_arg(0, m_state_buffer);
        fill_kernel.set_arg(1, buffer);

        size_t p = 0;

        for(;;){
            size_t count = 0;
            if(size - p >= n)
                count = n;
            else
                count = size - p;

            fill_kernel.set_arg(2, static_cast<uint_>(p));
            queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

            p += n;

            if(p >= size)
                break;

            generate_state(queue);
        }
    }
开发者ID:sajis997,项目名称:compute,代码行数:29,代码来源:mersenne_twister.hpp

示例3: generate

    void generate(OutputIterator first, OutputIterator last, command_queue &queue)
    {
        const size_t size = detail::iterator_range_size(first, last);

        kernel fill_kernel(m_program, "fill");
        fill_kernel.set_arg(0, m_state_buffer);
        fill_kernel.set_arg(2, first.get_buffer());

        size_t offset = 0;
        size_t &p = m_state_index;

        for(;;){
            size_t count = 0;
            if(size > n){
                count = n;
            }
            else {
                count = size;
            }
            fill_kernel.set_arg(1, static_cast<const uint_>(p));
            fill_kernel.set_arg(3, static_cast<const uint_>(offset));
            queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

            p += count;
            offset += count;

            if(offset >= size){
                break;
            }

            generate_state(queue);
            p = 0;
        }
    }
开发者ID:huiyongsun,项目名称:compute,代码行数:34,代码来源:mersenne_twister_engine.hpp

示例4: generate

    void generate(OutputIterator first, OutputIterator last, command_queue &queue)
    {
        size_t size = detail::iterator_range_size(first, last);

        kernel fill_kernel(m_program, "fill");
        fill_kernel.set_arg(1, m_multiplicands);
        fill_kernel.set_arg(2, first.get_buffer());

        size_t offset = 0;

        for(;;){
            size_t count = 0;
            if(size > threads){
                count = threads;
            }
            else {
                count = size;
            }
            fill_kernel.set_arg(0, static_cast<const uint_>(m_seed));
            fill_kernel.set_arg(3, static_cast<const uint_>(offset));
            queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

            offset += count;

            if(offset >= size){
                break;
            }

            update_seed(queue);
        }
    }
开发者ID:huiyongsun,项目名称:compute,代码行数:31,代码来源:linear_congruential_engine.hpp

示例5: binary_find

inline InputIterator binary_find(InputIterator first,
                                 InputIterator last,
                                 UnaryPredicate predicate,
                                 command_queue &queue = system::default_queue())
{
    const device &device = queue.get_device();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    const std::string cache_key = "__boost_binary_find";

    size_t find_if_limit = 128;
    size_t threads = parameters->get(cache_key, "tpb", 128);
    size_t count = iterator_range_size(first, last);

    InputIterator search_first = first;
    InputIterator search_last = last;

    scalar<uint_> index(queue.get_context());

    // construct and compile binary_find kernel
    binary_find_kernel<InputIterator, UnaryPredicate>
        binary_find_kernel(search_first, search_last, predicate);
    ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context());

    // set buffer for index
    kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer());

    while(count > find_if_limit) {
        index.write(static_cast<uint_>(count), queue);

        // set block and run binary_find kernel
        uint_ block = static_cast<uint_>((count - 1)/(threads - 1));
        kernel.set_arg(binary_find_kernel.m_block_arg, block);
        queue.enqueue_1d_range_kernel(kernel, 0, threads, 0);

        size_t i = index.read(queue);

        if(i == count) {
            search_first = search_last - ((count - 1)%(threads - 1));
            break;
        } else {
            search_last = search_first + i;
            search_first = search_last - ((count - 1)/(threads - 1));
        }

        // Make sure that first and last stay within the input range
        search_last = (std::min)(search_last, last);
        search_last = (std::max)(search_last, first);

        search_first = (std::max)(search_first, first);
        search_first = (std::min)(search_first, last);

        count = iterator_range_size(search_first, search_last);
    }

    return find_if(search_first, search_last, predicate, queue);
}
开发者ID:2bbb,项目名称:compute,代码行数:59,代码来源:binary_find.hpp

示例6: initial_reduce

inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}
开发者ID:Fantasticer,项目名称:arangodb,代码行数:54,代码来源:reduce_on_gpu.hpp

示例7: compile

    void exec_1d(command_queue &queue,
                 size_t global_work_offset,
                 size_t global_work_size)
    {
        const context &context = queue.get_context();

        ::boost::compute::kernel kernel = compile(context);

        queue.enqueue_1d_range_kernel(kernel,
                                      global_work_offset,
                                      global_work_size);
    }
开发者ID:d-meiser,项目名称:compute,代码行数:12,代码来源:meta_kernel.hpp

示例8: scan_on_cpu

inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  command_queue &queue)
{
    if(first == last){
        return result;
    }

    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();

    // create scan kernel
    meta_kernel k("scan_on_cpu");
    k.add_arg<ulong_>("n");
    k <<
        k.decl<input_type>("sum") << " = 0;\n" <<
        "for(ulong i = 0; i < n; i++){\n" <<
        k.decl<const input_type>("x") << " = "
            << first[k.var<ulong_>("i")] << ";\n";

    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "    sum = sum + x;\n";

    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "}\n";

    // compile scan kernel
    kernel scan_kernel = k.compile(context);

    // setup kernel arguments
    size_t n = detail::iterator_range_size(first, last);
    scan_kernel.set_arg<ulong_>(0, n);

    // execute the kernel
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // return iterator pointing to the end of the result range
    return result + n;
}
开发者ID:Quanteek,项目名称:compute,代码行数:51,代码来源:scan_on_cpu.hpp

示例9: pp_floor

inline InputIterator pp_floor(InputIterator first,
                              InputIterator last,
                              ValueType value,
                              command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }
    const context &context = queue.get_context();

    detail::meta_kernel k("pp_floor");
    size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
    size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value");
    atomic_max<int_> atomic_max_int;

    k << k.decl<const int_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("cur_value") << "="
      <<     first[k.var<const int_>("i")] << ";\n"
      << "if(cur_value >= " << first[k.expr<int_>("*index")]
      << "      && cur_value < value){\n"
      << "    " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<int_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    index.write(static_cast<int_>(0), queue);

    kernel.set_arg(value_arg, value);

    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    int result = static_cast<int>(index.read(queue));
    return first + result;
}
开发者ID:alanwoolley,项目名称:innoextract-android,代码行数:40,代码来源:prev_permutation.hpp

示例10: prev_permutation_helper

inline InputIterator prev_permutation_helper(InputIterator first,
                                             InputIterator last,
                                             command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0 || count == 1){
        return last;
    }
    count = count - 1;
    const context &context = queue.get_context();

    detail::meta_kernel k("prev_permutation");
    size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
    atomic_max<int_> atomic_max_int;

    k << k.decl<const int_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("cur_value") << "="
      <<     first[k.var<const int_>("i")] << ";\n"
      << k.decl<const value_type>("next_value") << "="
      <<     first[k.expr<const int_>("i+1")] << ";\n"
      << "if(cur_value > next_value){\n"
      << "    " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<int_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    index.write(static_cast<int_>(-1), queue);

    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    int result = static_cast<int>(index.read(queue));
    if(result == -1) return last;
    else return first + result;
}
开发者ID:alanwoolley,项目名称:innoextract-android,代码行数:39,代码来源:prev_permutation.hpp

示例11: find_end_helper

inline InputIterator find_end_helper(InputIterator first,
                                     InputIterator last,
                                     UnaryPredicate predicate,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const context &context = queue.get_context();

    detail::meta_kernel k("find_end");
    size_t index_arg = k.add_arg<int *>("__global", "index");
    atomic_max<int_> atomic_max_int;

    k << k.decl<const int_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("value") << "="
      <<     first[k.var<const int_>("i")] << ";\n"
      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
      << "    " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<int_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    index.write(static_cast<int_>(-1), queue);

    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    int result = static_cast<int>(index.read(queue));
    if(result == -1) return last;
    else return first + result;
}
开发者ID:BeauJoh,项目名称:compute,代码行数:38,代码来源:find_end.hpp

示例12: block_insertion_sort

inline void block_insertion_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const size_t block_size,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    (void) values_first;

    typedef typename std::iterator_traits<KeyIterator>::value_type K;
    typedef typename std::iterator_traits<ValueIterator>::value_type T;

    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_size_arg = k.add_arg<uint_>("block_size");

    k <<
        k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
        k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<

        // block insertion sort (stable)
        "for(uint i = start+1; i < end; i++){\n" <<
        "    " << k.decl<const K>("key") << " = " <<
                  keys_first[k.var<uint_>("i")] << ";\n";
    if(sort_by_key){
        k <<
        "    " << k.decl<const T>("value") << " = " <<
                  values_first[k.var<uint_>("i")] << ";\n";
    }
    k <<
        "    uint pos = i;\n" <<
        "    while(pos > start && " <<
                   compare(k.var<const K>("key"),
                           keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
        "        " << keys_first[k.var<uint_>("pos")] << " = " <<
                      keys_first[k.var<uint_>("pos-1")] << ";\n";
    if(sort_by_key){
        k <<
        "        " << values_first[k.var<uint_>("pos")] << " = " <<
                      values_first[k.var<uint_>("pos-1")] << ";\n";
    }
    k <<
        "        pos--;\n" <<
        "    }\n" <<
        "    " << keys_first[k.var<uint_>("pos")] << " = key;\n";
    if(sort_by_key) {
        k <<
        "    " << values_first[k.var<uint_>("pos")] << " = value;\n";
    }
    k <<
        "}\n"; // block insertion sort

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size));
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
}
开发者ID:2bbb,项目名称:compute,代码行数:61,代码来源:merge_sort_on_cpu.hpp

示例13: scan_on_cpu

inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    if(first == last){
        return result;
    }

    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();

    // create scan kernel
    meta_kernel k("scan_on_cpu");

    // Arguments
    size_t n_arg = k.add_arg<ulong_>("n");
    size_t init_arg = k.add_arg<output_type>("initial_value");

    if(!exclusive){
        k <<
            k.decl<const ulong_>("start_idx") << " = 1;\n" <<
            k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
            result[0] << " = sum;\n";
    }
    else {
        k <<
            k.decl<const ulong_>("start_idx") << " = 0;\n" <<
            k.decl<output_type>("sum") << " = initial_value;\n";
    }

    k <<
        "for(ulong i = start_idx; i < n; i++){\n" <<
        k.decl<const input_type>("x") << " = "
            << first[k.var<ulong_>("i")] << ";\n";

    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "    sum = "
        << op(k.var<output_type>("sum"), k.var<output_type>("x"))
        << ";\n";

    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "}\n";

    // compile scan kernel
    kernel scan_kernel = k.compile(context);

    // setup kernel arguments
    size_t n = detail::iterator_range_size(first, last);
    scan_kernel.set_arg<ulong_>(n_arg, n);
    scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));

    // execute the kernel
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // return iterator pointing to the end of the result range
    return result + n;
}
开发者ID:2bbb,项目名称:compute,代码行数:71,代码来源:scan_on_cpu.hpp

示例14: reduce

size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
开发者ID:junmuz,项目名称:compute,代码行数:88,代码来源:reduce.hpp

示例15: find_extrema_with_reduce


//.........这里部分代码省略.........
            k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#endif\n" <<

            // Comparison between currently best element (acc) and next element
            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "compare_result = " << compare(k.var<input_type>("next"),
                                           k.var<input_type>("acc")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("acc"),
                        k.var<input_type>("next")) << ";\n" <<
            "# endif\n" <<
            "#else\n" <<
            "compare_result = " << compare(k.var<input_type>("acc"),
                                           k.var<input_type>("next")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("next"),
                        k.var<input_type>("acc")) << ";\n" <<
            "# endif\n" <<
            "#endif\n" <<

            // save the winner
            "acc = compare_result ? acc : next;\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "acc_idx = compare_result ? " <<
                "acc_idx : " <<
                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
            "#else\n" <<
            "acc_idx = compare_result ? acc_idx : idx;\n" <<
            "#endif\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n\n" <<

        // Work item local id
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "block[lid] = acc;\n" <<
        "block_idx[lid] = acc_idx;\n" <<
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        k.decl<uint_>("group_offset") <<
            " = count - (get_local_size(0) * get_group_id(0));\n\n";

    k <<
        "#pragma unroll\n"
        "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
             "offset = offset / 2) {\n" <<
             "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
                 k.decl<input_type>("mine") << " = block[lid];\n" <<
                 k.decl<input_type>("other") << " = block[lid+offset];\n" <<
                 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
                 "compare_result = " << compare(k.var<input_type>("other"),
                                                k.var<input_type>("mine")) << ";\n" <<
                 "equal = !compare_result && !" <<
                     compare(k.var<input_type>("mine"),
                             k.var<input_type>("other")) << ";\n" <<
                 "#else\n" <<
                 "compare_result = " << compare(k.var<input_type>("mine"),
                                                k.var<input_type>("other")) << ";\n" <<
                 "equal = !compare_result && !" <<
                     compare(k.var<input_type>("other"),
                             k.var<input_type>("mine")) << ";\n" <<
                 "#endif\n" <<
                 "block[lid] = compare_result ? mine : other;\n" <<
                 k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
                 k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
                 "block_idx[lid] = compare_result ? " <<
                     "mine_idx : " <<
                     "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
             "}\n"
             "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "}\n\n" <<

         // write block result to global output
        "if(lid == 0){\n" <<
            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
        "}";

    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    if(use_input_idx){
        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
    }

    kernel kernel = k.compile(context, options);

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_groups_no * work_group_size,
                                  work_group_size);
}
开发者ID:2bbb,项目名称:compute,代码行数:101,代码来源:find_extrema_with_reduce.hpp


注:本文中的command_queue::enqueue_1d_range_kernel方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。