本文整理汇总了C++中command_queue类的典型用法代码示例。如果您正苦于以下问题:C++ command_queue类的具体用法?C++ command_queue怎么用?C++ command_queue使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了command_queue类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: bitonic_block_sort
inline size_t bitonic_block_sort(KeyIterator keys_first,
ValueIterator values_first,
Compare compare,
const size_t count,
const bool sort_by_key,
command_queue &queue)
{
typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
meta_kernel k("bitonic_block_sort");
size_t count_arg = k.add_arg<const uint_>("count");
size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
size_t local_vals_arg = 0;
if(sort_by_key) {
local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
}
k <<
// Work item global and local ids
k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
k.decl<const uint_>("lid") << " = get_local_id(0);\n";
// declare my_key and my_value
k <<
k.decl<key_type>("my_key") << ";\n";
// Instead of copying values (my_value) in local memory with keys
// we save local index (uchar) and copy my_value at the end at
// final index. This saves local memory.
if(sort_by_key)
{
k <<
k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
}
// load key
k <<
"if(gid < count) {\n" <<
k.var<key_type>("my_key") << " = " <<
keys_first[k.var<const uint_>("gid")] << ";\n" <<
"}\n";
// load key and index to local memory
k <<
"lkeys[lid] = my_key;\n";
if(sort_by_key)
{
k <<
"lidx[lid] = my_index;\n";
}
k <<
k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";
// When work group size is a power of 2 bitonic sorter can be used;
// otherwise, slower odd-even sort is used.
k <<
// check if n is power of 2
"if(((n != 0) && ((n & (~n + 1)) == n))) {\n";
// bitonic sort, not stable
k <<
// wait for keys and vals to be stored in local memory
"barrier(CLK_LOCAL_MEM_FENCE);\n" <<
"#pragma unroll\n" <<
"for(" <<
k.decl<uint_>("length") << " = 1; " <<
"length < n; " <<
"length <<= 1" <<
") {\n" <<
// direction of sort: false -> asc, true -> desc
k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<
"for(" <<
k.decl<uint_>("k") << " = length; " <<
"k > 0; " <<
"k >>= 1" <<
") {\n" <<
// sibling to compare with my key
k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
k.decl<bool>("compare") << " = " <<
compare(k.var<key_type>("sibling_key"),
k.var<key_type>("my_key")) << ";\n" <<
k.decl<bool>("equal") << " = !(compare || " <<
compare(k.var<key_type>("my_key"),
k.var<key_type>("sibling_key")) << ");\n" <<
k.decl<bool>("swap") <<
" = compare ^ (sibling_idx < lid) ^ direction;\n" <<
"swap = equal ? false : swap;\n" <<
"my_key = swap ? sibling_key : my_key;\n";
if(sort_by_key)
{
k <<
"my_index = swap ? lidx[sibling_idx] : my_index;\n";
}
k <<
//.........这里部分代码省略.........
示例2: find_extrema_with_reduce
InputIterator find_extrema_with_reduce(InputIterator first,
InputIterator last,
::boost::compute::less<
typename std::iterator_traits<
InputIterator
>::value_type
>
compare,
const bool find_minimum,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const context &context = queue.get_context();
const device &device = queue.get_device();
// Getting information about used queue and device
const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
const size_t count = detail::iterator_range_size(first, last);
std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+ type_name<input_type>();
// load parameters
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
// get preferred work group size and preferred number
// of work groups per compute unit
size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
// calculate work group size and number of work groups
work_group_size = (std::min)(max_work_group_size, work_group_size);
size_t work_groups_no = compute_units_no * work_groups_per_cu;
work_groups_no = (std::min)(
work_groups_no,
static_cast<size_t>(std::ceil(float(count) / work_group_size))
);
// phase I: finding candidates for extremum
// device buffers for extremum candidates and their indices
// each work-group computes its candidate
// zero-copy buffers are used to eliminate copying data back to host
vector<input_type, ::boost::compute::pinned_allocator<input_type> >
candidates(work_groups_no, context);
vector<uint_, ::boost::compute::pinned_allocator <uint_> >
candidates_idx(work_groups_no, context);
// finding candidates for first extremum and their indices
find_extrema_with_reduce(
first, count, candidates.begin(), candidates_idx.begin(),
work_groups_no, work_group_size, compare, find_minimum, queue
);
// phase II: finding extremum from among the candidates
// mapping candidates and their indices to host
input_type* candidates_host_ptr =
static_cast<input_type*>(
queue.enqueue_map_buffer(
candidates.get_buffer(), command_queue::map_read,
0, work_groups_no * sizeof(input_type)
)
);
uint_* candidates_idx_host_ptr =
static_cast<uint_*>(
queue.enqueue_map_buffer(
candidates_idx.get_buffer(), command_queue::map_read,
0, work_groups_no * sizeof(uint_)
)
);
input_type* i = candidates_host_ptr;
uint_* idx = candidates_idx_host_ptr;
uint_* extremum_idx = idx;
input_type extremum = *candidates_host_ptr;
i++; idx++;
// find extremum (serial) from among the candidates on host
if(!find_minimum) {
while(idx != (candidates_idx_host_ptr + work_groups_no)) {
input_type next = *i;
bool compare_result = next > extremum;
bool equal = next == extremum;
extremum = compare_result ? next : extremum;
extremum_idx = compare_result ? idx : extremum_idx;
extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
idx++, i++;
}
}
else {
while(idx != (candidates_idx_host_ptr + work_groups_no)) {
input_type next = *i;
bool compare_result = next < extremum;
//.........这里部分代码省略.........
示例3: reduce
/// Reduces \p count elements starting at \p first with \p function and
/// stores one partial result per block in the buffer behind \p result.
///
/// Every full block covers 2 * \p block_size input values and is reduced in
/// parallel by the "block_reduce" kernel. Input values that do not fill a
/// complete block are folded serially by the "extra_serial_reduce" kernel,
/// whose result is stored right after the results of the full blocks.
///
/// \param first      iterator to the first input value
/// \param count      number of input values
/// \param result     iterator whose buffer receives the partial results
/// \param block_size work-group size of the parallel kernel
/// \param function   binary function combining two values
/// \param queue      command queue the kernels are enqueued on
///
/// \return the number of partial results produced (full blocks plus one
///         if there were leftover values)
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();

    // number of blocks that are completely filled with input values
    const size_t block_count = count / 2 / block_size;

    // index of the first value that is not covered by a full block
    const size_t last_block_start = block_count * block_size * 2;
    const bool has_leftovers = last_block_start < count;

    // Exact integer equivalent of ceil(count / (2 * block_size)). Doing this
    // in single-precision floating point loses precision once count exceeds
    // 2^24 and can under-report the number of produced results.
    const size_t total_block_count = block_count + (has_leftovers ? 1 : 0);

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            " uint mask = (i << 1) - 1;\n" <<
            " if((lid & mask) == 0){\n" <<
            " block[lid] = " <<
                function(k.expr<input_type>("block[lid]"),
                         k.expr<input_type>("block[lid+i]")) << ";\n" <<
            " }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            " output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(has_leftovers){
        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            " result = " <<
                function(k.var<result_type>("result"),
                         first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        // the leftover result goes into the slot following the full blocks
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
示例4: get_context_id
/// Looks up the context the queue \p q belongs to and returns the raw
/// handle obtained from its get() accessor.
inline context_id get_context_id(const command_queue &q) {
    auto &&ctx = q.get_context();
    return ctx.get();
}
示例5: inplace_reduce
inline void inplace_reduce(Iterator first,
Iterator last,
BinaryFunction function,
command_queue &queue)
{
typedef typename
std::iterator_traits<Iterator>::value_type
value_type;
size_t input_size = iterator_range_size(first, last);
if(input_size < 2){
return;
}
const context &context = queue.get_context();
size_t block_size = 64;
size_t values_per_thread = 8;
size_t block_count = input_size / (block_size * values_per_thread);
if(block_count * block_size * values_per_thread != input_size)
block_count++;
vector<value_type> output(block_count, context);
meta_kernel k("inplace_reduce");
size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
size_t input_size_arg = k.add_arg<const uint_>("input_size");
size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
k <<
"const uint gid = get_global_id(0);\n" <<
"const uint lid = get_local_id(0);\n" <<
"const uint values_per_thread =\n"
<< uint_(values_per_thread) << ";\n" <<
// thread reduce
"const uint index = gid * values_per_thread;\n" <<
"if(index < input_size){\n" <<
k.decl<value_type>("sum") << " = input[index];\n" <<
"for(uint i = 1;\n" <<
"i < values_per_thread && (index + i) < input_size;\n" <<
"i++){\n" <<
" sum = " <<
function(k.var<value_type>("sum"),
k.var<value_type>("input[index+i]")) << ";\n" <<
"}\n" <<
"scratch[lid] = sum;\n" <<
"}\n" <<
// local reduce
"for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
" barrier(CLK_LOCAL_MEM_FENCE);\n" <<
" uint mask = (i << 1) - 1;\n" <<
" uint next_index = (gid + i) * values_per_thread;\n"
" if((lid & mask) == 0 && next_index < input_size){\n" <<
" scratch[lid] = " <<
function(k.var<value_type>("scratch[lid]"),
k.var<value_type>("scratch[lid+i]")) << ";\n" <<
" }\n" <<
"}\n" <<
// write output for block
"if(lid == 0){\n" <<
" output[get_group_id(0)] = scratch[0];\n" <<
"}\n"
;
const buffer *input_buffer = &first.get_buffer();
const buffer *output_buffer = &output.get_buffer();
kernel kernel = k.compile(context);
while(input_size > 1){
kernel.set_arg(input_arg, *input_buffer);
kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
kernel.set_arg(output_arg, *output_buffer);
kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));
queue.enqueue_1d_range_kernel(kernel,
0,
block_count * block_size,
block_size);
input_size =
static_cast<size_t>(
std::ceil(float(input_size) / (block_size * values_per_thread)
)
);
block_count = input_size / (block_size * values_per_thread);
if(block_count * block_size * values_per_thread != input_size)
block_count++;
std::swap(input_buffer, output_buffer);
}
if(input_buffer != &first.get_buffer()){
::boost::compute::copy(output.begin(),
output.begin() + 1,
first,
//.........这里部分代码省略.........
示例6: duplicate_queue
/// Builds a fresh command queue that uses the same context, device and
/// properties as the queue \p q.
inline command_queue duplicate_queue(const command_queue &q) {
    auto &&ctx   = q.get_context();
    auto &&dev   = q.get_device();
    auto &&props = q.get_properties();

    return command_queue(ctx, dev, props);
}
示例7: get_device
/// Gives back the device that the queue \p q is associated with.
inline device get_device(const command_queue &q) {
    device result = q.get_device();
    return result;
}
示例8: scan_on_cpu
/// Performs a scan of the range [\p first, \p last) with a single work-item
/// and writes the results starting at \p result.
///
/// \param first     iterator to the first input value
/// \param last      iterator one past the last input value
/// \param result    iterator to the first output value
/// \param exclusive when true the running sum is stored before the current
///                  value is added and starts from \p init; when false it
///                  is stored after the value is added and starts from the
///                  first input value
/// \param init      initial value passed to the kernel as "initial_value"
/// \param op        binary operator combining the running sum with a value
/// \param queue     command queue the kernel is enqueued on
///
/// \return \p result advanced by the number of scanned values
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    // nothing to do for an empty range
    if(first == last){
        return result;
    }

    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    const size_t n = detail::iterator_range_size(first, last);

    meta_kernel k("scan_on_cpu");

    // kernel arguments
    const size_t n_arg = k.add_arg<ulong_>("n");
    const size_t init_arg = k.add_arg<output_type>("initial_value");

    // prologue: where the loop starts and what the running sum starts from
    if(exclusive){
        k << k.decl<const ulong_>("start_idx") << " = 0;\n";
        k << k.decl<output_type>("sum") << " = initial_value;\n";
    }
    else {
        k << k.decl<const ulong_>("start_idx") << " = 1;\n";
        k << k.decl<output_type>("sum") << " = " << first[0] << ";\n";
        k << result[0] << " = sum;\n";
    }

    // main loop: load the next value
    k << "for(ulong i = start_idx; i < n; i++){\n";
    k << k.decl<const input_type>("x") << " = "
      << first[k.var<ulong_>("i")] << ";\n";

    // exclusive scan stores the sum before it is updated ...
    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << " sum = "
      << op(k.var<output_type>("sum"), k.var<output_type>("x"))
      << ";\n";

    // ... inclusive scan stores it afterwards
    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "}\n";

    // build the kernel and bind its arguments
    kernel scan_kernel = k.compile(context);
    scan_kernel.set_arg<ulong_>(n_arg, n);
    scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));

    // run with a global and local size of one
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // iterator pointing to the end of the result range
    return result + n;
}
示例9: radix_sort_impl
inline void radix_sort_impl(const buffer_iterator<T> first,
const buffer_iterator<T> last,
const buffer_iterator<T2> values_first,
const bool ascending,
command_queue &queue)
{
typedef T value_type;
typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;
const device &device = queue.get_device();
const context &context = queue.get_context();
// if we have a valid values iterator then we are doing a
// sort by key and have to set up the values buffer
bool sort_by_key = (values_first.get_buffer().get() != 0);
// load (or create) radix sort program
std::string cache_key =
std::string("__boost_radix_sort_") + type_name<value_type>();
if(sort_by_key){
cache_key += std::string("_with_") + type_name<T2>();
}
boost::shared_ptr<program_cache> cache =
program_cache::get_global_cache(context);
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
// sort parameters
const uint_ k = parameters->get(cache_key, "k", 4);
const uint_ k2 = 1 << k;
const uint_ block_size = parameters->get(cache_key, "tpb", 128);
// sort program compiler options
std::stringstream options;
options << "-DK_BITS=" << k;
options << " -DT=" << type_name<sort_type>();
options << " -DBLOCK_SIZE=" << block_size;
if(boost::is_floating_point<value_type>::value){
options << " -DIS_FLOATING_POINT";
}
if(boost::is_signed<value_type>::value){
options << " -DIS_SIGNED";
}
if(sort_by_key){
options << " -DSORT_BY_KEY";
options << " -DT2=" << type_name<T2>();
options << enable_double<T2>();
}
if(ascending){
options << " -DASC";
}
// load radix sort program
program radix_sort_program = cache->get_or_build(
cache_key, options.str(), radix_sort_source, context
);
kernel count_kernel(radix_sort_program, "count");
kernel scan_kernel(radix_sort_program, "scan");
kernel scatter_kernel(radix_sort_program, "scatter");
size_t count = detail::iterator_range_size(first, last);
uint_ block_count = static_cast<uint_>(count / block_size);
if(block_count * block_size != count){
block_count++;
}
// setup temporary buffers
vector<value_type> output(count, context);
vector<T2> values_output(sort_by_key ? count : 0, context);
vector<uint_> offsets(k2, context);
vector<uint_> counts(block_count * k2, context);
const buffer *input_buffer = &first.get_buffer();
uint_ input_offset = static_cast<uint_>(first.get_index());
const buffer *output_buffer = &output.get_buffer();
uint_ output_offset = 0;
const buffer *values_input_buffer = &values_first.get_buffer();
uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
const buffer *values_output_buffer = &values_output.get_buffer();
uint_ values_output_offset = 0;
for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
// write counts
count_kernel.set_arg(0, *input_buffer);
count_kernel.set_arg(1, input_offset);
count_kernel.set_arg(2, static_cast<uint_>(count));
count_kernel.set_arg(3, counts);
count_kernel.set_arg(4, offsets);
count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
count_kernel.set_arg(6, i * k);
//.........这里部分代码省略.........
示例10: scan_on_cpu
inline OutputIterator scan_on_cpu(InputIterator first,
InputIterator last,
OutputIterator result,
bool exclusive,
T init,
BinaryOperator op,
command_queue &queue)
{
typedef typename
std::iterator_traits<InputIterator>::value_type input_type;
typedef typename
std::iterator_traits<OutputIterator>::value_type output_type;
const context &context = queue.get_context();
const device &device = queue.get_device();
const size_t compute_units = queue.get_device().compute_units();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
std::string cache_key =
"__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
// for inputs smaller than serial_scan_threshold
// serial_scan algorithm is used
uint_ serial_scan_threshold =
parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
serial_scan_threshold =
(std::max)(serial_scan_threshold, uint_(compute_units));
size_t count = detail::iterator_range_size(first, last);
if(count == 0){
return result;
}
else if(count < serial_scan_threshold) {
return serial_scan(first, last, result, exclusive, init, op, queue);
}
buffer block_partial_sums(context, sizeof(output_type) * compute_units );
// create scan kernel
meta_kernel k("scan_on_cpu_block_scan");
// Arguments
size_t count_arg = k.add_arg<uint_>("count");
size_t init_arg = k.add_arg<output_type>("initial_value");
size_t block_partial_sums_arg =
k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
k <<
"uint block = " <<
"(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
"uint index = get_global_id(0) * block;\n" <<
"uint end = min(count, index + block);\n";
if(!exclusive){
k <<
k.decl<output_type>("sum") << " = " <<
first[k.var<uint_>("index")] << ";\n" <<
result[k.var<uint_>("index")] << " = sum;\n" <<
"index++;\n";
}
else {
k <<
k.decl<output_type>("sum") << ";\n" <<
"if(index == 0){\n" <<
"sum = initial_value;\n" <<
"}\n" <<
"else {\n" <<
"sum = " << first[k.var<uint_>("index")] << ";\n" <<
"index++;\n" <<
"}\n";
}
k <<
"while(index < end){\n" <<
// load next value
k.decl<const input_type>("value") << " = "
<< first[k.var<uint_>("index")] << ";\n";
if(exclusive){
k <<
"if(get_global_id(0) == 0){\n" <<
result[k.var<uint_>("index")] << " = sum;\n" <<
"}\n";
}
k <<
"sum = " << op(k.var<output_type>("sum"),
k.var<output_type>("value")) << ";\n";
if(!exclusive){
k <<
"if(get_global_id(0) == 0){\n" <<
result[k.var<uint_>("index")] << " = sum;\n" <<
"}\n";
}
k <<
"index++;\n" <<
"}\n" << // end while
//.........这里部分代码省略.........
示例11: duplicate_queue
/// Builds a fresh command queue from the context, device and flags of the
/// queue \p q.
inline command_queue duplicate_queue(const command_queue &q) {
    auto &&ctx   = q.context();
    auto &&dev   = q.device();
    auto &&flags = q.flags();

    return command_queue(ctx, dev, flags);
}
示例12: select_context
/// Makes the CUDA context of the queue \p q the current one for the calling
/// CPU thread by invoking set_current() on it.
inline void select_context(const command_queue &q) {
    auto &&ctx = q.context();
    ctx.set_current();
}
示例13: pad_vector
/// Copies the elements of \p v into a vector of uint and resizes that vector
/// to the size of the device's max_block_size(), using \p x as the value of
/// any elements added by the resize.
static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
    const auto target_size = q.device().max_block_size().size();

    std::vector<uint> padded { v.begin(), v.end() };
    padded.resize(target_size, x);

    return padded;
}
示例14: build_sources
/// Create and build a program from source string.
/**
 * The source is compiled to PTX by invoking nvcc, and the resulting file is
 * loaded as a CUDA module. The PTX file is stored under a path derived from
 * a SHA1 hash of the source, the device name, the compile options and the
 * compute capability; if that file already exists, compilation is skipped.
 *
 * \param queue   command queue whose context and device are used
 * \param source  kernel source code
 * \param options extra options passed to nvcc in front of the options
 *                returned by get_compile_options(queue)
 *
 * \return program wrapping the loaded module
 *
 * \throws std::runtime_error if the source file cannot be written or the
 *         nvcc invocation fails
 */
inline vex::backend::program build_sources(
        const command_queue &queue, const std::string &source,
        const std::string &options = ""
        )
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    std::string compile_options = options + " " + get_compile_options(queue);

    queue.context().set_current();

    // Compute capability as a "<major><minor>" string; it is part of the
    // cache key and of the -arch option.
    auto cc = queue.device().compute_capability();
    std::ostringstream ccstr;
    ccstr << std::get<0>(cc) << std::get<1>(cc);

    sha1_hasher sha1;
    sha1.process(source)
        .process(queue.device().name())
        .process(compile_options)
        .process(ccstr.str())
        ;

    std::string hash = static_cast<std::string>(sha1);

    std::string basename = program_binaries_path(hash, true) + "kernel";
    std::string ptxfile  = basename + ".ptx";

    if ( !boost::filesystem::exists(ptxfile) ) {
        // Write source to a .cu file
        std::string cufile = basename + ".cu";

        {
            std::ofstream f(cufile);
            f << source;
            // Close explicitly so that open, write and flush errors are
            // all reflected in the stream state before it is checked.
            f.close();
            if (!f)
                throw std::runtime_error("Failed to write " + cufile);
        }

        // Compile the source to ptx. The file names are quoted so that
        // paths containing whitespace survive the shell's word splitting.
        std::ostringstream cmdline;
        cmdline
            << "nvcc -ptx -O3"
            << " -arch=sm_" << ccstr.str()
            << " " << compile_options
            << " -o \"" << ptxfile << "\" \"" << cufile << "\"";

        if (0 != system(cmdline.str().c_str()) ) {
#ifndef VEXCL_SHOW_KERNELS
            // The source has not been shown yet; print it to help debugging.
            std::cerr << source << std::endl;
#endif

            vex::detail::print_backtrace();
            throw std::runtime_error("nvcc invocation failed");
        }
    }

    // Load the compiled ptx.
    CUmodule prg;
    cuda_check( cuModuleLoad(&prg, ptxfile.c_str()) );

    return program(queue.context(), prg);
}
示例15: operator
/// Orders two command queues by comparing the values returned by get().
bool operator()(const command_queue &a, const command_queue &b) const {
    auto &&lhs = a.get();
    auto &&rhs = b.get();
    return lhs < rhs;
}