本文整理汇总了C++中OptionParser::getOptionBool方法的典型用法代码示例。如果您正苦于以下问题：C++ OptionParser::getOptionBool方法的具体用法？C++ OptionParser::getOptionBool怎么用？C++ OptionParser::getOptionBool使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类OptionParser的用法示例。
在下文中一共展示了OptionParser::getOptionBool方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: RunBenchmark
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Measures the floating point capability of the device for a variety of
//   combinations of arithmetic operations. The single precision test is
//   run first, followed by the double precision test.
//
// Arguments:
//   op: the options parser / parameter database; the "verbose", "quiet",
//       "passes" and "target" options are read here
//   resultDB: results database, forwarded to RunTest
//
// Returns: nothing
//
// Programmer: Zhi Ying([email protected])
//             Jun Jin([email protected])
//
// Creation: May 23, 2011
//
// Modifications:
//   12/12/12 - Kyle Spafford - Code style and minor integration updates
//
// ****************************************************************************
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool beVerbose = op.getOptionBool("verbose");
    // "quiet" suppresses the progress bar.
    const bool beQuiet = op.getOptionBool("quiet");
    const unsigned int numPasses = op.getOptionInt("passes");
    const int targetDevice = op.getOptionInt("target");

    double repeatFactor = 3;
    cout << "Adjust repeat factor = " << repeatFactor << "\n";

    // Set up the progress bar; it is only displayed when neither "verbose"
    // nor "quiet" was requested.
    const int runCount = 16 * numPasses * 2;
    ProgressBar progress(runCount);
    if (!(beVerbose || beQuiet))
    {
        progress.Show(stdout);
    }

    // Single precision first, then double precision.
    RunTest<float>(resultDB, numPasses, beVerbose, beQuiet,
                   repeatFactor, progress, "-SP", targetDevice);
    RunTest<double>(resultDB, numPasses, beVerbose, beQuiet,
                    repeatFactor, progress, "-DP", targetDevice);

    if (!beVerbose)
    {
        cout << endl;
    }
}
示例2: if
// Entry point of the OpenCL FFT benchmark.
//
// Depending on the options this either dumps data ("dump-sp" / "dump-dp")
// or runs the single precision test followed, when the device reports a
// double precision extension, by the double precision test.
//
//   devcpp, ctxcpp, queuecpp: OpenCL C++ binding objects to run on
//   resultDB: results database, forwarded to runTest / fillResultDB
//   op: the options parser
void
RunBenchmark(cl::Device& devcpp,
             cl::Context& ctxcpp,
             cl::CommandQueue& queuecpp,
             ResultDatabase &resultDB,
             OptionParser &op)
{
    // Unwrap the C++ binding objects into their C API handles.
    // TODO propagate use of C++ bindings
    cl_device_id dev = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    // The FFT needs work groups of at least 64 items; otherwise report that
    // for both precisions and stop.
    if (getMaxWorkGroupSize(dev) < 64) {
        cout << "FFT requires MaxWorkGroupSize of at least 64" << endl;
        fillResultDB("SP-FFT", "MaxWorkGroupSize<64", op, resultDB);
        fillResultDB("DP-FFT", "MaxWorkGroupSize<64", op, resultDB);
        return;
    }

    // Double precision is available through either the Khronos or the AMD
    // extension.
    const bool dpAvailable = checkExtension(dev, "cl_khr_fp64") ||
                             checkExtension(dev, "cl_amd_fp64");

    // Dump modes take precedence over running the tests; "dump-sp" wins if
    // both are given.
    if (op.getOptionBool("dump-sp")) {
        dump<cplxflt>(op);
        return;
    }
    if (op.getOptionBool("dump-dp")) {
        if (dpAvailable) {
            dump<cplxdbl>(op);
        }
        else {
            cout << "dump-dp: no double precision support!\n";
        }
        return;
    }

    // The single precision test always runs.
    runTest<cplxflt>("SP-FFT", dev, ctx, queue, resultDB, op);

    // The double precision test runs only when supported.
    if (!dpAvailable) {
        cout << "DP Not Supported\n";
        fillResultDB("DP-FFT", "DP_Not_Supported", op, resultDB);
        return;
    }
    cout << "DP Supported\n";
    runTest<cplxdbl>("DP-FFT", dev, ctx, queue, resultDB, op);
}
示例3: GPUSetup
// ****************************************************************************
// Function: GPUSetup
//
// Purpose:
//   do the necessary OpenCL setup for GPU part of the test
//
// Arguments:
//   op: the options parser / parameter database
//   mympirank: for printing errors in case of failure
//   mynoderank: this is typically the device ID (the mapping done in main);
//               used as an index into the list given with --device
//
// Returns: 0, both after printing the device information ("infoDevices")
//          and after completing the setup
//
// Notes:
//   The created device, context and queue are stored in the
//   _mpicontention_* objects declared outside this function; they are not
//   released here.
//   If the --device list is empty, or mynoderank does not index into it,
//   a warning is printed and the first listed device is used; with an empty
//   list this falls back to device 0.
//
// Creation: 2009
//
// Modifications:
//
// ****************************************************************************
//
int GPUSetup(OptionParser &op, int mympirank, int mynoderank)
{
    addBenchmarkSpecOptions(op);

    if (op.getOptionBool("infoDevices"))
    {
        OpenCLNodePlatformContainer ndc1;
        ndc1.Print (cout);
        return (0);
    }

    // The device option supports specifying more than one device
    int platform = op.getOptionInt("platform");
    const size_t numDevices = op.getOptionVecInt( "device" ).size();
    int deviceIdx = mynoderank;
    // A negative rank is treated like an out-of-range one (this is what the
    // original signed/unsigned comparison did implicitly).
    if( deviceIdx < 0 || static_cast<size_t>( deviceIdx ) >= numDevices )
    {
        std::ostringstream estr;
        estr << "Warning: not enough devices specified with --device flag for task "
             << mympirank
             << " ( node rank " << mynoderank
             << ") to claim its own device; forcing to use first device ";
        std::cerr << estr.str() << std::endl;
        deviceIdx = 0;
    }

    // Never index into an empty device list: fall back to device 0 instead.
    int device = 0;
    if( numDevices > 0 )
    {
        device = op.getOptionVecInt("device")[deviceIdx];
    }

    // Initialization
    _mpicontention_ocldev = new cl::Device( ListDevicesAndGetDevice(platform, device) );
    std::vector<cl::Device> ctxDevices;
    ctxDevices.push_back( *_mpicontention_ocldev );
    _mpicontention_ocldriver_ctx = new cl::Context( ctxDevices );
    _mpicontention_ocldriver_queue = new cl::CommandQueue( *_mpicontention_ocldriver_ctx, *_mpicontention_ocldev, CL_QUEUE_PROFILING_ENABLE );
    _mpicontention_gpuop = op;
    return 0;
}
示例4: if
// Entry point of the CUDA FFT benchmark.
//
// Depending on the options this either dumps data ("dump-sp" / "dump-dp")
// or runs the single precision test followed by the double precision test.
// When the device lacks double precision, FLT_MAX entries are recorded for
// every double precision result instead.
//
//   resultDB: results database
//   op: the options parser
void
RunBenchmark(ResultDatabase &resultDB, OptionParser &op)
{
    // Double precision requires compute capability 1.3 or anything >= 2.x.
    cudaGetDevice(&fftDevice);
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, fftDevice);
    const bool dpCapable = (props.major >= 2) ||
                           (props.major == 1 && props.minor >= 3);

    // Dump modes take precedence over running the tests; "dump-sp" wins if
    // both are given.
    if (op.getOptionBool("dump-sp")) {
        dump<float2>(op);
        return;
    }
    if (op.getOptionBool("dump-dp")) {
        if (dpCapable) {
            dump<double2>(op);
        }
        else {
            cout << "dump-dp: no double precision support!\n";
        }
        return;
    }

    cout << "Running single precision test" << endl;
    runTest<float2>("SP-FFT", resultDB, op);

    if (dpCapable) {
        cout << "Running double precision test" << endl;
        runTest<double2>("DP-FFT", resultDB, op);
        return;
    }

    cout << "Skipping double precision test" << endl;
    char notSupported[32] = "DP_Not_Supported";
    // resultDB requires neg entry for every possible result, so add one
    // FLT_MAX entry per result name and per pass.
    static const char* const dpResultNames[] = {
        "DP-FFT",
        "DP-FFT_PCIe",
        "DP-FFT_Parity",
        "DP-FFT-INV",
        "DP-FFT-INV_PCIe",
        "DP-FFT-INV_Parity"
    };
    const int numNames = sizeof(dpResultNames) / sizeof(dpResultNames[0]);
    const int numPasses = op.getOptionInt("passes");
    for (int pass = 0; pass < numPasses; ++pass) {
        for (int idx = 0; idx < numNames; ++idx) {
            resultDB.AddResult(dpResultNames[idx], notSupported, "GB/s", FLT_MAX);
        }
    }
}
示例5:
// Calls dump2D<cplxflt> when the "2D" option is set and dump1D<cplxflt>
// otherwise.
//
//   op: the options parser, also forwarded to the dump routine
void
RunBenchmark(OptionParser &op)
{
    // TODO propagate use of C++ bindings
    const bool twoDimensional = op.getOptionBool("2D");
    if (!twoDimensional)
    {
        dump1D<cplxflt>(op);
        return;
    }
    dump2D<cplxflt>(op);
}
示例6: strrchr
// Entry point of the MKL FFT benchmark: runs the single precision test and
// then the double precision test. With the "verbose" option the MKL version
// string is printed first.
//
//   op: the options parser
//   resultDB: results database, forwarded to RunTest
void
RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");
    if (verbose) // print MKL version info
    {
        static char mklver[200];
        MKL_Get_Version_String(mklver,sizeof(mklver));
        // Make sure the buffer is terminated before it is treated as a
        // C string.
        mklver[sizeof(mklver)-1] = 0;

        // Strip trailing blanks. The length bound keeps the scan inside the
        // buffer even when the string consists only of blanks.
        size_t len = strlen(mklver);
        while (len > 0 && mklver[len-1] == ' ')
        {
            mklver[--len] = 0;
        }
        printf("SHOC FFT benchmark using MKL verison %s\n",mklver);
    }
    RunTest<cplxflt>("SP-FFT", resultDB, op);
    RunTest<cplxdbl>("DP-FFT", resultDB, op);
}
示例7: InvalidArgValue
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
StencilFactory<T>* stdStencilFactory = NULL;
Stencil<T>* stdStencil = NULL;
StencilFactory<T>* testStencilFactory = NULL;
Stencil<T>* testStencil = NULL;
try
{
#if defined(PARALLEL)
stdStencilFactory = new MPIHostStencilFactory<T>;
testStencilFactory = new MPICUDAStencilFactory<T>;
#else
stdStencilFactory = new HostStencilFactory<T>;
testStencilFactory = new CUDAStencilFactory<T>;
#endif // defined(PARALLEL)
assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );
// do a sanity check on option values
CheckOptions( opts );
stdStencilFactory->CheckOptions( opts );
testStencilFactory->CheckOptions( opts );
// extract and validate options
std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
if( arrayDims.size() != 2 )
{
cerr << "Dim size: " << arrayDims.size() << "\n";
throw InvalidArgValue( "all overall dimensions must be positive" );
}
if (arrayDims[0] == 0) // User has not specified a custom size
{
int sizeClass = opts.getOptionInt("size");
arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass );
}
long int seed = (long)opts.getOptionInt( "seed" );
bool beVerbose = opts.getOptionBool( "verbose" );
unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );
#if defined(PARALLEL)
unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
unsigned int haloWidth = 1;
#endif // defined(PARALLEL)
float haloVal = (float)opts.getOptionFloat( "haloVal" );
// build a description of this experiment
std::vector<long long> lDims = opts.getOptionVecInt( "lsize" );
assert( lDims.size() == 2 );
std::ostringstream experimentDescriptionStr;
experimentDescriptionStr
<< nIters << ':'
<< arrayDims[0] << 'x' << arrayDims[1] << ':'
<< lDims[0] << 'x' << lDims[1];
unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" );
// compute the expected result on the host
// or read it from a pre-existing file
std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" );
#if defined(PARALLEL)
int cwrank;
MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
if( cwrank == 0 )
{
#endif // defined(PARALLEL)
if( !matrixFilenameBase.empty() )
{
std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n"
<< std::endl;
}
else
{
std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n"
<< "Depending on host capabilities, this may take a while."
<< std::endl;
}
#if defined(PARALLEL)
}
#endif // defined(PARALLEL)
Matrix2D<T> expected( arrayDims[0] + 2*haloWidth,
arrayDims[1] + 2*haloWidth );
Initialize<T> init( seed, haloWidth, haloVal );
bool haveExpectedData = false;
if( ! matrixFilenameBase.empty() )
{
bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
if( readOK )
{
if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) ||
(expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) )
{
//.........这里部分代码省略.........
示例8: main
// ****************************************************************************
// Method: main()
//
// Purpose:
// serial and parallel main for OpenCL level0 benchmarks
//
// Arguments:
// argc, argv
//
// Programmer: SHOC Team
// Creation: The Epoch
//
// Modifications:
// Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010
// Changed the way device selection works. It now defaults to the device
// index corresponding to the process's rank within a node if no devices
// are specified on the command command line, and otherwise, round-robins
// the list of devices among the tasks.
//
// Gabriel Marin, Tue Jun 01 15:38 EST 2010
// Check that we have valid (not NULL) context and queue objects before
// running the benchmarks. Errors inside CreateContextFromSingleDevice or
// CreateCommandQueueForContextAndDevice were not propagated out to the main
// program.
//
// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
// Split timing reports into detailed and summary. For serial code, we
// report all trial values, and for parallel, skip the per-process vals.
// Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
int ret = 0;
try
{
#ifdef PARALLEL
int rank, size;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cout << "MPI Task "<< rank << "/" << size - 1 << " starting....\n";
#endif
OptionParser op;
//Add shared options to the parser
op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use",
'p');
op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd');
op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
op.addOption("infoDevices", OPT_BOOL, "",
"show info for available platforms and devices", 'i');
op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');
addBenchmarkSpecOptions(op);
if (!op.parse(argc, argv))
{
#ifdef PARALLEL
if (rank == 0)
op.usage();
MPI_Finalize();
#else
op.usage();
#endif
return (op.HelpRequested() ? 0 : 1 );
}
if (op.getOptionBool("infoDevices"))
{
#define DEBUG_DEVICE_CONTAINER 0
#ifdef PARALLEL
// execute following code only if I am the process of lowest
// rank on this node
NodeInfo NI;
int mynoderank = NI.nodeRank();
if (mynoderank==0)
{
int nlrrank, nlrsize;
MPI_Comm nlrcomm = NI.getNLRComm();
MPI_Comm_size(nlrcomm, &nlrsize);
MPI_Comm_rank(nlrcomm, &nlrrank);
OpenCLNodePlatformContainer ndc1;
OpenCLMultiNodeContainer localMnc(ndc1);
localMnc.doMerge (nlrrank, nlrsize, nlrcomm);
if (rank==0) // I am the global rank 0, print all configurations
localMnc.Print (cout);
}
#else
OpenCLNodePlatformContainer ndc1;
ndc1.Print (cout);
#if DEBUG_DEVICE_CONTAINER
OpenCLMultiNodeContainer mnc1(ndc1), mnc2;
mnc1.Print (cout);
ostringstream oss;
//.........这里部分代码省略.........
示例9: RunBenchmark
// Times transfers from the MIC device back to the host ("readback") for a
// range of buffer sizes and records speed and time for each size.
//
//   op: the options parser; "verbose", "passes" and "target" are read here
//   resultDB: receives one "ReadbackSpeed" (GB/sec) and one "ReadbackTime"
//             (ms) entry per size and pass
//
// hostMem is declared outside this function; it is allocated here and
// released again before returning.
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");

    // Transfer sizes, in kB
    const int nSizes = 17;
    int sizes[nSizes] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
                         32768, 65536};
    // Number of floats in the largest transfer
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;

    // Host buffer large enough for the largest transfer
    hostMem = (float*)_mm_malloc(numMaxFloats*sizeof(float),ALIGN);
    if (hostMem == NULL)
    {
        cerr << "Couldn't allocate CPU memory! \n";
        cerr << "Test failed." << endl;
        return;
    }

    // Fill the host buffer with a simple repeating pattern.
    for (int i = 0; i < numMaxFloats; i++)
    {
        hostMem[i] = i % 77;
    }

    const unsigned int passes = op.getOptionInt("passes");
    int micdev = op.getOptionInt("target");

    // Allocate the matching buffer on the card (no data is copied).
    #pragma offload target(mic:micdev) \
        nocopy(hostMem:length(numMaxFloats) alloc_if(1) free_if(0) align(ALIGN) )
    {
    }

    for (unsigned int pass = 0; pass < passes; pass++)
    {
        for (int i = 0; i < nSizes; i++)
        {
            // Walk the sizes upwards on even passes and downwards on odd ones.
            const int sizeIndex = ((pass % 2) == 0) ? i : (nSizes - 1) - i;

            // D->H test: an empty offload region with an out() clause, so
            // only the transfer is timed.
            double start = curr_second();
            #pragma offload target(mic:micdev) \
                out(hostMem:length((1024*sizes[sizeIndex]/4)) \
                free_if(0) alloc_if(0) )
            {
            }
            double t = curr_second()-start;

            if (verbose)
            {
                cerr << "Size " << sizes[sizeIndex] << "k took " << t <<
                        " sec\n";
            }

            // kB -> bytes -> GB, divided by the elapsed seconds
            double gbPerSec = (double(sizes[sizeIndex]) * 1024 /
                               (1000. * 1000. * 1000.)) / t;
            char label[256];
            sprintf(label, "% 6dkB", sizes[sizeIndex]);
            resultDB.AddResult("ReadbackSpeed", label, "GB/sec", gbPerSec);
            resultDB.AddResult("ReadbackTime", label, "ms", t*1000);
        }
    }

    // Free memory allocated on the mic
    #pragma offload target(mic:micdev) \
        in(hostMem:length(numMaxFloats) alloc_if(0) )
    {
    }

    // Release the host buffer
    _mm_free(hostMem);
}
示例10: assert
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
StencilFactory<T>* stdStencilFactory = NULL;
Stencil<T>* stdStencil = NULL;
StencilFactory<T>* testStencilFactory = NULL;
Stencil<T>* testStencil = NULL;
//try
{
stdStencilFactory = new HostStencilFactory<T>;
testStencilFactory = new MICStencilFactory<T>;
assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );
// do a sanity check on option values
CheckOptions( opts );
stdStencilFactory->CheckOptions( opts );
testStencilFactory->CheckOptions( opts );
// extract and validate options
std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
if( arrayDims.size() != 2 )
{
cerr << "Dim size: " << arrayDims.size() << "\n";
//throw InvalidArgValue( "all overall dimensions must be positive" );
}
if (arrayDims[0] == 0) // User has not specified a custom size
{
const int probSizes[4] = { 768, 1408, 2048, 4096 };
int sizeClass = opts.getOptionInt("size");
if (!(sizeClass >= 0 && sizeClass < 5))
{
//throw InvalidArgValue( "Size class must be between 1-4" );
}
arrayDims[0] = arrayDims[1] =probSizes[sizeClass - 1];
}
long int seed = (long)opts.getOptionInt( "seed" );
bool beVerbose = opts.getOptionBool( "verbose" );
unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );
#if defined(PARALLEL)
unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
unsigned int haloWidth = 1;
#endif // defined(PARALLEL)
float haloVal = (float)opts.getOptionFloat( "haloVal" );
// build a description of this experiment
std::ostringstream experimentDescriptionStr;
experimentDescriptionStr
<< nIters << ':'
<< arrayDims[0] << 'x' << arrayDims[1] << ':'
<< LROWS << 'x' << LCOLS;
unsigned int nPasses =(unsigned int)opts.getOptionInt( "passes" );
unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
(arrayDims[1] + 2*haloWidth - 2);
unsigned long nflops = npts * 11 * nIters;
cout<<"flops are = "<<nflops<<endl;
// compute the expected result on the host
#if defined(PARALLEL)
int cwrank;
MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
if( cwrank == 0 )
{
#endif // defined(PARALLEL)
std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n"
<< "Depending on host capabilities, this may take a while."
<< std::endl;
#if defined(PARALLEL)
}
#endif // defined(PARALLEL)
Matrix2D<T> exp( arrayDims[0] + 2*haloWidth,
arrayDims[1] + 2*haloWidth );
Initialize<T> init( seed,
haloWidth,
haloVal );
init( exp );
if( beVerbose )
{
std::cout << "initial state:\n" << exp << std::endl;
}
Stencil<T>* stdStencil = stdStencilFactory->BuildStencil( opts );
(*stdStencil)( exp, nIters );
if( beVerbose )
{
std::cout << "expected result:\n" << exp << std::endl;
}
// compute the result on the MIC device
Matrix2D<T> data( arrayDims[0] + 2*haloWidth,
arrayDims[1] + 2*haloWidth );
//.........这里部分代码省略.........
示例11: RunBenchmark
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
// Executes a series of arithmetic benchmarks for OpenCL devices.
// OpenCL kernels are auto-generated based on the values in the
// _benchmark_type structures.
// The benchmark tests throughput for add, multiply, multiply-add and
// multiply+multiply-add series of operations, for 1, 2, 4 and 8
// independent streams..
//
// Arguments:
// ctx: the opencl context to use for the benchmark
// queue: the opencl command queue to issue commands to
// resultDB: results from the benchmark are stored in this db
// op: the options parser (contains input parameters)
//
// Returns: nothing
//
// Programmer: Gabriel Marin
// Creation: June 26, 2009
//
// Modifications:
//
// ****************************************************************************
void RunBenchmark(cl::Device& devcpp,
cl::Context& ctxcpp,
cl::CommandQueue& queuecpp,
ResultDatabase &resultDB,
OptionParser &op)
{
// convert from C++ bindings to C bindings
// TODO propagate use of C++ bindings
cl_device_id id = devcpp();
cl_context ctx = ctxcpp();
cl_command_queue queue = queuecpp();
int npasses = op.getOptionInt("passes");
bool verbose = op.getOptionBool("verbose");
bool quiet = op.getOptionBool("quiet");
int err;
cl_mem mem1;
float *hostMem, *hostMem2;
size_t maxGroupSize = 1;
size_t localWorkSize = 1;
// Seed the random number generator
srand48(8650341L);
// To prevent this benchmark from taking too long to run, we
// calibrate how many repetitions of each test to execute. To do this we
// run one pass through a multiply-add benchmark and then adjust
// the repeat factor based on runtime. Use MulMAdd4 for this.
int aIdx = 0;
float repeatF = 1.0f;
// Find the index of the MAdd4 benchmark
while ((aTests!=0) && (aTests[aIdx].name!=0) &&
strcmp(aTests[aIdx].name,"MAdd4"))
{
aIdx += 1;
}
if (aTests && aTests[aIdx].name) // we found a benchmark with that name
{
struct _benchmark_type temp = aTests[aIdx];
// Limit to one repetition
temp.numRepeats = 10;
// Kernel will be generated into this stream
ostringstream oss;
generateKernel (oss, temp, "float", "");
std::string kernelCode(oss.str());
// Allocate host memory
int halfNumFloatsMax = temp.halfBufSizeMax*1024/4;
int numFloatsMax = 2*halfNumFloatsMax;
hostMem = new float[numFloatsMax];
hostMem2 = new float[numFloatsMax];
// Allocate device memory
mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
sizeof(float)*numFloatsMax, NULL, &err);
CL_CHECK_ERROR(err);
err = clEnqueueWriteBuffer(queue, mem1, true, 0,
numFloatsMax*sizeof(float), hostMem,
0, NULL, NULL);
CL_CHECK_ERROR(err);
// Create the kernel program
const char* progSource[] = {kernelCode.c_str()};
cl_program prog = clCreateProgramWithSource(ctx, 1, progSource,
NULL, &err);
CL_CHECK_ERROR(err);
// Compile the kernel
err = clBuildProgram(prog, 0, NULL, opts, NULL, NULL);
// Compile the kernel
CL_CHECK_ERROR(err);
//.........这里部分代码省略.........
示例12: RunTest
void RunTest(const string& name, ResultDatabase &resultDB, OptionParser &op)
{
static __declspec(target(mic)) T2 *source;
int chk;
unsigned long bytes = 0;
const int micdev = op.getOptionInt("target");
const bool verbose = op.getOptionBool("verbose");
// Get problem size
if (op.getOptionInt("MB") == 0) {
int probSizes[4] = { 1, 8, 96, 256 };
int sizeIndex = op.getOptionInt("size")-1;
if (sizeIndex < 0 || sizeIndex >= 4) {
cerr << "Invalid size index specified\n";
exit(-1);
}
bytes = probSizes[sizeIndex];
} else {
bytes = op.getOptionInt("MB");
}
// Convert to MiB
bytes *= 1024 * 1024;
int passes = op.getOptionInt("passes");
// The size of the transform computed is fixed at 512 complex elements
int fftsz = 512;
int N = (bytes)/sizeof(T2);
int n_ffts = N/fftsz;
// Allocate space (aligned)
source = (T2*) MKL_malloc(bytes, 4096);
//allocate buffers and create FFT plans
#pragma offload target(mic:micdev) in(fftsz, n_ffts) \
nocopy(source:length(N) \
align(4096) alloc_if(1) free_if(0))
{
forward((T2*)NULL, fftsz, n_ffts);
inverse((T2*)NULL, fftsz, n_ffts);
}
const char *sizeStr;
stringstream ss;
ss << "N=" << (long)N;
sizeStr = strdup(ss.str().c_str());
for(int k = 0; k < passes; k++)
{
init<T2>( source, fftsz, n_ffts );
// Warmup
if (k==0)
{
#pragma offload target(mic:micdev) in(fftsz, n_ffts) \
in(source:length(N) \
alloc_if(0) free_if(0))
{
forward(source, fftsz, n_ffts);
}
}
// Time forward fft with data transfer over PCIe
double time_fwd_pcie = -curr_second();
// Using in rather than inout to be consistent with CUDA version.
#pragma offload target(mic:micdev) in(fftsz, n_ffts) \
in(source:length(N) alloc_if(0) \
free_if(0))
{
forward(source, fftsz, n_ffts);
}
time_fwd_pcie += curr_second();
#pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \
free_if(0))
{
}
// Time inverse fft with data transfer over PCIe
double time_inv_pcie = -curr_second();
#pragma offload target(mic:micdev) in(fftsz, n_ffts) \
in(source:length(N) \
alloc_if(0) free_if(0))
{
inverse(source, fftsz, n_ffts);
}
time_inv_pcie += curr_second();
#pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \
free_if(0))
{}
// Check result
#pragma offload target(mic:micdev) in(fftsz,n_ffts) nocopy(source) \
out(chk)
{
chk = checkDiff(source, fftsz, n_ffts);
}
if (verbose || chk)
{
cout << "Test " << k << ((chk) ? ": Failed\n" : ": Passed\n");
//.........这里部分代码省略.........
示例13: main
// ****************************************************************************
// Function: main
//
// Purpose:
// The main function takes care of initialization (device and MPI), then
// performs the benchmark and prints results.
//
// Arguments:
//
//
// Programmer: Jeremy Meredith
// Creation:
//
// Modifications:
// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
// Split timing reports into detailed and summary. For serial code, we
// report all trial values, and for parallel, skip the per-process vals.
// Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
int ret = 0;
bool noprompt = false;
try
{
#ifdef PARALLEL
int rank, size;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cerr << "MPI Task " << rank << "/" << size - 1 << " starting....\n";
#endif
// Get args
OptionParser op;
//Add shared options to the parser
op.addOption("device", OPT_VECINT, "0",
"specify device(s) to run on", 'd');
op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
op.addOption("size", OPT_INT, "1", "specify problem size", 's');
op.addOption("infoDevices", OPT_BOOL, "",
"show info for available platforms and devices", 'i');
op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');
#ifdef _WIN32
op.addOption("noprompt", OPT_BOOL, "", "don't wait for prompt at program exit");
#endif
addBenchmarkSpecOptions(op);
if (!op.parse(argc, argv))
{
#ifdef PARALLEL
if (rank == 0)
op.usage();
MPI_Finalize();
#else
op.usage();
#endif
return (op.HelpRequested() ? 0 : 1);
}
bool verbose = op.getOptionBool("verbose");
bool infoDev = op.getOptionBool("infoDevices");
#ifdef _WIN32
noprompt = op.getOptionBool("noprompt");
#endif
int device;
#ifdef PARALLEL
NodeInfo ni;
int myNodeRank = ni.nodeRank();
vector<long long> deviceVec = op.getOptionVecInt("device");
if (myNodeRank >= deviceVec.size()) {
// Default is for task i to test device i
device = myNodeRank;
} else {
device = deviceVec[myNodeRank];
}
#else
device = op.getOptionVecInt("device")[0];
#endif
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount) {
cerr << "Warning: device index: " << device <<
" out of range, defaulting to device 0.\n";
device = 0;
}
// Initialization
EnumerateDevicesAndChoose(device, infoDev);
if( infoDev )
{
return 0;
}
ResultDatabase resultDB;
//.........这里部分代码省略.........
示例14: if
// Prepares the OpenCL state used by the FFT benchmark.
//
// Creates the context (fftCtx) and command queue (fftQueue) if they do not
// exist yet, then builds the FFT program and creates the forward, inverse
// and check kernels (fftKrnl, ifftKrnl, chkKrnl).
//
//   op: the options parser; "platform", "device" and "use-native" are read
//   _do_dp: true to build for double precision; the matching define is only
//           added when the device reports cl_khr_fp64 or cl_amd_fp64
//
// The build log is always written to standard output. If the build fails,
// the log is printed again, dumpPTXCode is called and the process exits
// with status -1.
void
init(OptionParser& op, bool _do_dp)
{
    cl_int err;
    do_dp = _do_dp;

    if (!fftCtx) {
        // first get the device; use device 0 when none was specified
        int device, platform = op.getOptionInt("platform");
        if (op.getOptionVecInt("device").size() > 0) {
            device = op.getOptionVecInt("device")[0];
        }
        else {
            device = 0;
        }
        fftDev = ListDevicesAndGetDevice(platform, device);

        // now get the context
        fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err);
        CL_CHECK_ERROR(err);
    }

    if (!fftQueue) {
        // get a queue
        fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE,
                                        &err);
        CL_CHECK_ERROR(err);
    }

    // create the program...
    fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err);
    CL_CHECK_ERROR(err);

    // ...and build it
    string args = " -cl-mad-enable ";
    if (op.getOptionBool("use-native")) {
        args += " -cl-fast-relaxed-math ";
    }
    if (!do_dp) {
        args += " -DSINGLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_khr_fp64")) {
        args += " -DK_DOUBLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_amd_fp64")) {
        args += " -DAMD_DOUBLE_PRECISION ";
    }

    // Keep the build status in its own variable: the log queries below reuse
    // `err` and must not hide a failed build.
    const cl_int buildErr = clBuildProgram(fftProg, 0, NULL, args.c_str(),
                                           NULL, NULL);

    // Always print the build log.
    {
        size_t bytesRequired = 0;
        err = clGetProgramBuildInfo(fftProg,
                                    fftDev,
                                    CL_PROGRAM_BUILD_LOG,
                                    0,
                                    NULL,
                                    &bytesRequired );
        char* log = (char*)malloc( bytesRequired + 1 );
        if (log != NULL) {
            // Start with an empty string so that a failed or skipped query
            // never leads to printing uninitialized memory.
            log[0] = '\0';
            if (err == CL_SUCCESS && bytesRequired > 0) {
                err = clGetProgramBuildInfo(fftProg,
                                            fftDev,
                                            CL_PROGRAM_BUILD_LOG,
                                            bytesRequired,
                                            log,
                                            NULL );
            }
            // Guarantee termination regardless of what the query wrote.
            log[bytesRequired] = '\0';
            std::cout << log << std::endl;
            free( log );
        }
    }

    if (buildErr != CL_SUCCESS) {
        char log[50000];
        log[0] = '\0';
        size_t retsize = 0;
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    50000*sizeof(char), log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        dumpPTXCode(fftCtx, fftProg, "oclFFT");
        exit(-1);
    }
    else {
        // dumpPTXCode(fftCtx, fftProg, "oclFFT");
    }

    // Create kernel for forward FFT
    fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for inverse FFT
    ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for check
    chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err);
    CL_CHECK_ERROR(err);
}
示例15: RunBenchmark
// Modifications:
// Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010
// Added calculation of latency estimate.
//
void RunBenchmark(cl::Device& devcpp,
cl::Context& ctxcpp,
cl::CommandQueue& queuecpp,
ResultDatabase &resultDB,
OptionParser &op)
{
// convert from C++ bindings to C bindings
// TODO propagate use of C++ bindings
cl_device_id id = devcpp();
cl_context ctx = ctxcpp();
cl_command_queue queue = queuecpp();
bool verbose = op.getOptionBool("verbose");
bool pinned = !op.getOptionBool("nopinned");
int npasses = op.getOptionInt("passes");
const bool waitForEvents = true;
// Sizes are in kb
int nSizes = 20;
int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
32768,65536,131072,262144,524288};
// Max sure we don't surpass the OpenCL limit.
cl_long maxAllocSizeBytes = 0;
clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_long), &maxAllocSizeBytes, NULL);
while (sizes[nSizes-1]*1024 > 0.90 * maxAllocSizeBytes)
{
--nSizes;
if (verbose) cout << " - dropping allocation size to keep under reported limit.\n";
if (nSizes < 1)
{
cerr << "Error: OpenCL reported a max allocation size less than 1kB.\b";
return;
}
}
// Create some host memory pattern
if (verbose) cout << ">> creating host mem pattern\n";
int err;
float *hostMem;
cl_mem hostMemObj;
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
if (pinned)
{
hostMemObj = clCreateBuffer(ctx,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof(float)*numMaxFloats, NULL, &err);
if (err == CL_SUCCESS)
{
hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true,
CL_MAP_READ|CL_MAP_WRITE,
0,sizeof(float)*numMaxFloats,0,
NULL,NULL,&err);
}
while (err != CL_SUCCESS)
{
// drop the size and try again
if (verbose) cout << " - dropping size allocating pinned mem\n";
--nSizes;
if (nSizes < 1)
{
cerr << "Error: Couldn't allocated any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hostMemObj = clCreateBuffer(ctx,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof(float)*numMaxFloats, NULL, &err);
if (err == CL_SUCCESS)
{
hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true,
CL_MAP_READ|CL_MAP_WRITE,
0,sizeof(float)*numMaxFloats,0,
NULL,NULL,&err);
}
}
}
else
{
hostMem = new float[numMaxFloats];
}
for (int i=0; i<numMaxFloats; i++)
hostMem[i] = i % 77;
// Allocate some device memory
if (verbose) cout << ">> allocating device mem\n";
cl_mem mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
sizeof(float)*numMaxFloats, NULL, &err);
while (err != CL_SUCCESS)
{
// drop the size and try again
if (verbose) cout << " - dropping size allocating device mem\n";
--nSizes;
if (nSizes < 1)
{
//.........这里部分代码省略.........