本文整理汇总了C++中LayerDimensions::buildOptionsString方法的典型用法代码示例。如果您正苦于以下问题:C++ LayerDimensions::buildOptionsString方法的具体用法?C++ LayerDimensions::buildOptionsString怎么用?C++ LayerDimensions::buildOptionsString使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类LayerDimensions
的用法示例。
在下文中一共展示了LayerDimensions::buildOptionsString方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: measurePerf
TEST(SLOW_testbackward, perf_kgsgo_32c5) {
int batchSize = 128;
LayerDimensions dim;
dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5)
.setPadZeros(true).setBiased(true);
cout << dim.buildOptionsString() << endl;
// ActivationFunction *fn = new ReluActivation();
measurePerf(2, batchSize, dim);
}
示例2: compareSpecific
TEST(SLOW_testbackward, compare_kgsgo_32c5mini) {
int batchSize = 4;
LayerDimensions dim;
dim.setInputPlanes(2).setInputSize(3).setNumFilters(2).setFilterSize(3)
.setPadZeros(true).setBiased(true);
cout << dim.buildOptionsString() << endl;
// ActivationFunction *fn = new ReluActivation();
compareSpecific(1, 2, 1, batchSize, dim);
}
示例3: BackpropWeights
BackpropWeightsScratchBias::BackpropWeightsScratchBias( OpenCLHelper *cl, LayerDimensions dim, ActivationFunction const *fn ) :
BackpropWeights( cl, dim, fn )
{
// [[[cog
// import stringify
// # stringify.write_kernel( "kernelSource", "ClConvolve.cl")
// ]]]
// [[[end]]]
std::string options = dim.buildOptionsString();
options += " -D " + fn->getDefineName();
kernel = cl->buildKernel( "backpropweights.cl", "backprop_floats_withscratch_dobias", options );
// kernel = cl->buildKernelFromString( kernelSource, "calcErrorsForUpstream", options );
}
示例4: testPerf
void testPerf( int instance, int N, int batchSize, LayerDimensions dim ) {
cout << dim.buildOptionsString() << endl;
int inputsSize = batchSize * dim.inputCubeSize;
int filtersSize = dim.filtersSize;
int biasSize = dim.numFilters;
int inputsAllocated = std::max( inputsSize, 10000 );
int filtersAllocated = std::max( filtersSize, 10000 );
int biasFiltersAllocated = std::max( biasSize, 10000 );
float *inputs = new float[ inputsAllocated ];
float *filters = new float[ filtersAllocated ];
float *biasFilters = new float[ biasFiltersAllocated ];
memset( inputs, 0, sizeof(float) * inputsAllocated );
memset( filters, 0, sizeof(float) * filtersAllocated );
memset( biasFilters, 0, sizeof(float) * biasFiltersAllocated );
WeightRandomizer::randomize( inputs, inputsAllocated, -0.1f, 0.1f );
WeightRandomizer::randomize( filters, filtersAllocated, -0.1f, 0.1f );
WeightRandomizer::randomize( biasFilters, biasFiltersAllocated, -0.1f, 0.1f );
EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
Forward *p1 = Forward::instanceSpecific( instance, cl, dim );
for( int it = 0; it < (N + batchSize - 1 ) / batchSize; it++ ) {
int thisBatchSize = it < N - 1 ? batchSize : N - batchSize * it;
float *output1 = new float[p1->getOutputTotalSize(thisBatchSize)];
p1->forward( thisBatchSize, inputs, filters, biasFilters, output1 );
delete[] output1;
}
StatefulTimer::dump(true);
delete p1;
delete cl;
delete[] inputs;
delete[] filters;
delete[] biasFilters;
}
示例5: testPerf
void testPerf( int instance, int N, int batchSize, LayerDimensions dim, ActivationFunction *fn ) {
cout << dim.buildOptionsString() << endl;
int inputsSize = batchSize * dim.inputCubeSize;
int filtersSize = dim.filtersSize;
int biasSize = dim.numFilters;
int inputsAllocated = std::max( inputsSize, 10000 );
int filtersAllocated = std::max( filtersSize, 10000 );
int biasFiltersAllocated = std::max( biasSize, 10000 );
float *inputs = new float[ inputsAllocated ];
float *filters = new float[ filtersAllocated ];
float *biasFilters = new float[ biasFiltersAllocated ];
memset( inputs, 0, sizeof(float) * inputsAllocated );
memset( filters, 0, sizeof(float) * filtersAllocated );
memset( biasFilters, 0, sizeof(float) * biasFiltersAllocated );
WeightRandomizer::randomize( inputs, inputsAllocated, -0.1f, 0.1f );
WeightRandomizer::randomize( filters, filtersAllocated, -0.1f, 0.1f );
WeightRandomizer::randomize( biasFilters, biasFiltersAllocated, -0.1f, 0.1f );
OpenCLHelper *cl = OpenCLHelper::createForFirstGpuOtherwiseCpu();
Propagate *p1 = Propagate::instanceSpecific( instance, cl, dim, fn );
for( int it = 0; it < (N + batchSize - 1 ) / batchSize; it++ ) {
int thisBatchSize = it < N - 1 ? batchSize : N - batchSize * it;
float *results1 = p1->propagate( thisBatchSize, inputs, filters, biasFilters );
delete[] results1;
}
StatefulTimer::dump(true);
delete p1;
delete cl;
delete[] inputs;
delete[] filters;
delete[] biasFilters;
}
示例6: Backward
BackwardGpuNaive::BackwardGpuNaive( EasyCL *cl, LayerDimensions dim ) :
Backward( cl, dim )
{
std::string options = dim.buildOptionsString();
options += ""; // " -D " + upstreamFn->getDefineName();
// [[[cog
// import stringify
// stringify.write_kernel2( "kernel", "cl/backward.cl", "calcGradInput", 'options' )
// # stringify.write_kernel2( "broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options' )
// # stringify.write_kernel2( "applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options' )
// # stringify.write_kernel( "kernelSource", "ClConvolve.cl")
// ]]]
// generated using cog, from cl/backward.cl:
const char * kernelSource =
"// Copyright Hugh Perkins 2014 hughperkins at gmail\n"
"//\n"
"// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
"// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
"// obtain one at http://mozilla.org/MPL/2.0/.\n"
"\n"
"// expected defines:\n"
"// - none\n"
"\n"
"// globalid as: [n][upstreamPlane][upstreamrow][upstreamcol]\n"
"// inputdata: [n][upstreamPlane][upstreamrow][upstreamcol] 128 * 32 * 19 * 19 * 4 = 6MB\n"
"// gradOutput: [n][outPlane][outRow][outCol] 128 * 32 * 19 * 19 * 4 = 6MB\n"
"// weights: [filterId][inputPlane][filterRow][filterCol] 32 * 32 * 5 * 5 * 4 = 409KB\n"
"void kernel calcGradInput(\n"
" const int batchSize,\n"
" global const float *gradOutput, global float *weights, global float *gradInput ) {\n"
" int globalId = get_global_id(0);\n"
"\n"
" const int upstreamImage2dId = globalId / gInputImageSizeSquared;\n"
"\n"
" const int intraImageOffset = globalId % gInputImageSizeSquared;\n"
" const int upstreamRow = intraImageOffset / gInputImageSize;\n"
" const int upstreamCol = intraImageOffset % gInputImageSize;\n"
"\n"
" const int upstreamPlane = upstreamImage2dId % gInputPlanes;\n"
" const int n = upstreamImage2dId / gInputPlanes;\n"
"\n"
" if( n >= batchSize ) {\n"
" return;\n"
" }\n"
"\n"
" const int minFilterRow = max( 0, upstreamRow + gMargin - (gOutputImageSize - 1) );\n"
" const int maxFilterRow = min( gFilterSize - 1, upstreamRow + gMargin );\n"
" const int minFilterCol = max( 0, upstreamCol + gMargin - (gOutputImageSize -1) );\n"
" const int maxFilterCol = min( gFilterSize - 1, upstreamCol + gMargin );\n"
"\n"
" float sumWeightTimesOutError = 0;\n"
" // aggregate over [outPlane][outRow][outCol]\n"
" for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {\n"
" for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) {\n"
" int outRow = upstreamRow + gMargin - filterRow;\n"
" for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) {\n"
" int outCol = upstreamCol + gMargin - filterCol;\n"
" int resultIndex = ( ( n * gNumFilters\n"
" + outPlane ) * gOutputImageSize\n"
" + outRow ) * gOutputImageSize\n"
" + outCol;\n"
" float thisError = gradOutput[resultIndex];\n"
" int thisWeightIndex = ( ( outPlane * gInputPlanes\n"
" + upstreamPlane ) * gFilterSize\n"
" + filterRow ) * gFilterSize\n"
" + filterCol;\n"
" float thisWeight = weights[thisWeightIndex];\n"
" float thisWeightTimesError = thisWeight * thisError;\n"
" sumWeightTimesOutError += thisWeightTimesError;\n"
" }\n"
" }\n"
" }\n"
" gradInput[globalId] = sumWeightTimesOutError;\n"
"}\n"
"\n"
"";
kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options, "cl/backward.cl" );
// [[[end]]]
// kernel = cl->buildKernel( "backproperrorsv2.cl", "calcGradInput", options );
// kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options );
}
示例7: Forward
Forward1::Forward1( EasyCL *cl, LayerDimensions dim ) :
Forward( cl, dim )
{
addBias = new AddBias( cl );
std::string options = "";
options += dim.buildOptionsString();
// [[[cog
// import stringify
// stringify.write_kernel2( "kernel", "cl/forward1.cl", "convolve_imagecubes_float2", 'options' )
// ]]]
// generated using cog, from cl/forward1.cl:
const char * kernelSource =
"// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n"
"//\n"
"// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
"// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
"// obtain one at http://mozilla.org/MPL/2.0/.\n"
"\n"
"// notes on non-odd filtersizes:\n"
"// for odd, imagesize and filtersize 3, padZeros = 0:\n"
"// output is a single square\n"
"// m and n should vary between -1,0,1\n"
"// for even, imagesize and filtersize 2, padzeros = 0\n"
"// output is a single square, which we can position at topleft or bottomrigth\n"
"// lets position it in bottomright\n"
"// then m and n should vary as -1,0\n"
"//\n"
"// for even, imagesize and filtersize 2, padzeros = 1\n"
"// output is 2 by 2\n"
"// well... if it is even:\n"
"// - if we are not padding zeros, then we simply move our filter around the image somehow\n"
"// - if we are padding zeros, then we conceptually pad the bottom and right edge of the image with zeros by 1\n"
"// filtersize remains the same\n"
"// m will vary as -1,0,1\n"
"// outputrow is fixed by globalid\n"
"// inputrow should be unchanged...\n"
"// padzeros = 0:\n"
"// x x . . . .\n"
"// x x . . x x\n"
"// . . . . x x\n"
"// when filtersize even:\n"
"// new imagesize = oldimagesize - filtersize + 1\n"
"// when filtersize odd:\n"
"// x x x .\n"
"// x x x .\n"
"// x x x .\n"
"// . . . .\n"
"// new imagesize = oldimagesize - filtersize + 1\n"
"// padzeros = 1:\n"
"// x x\n"
"// x x . . x x . . . . . . .\n"
"// . . . x x . . x x . . .\n"
"// . . . . . . . x x . . x x\n"
"// outrow=0 outrow=1 outrow=2 x x\n"
"// outcol=0 outcol=1 outcol=2 outrow=3\n"
"// outcol=3\n"
"// when filtersize is even, and padzeros, imagesize grows by 1 each time...\n"
"// imagesize = oldimagesize + 1\n"
"// when filtersize is odd\n"
"// x x x\n"
"// x x x . x x x . . .\n"
"// x x x . x x x . x x x\n"
"// . . . x x x . x x x\n"
"// x x x\n"
"\n"
"// images are organized like [imageId][plane][row][col]\n"
"// filters are organized like [filterid][inplane][filterrow][filtercol]\n"
"// output are organized like [imageid][filterid][row][col]\n"
"// global id is organized like output, ie: [imageid][outplane][outrow][outcol]\n"
"// - no local memory used currently\n"
"// - each thread:\n"
"// - loads a whole upstream cube\n"
"// - loads a whole filter cube\n"
"// - writes one output...\n"
"void kernel convolve_imagecubes_float2(\n"
" const int numExamples,\n"
" global const float *inputs, global const float *filters,\n"
" global float *output ) {\n"
" int globalId = get_global_id(0);\n"
"\n"
" int outputImage2Id = globalId / gOutputImageSizeSquared;\n"
" int exampleId = outputImage2Id / gNumFilters;\n"
" int filterId = outputImage2Id % gNumFilters;\n"
"\n"
" // intraimage coords\n"
" int localid = globalId % gOutputImageSizeSquared;\n"
" int outputRow = localid / gOutputImageSize;\n"
" int outputCol = localid % gOutputImageSize;\n"
"\n"
" global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputImageSizeSquared;\n"
" global float const*filterCube = filters + filterId * gNumInputPlanes * gFilterSizeSquared;\n"
"\n"
" float sum = 0;\n"
" if( exampleId < numExamples ) {\n"
" for( int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++ ) {\n"
" global float const*inputPlane = inputCube + inputPlaneIdx * gInputImageSizeSquared;\n"
" global float const*filterPlane = filterCube + inputPlaneIdx * gFilterSizeSquared;\n"
" for( int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++ ) {\n"
//.........这里部分代码省略.........
示例8: runtime_error
BackpropWeightsScratchLarge::BackpropWeightsScratchLarge(EasyCL *cl, LayerDimensions dim) :
BackpropWeights(cl, dim)
{
if(square(dim.filterSize) > cl->getMaxWorkgroupSize()) {
throw runtime_error("cannot use BackpropWeightsScratchLarge, since filterSize * filterSize > maxworkgroupsize");
}
// [[[cog
// import stringify
// # stringify.write_kernel("kernelSource", "ClConvolve.cl")
// ]]]
// [[[end]]]
// cout << "dim: " << dim << endl;
std::string options = dim.buildOptionsString();
int localMemoryRequirementsFullImage = dim.inputSize * dim.inputSize * 4 + dim.outputSize * dim.outputSize * 4;
int availableLocal = cl->getLocalMemorySize();
// cout << "localmemoryrequirementsfullimage: " << localMemoryRequirementsFullImage << endl;
// cout << "availablelocal: " << availableLocal << endl;
// make the local memory used about one quarter of what is available? half of what is available?
// let's try one quarter :-)
int localWeCanUse = availableLocal / 4;
numStripes = (localMemoryRequirementsFullImage + localWeCanUse - 1) / localWeCanUse;
// cout << "numStripes: " << numStripes << endl;
// make it a power of 2
numStripes = EasyCL::getNextPower2(numStripes);
// cout << "numStripes: " << numStripes << endl;
int inputStripeMarginRows = dim.filterSize - 1;
int inputStripeInnerNumRows = dim.inputSize / numStripes;
int inputStripeOuterNumRows = inputStripeInnerNumRows + 2 * inputStripeMarginRows;
int inputStripeInnerSize = inputStripeInnerNumRows * dim.inputSize;
inputStripeOuterSize = inputStripeOuterNumRows * dim.inputSize;
int inputStripeMarginSize = inputStripeMarginRows * dim.inputSize;
int outputStripeNumRows = (dim.outputSize + numStripes - 1) / numStripes;
outputStripeSize = outputStripeNumRows * dim.outputSize;
// [[[cog
// import cog_optionswriter
// cog_optionswriter.write_options(['numStripes','inputStripeMarginRows','inputStripeInnerNumRows',
// 'inputStripeOuterNumRows', 'inputStripeInnerSize', 'inputStripeOuterSize', 'inputStripeMarginSize',
// 'outputStripeNumRows', 'outputStripeSize' ])
// ]]]
// generated, using cog:
options += " -DgNumStripes=" + toString(numStripes);
options += " -DgInputStripeMarginRows=" + toString(inputStripeMarginRows);
options += " -DgInputStripeInnerNumRows=" + toString(inputStripeInnerNumRows);
options += " -DgInputStripeOuterNumRows=" + toString(inputStripeOuterNumRows);
options += " -DgInputStripeInnerSize=" + toString(inputStripeInnerSize);
options += " -DgInputStripeOuterSize=" + toString(inputStripeOuterSize);
options += " -DgInputStripeMarginSize=" + toString(inputStripeMarginSize);
options += " -DgOutputStripeNumRows=" + toString(outputStripeNumRows);
options += " -DgOutputStripeSize=" + toString(outputStripeSize);
// [[[end]]]
cout << "options: " << options << endl;
// [[[cog
// import stringify
// stringify.write_kernel2("kernel", "cl/BackpropWeightsScratchLarge.cl", "backprop_floats_withscratch_dobias_striped", 'options')
// ]]]
// generated using cog, from cl/BackpropWeightsScratchLarge.cl:
const char * kernelSource =
"// Copyright Hugh Perkins 2014,2015 hughperkins at gmail\n"
"//\n"
"// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
"// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
"// obtain one at http://mozilla.org/MPL/2.0/.\n"
"\n"
"// expected defines:\n"
"// BIASED (or not)\n"
"\n"
"// workgroupId: [outputPlane][inputPlane]\n"
"// localId: [filterRow][filterCol]\n"
"// per-thread iteration: [n][outputRow][outputCol]\n"
"// local: errorimage: outputSize * outputSize\n"
"// imageimage: inputSize * inputSize\n"
"// specific characteristic: load one stripe of each image at a time,\n"
"// so we dont run out of memory\n"
"// number of stripes set in: gNumStripes\n"
"// note that whilst we can stripe the gradOutput simply,\n"
"// we actually need to add a half-filter widthed additional few rows\n"
"// onto the images stripe, otherwise we will be missing data\n"
"// we will call the size of the non-overlapping image stripes: gInputStripeInnerSize\n"
"// the outersize, including the two margins is: gInputStripeOuterSize\n"
"// of course, the first and last stripes will be missing a bit off the top/bottom, where the\n"
"// corresponding outer margin would be\n"
"void kernel backprop_floats_withscratch_dobias_striped(\n"
" const float learningRateMultiplier, const int batchSize,\n"
" global const float *gradOutput, global const float *images,\n"
" global float *gradWeights,\n"
" #ifdef BIASED\n"
" global float *gradBiasWeights,\n"
" #endif\n"
" local float *_errorStripe, local float *_imageStripe\n"
" ) {\n"
" // gHalfFilterSize\n"
" // gInputSize\n"
" //\n"
//.........这里部分代码省略.........
示例9: BackpropWeights
BackpropWeightsNaive::BackpropWeightsNaive(EasyCL *cl, LayerDimensions dim) :
BackpropWeights(cl, dim)
{
std::string options = dim.buildOptionsString();
// [[[cog
// import stringify
// stringify.write_kernel2("kernel", "cl/backpropweights.cl", "backprop_floats", 'options')
// ]]]
// generated using cog, from cl/backpropweights.cl:
const char * kernelSource =
"// Copyright Hugh Perkins 2014,2015 hughperkins at gmail\n"
"//\n"
"// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
"// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
"// obtain one at http://mozilla.org/MPL/2.0/.\n"
"\n"
"// expected defines:\n"
"// BIASED (or not)\n"
"\n"
"// globalId: [outPlane][inputPlane][filterRow][filterCol]\n"
"// per-thread iteration: [n][outputRow][outputCol]\n"
"void kernel backprop_floats(const float learningRateMultiplier,\n"
" const int batchSize,\n"
" global const float *gradOutput, global const float *images,\n"
" global float *gradWeights\n"
" #ifdef BIASED\n"
" , global float *gradBiasWeights\n"
" #endif\n"
" ) {\n"
" int globalId = get_global_id(0);\n"
" if (globalId >= gNumFilters * gInputPlanes * gFilterSize * gFilterSize) {\n"
" return;\n"
" }\n"
"\n"
" int IntraFilterOffset = globalId % gFilterSizeSquared;\n"
" int filterRow = IntraFilterOffset / gFilterSize;\n"
" int filterCol = IntraFilterOffset % gFilterSize;\n"
"\n"
" int filter2Id = globalId / gFilterSizeSquared;\n"
" int outPlane = filter2Id / gInputPlanes;\n"
" int upstreamPlane = filter2Id % gInputPlanes;\n"
"\n"
" float thiswchange = 0;\n"
" // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]\n"
" // aggregate over: [outRow][outCol][n]\n"
"#ifdef BIASED\n"
" float thisbiaschange = 0;\n"
"#endif\n"
" for (int n = 0; n < batchSize; n++) {\n"
" for (int outRow = 0; outRow < gOutputSize; outRow++) {\n"
" int upstreamRow = outRow - gMargin + filterRow;\n"
" for (int outCol = 0; outCol < gOutputSize; outCol++) {\n"
" int upstreamCol = outCol - gMargin + filterCol;\n"
" bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize\n"
" && upstreamCol < gInputSize;\n"
" if (proceed) {\n"
" int resultIndex = (( n * gNumFilters\n"
" + outPlane) * gOutputSize\n"
" + outRow) * gOutputSize\n"
" + outCol;\n"
" float error = gradOutput[resultIndex];\n"
" int upstreamDataIndex = (( n * gInputPlanes\n"
" + upstreamPlane) * gInputSize\n"
" + upstreamRow) * gInputSize\n"
" + upstreamCol;\n"
" float upstreamResult = images[upstreamDataIndex];\n"
" float thisimagethiswchange = upstreamResult * error;\n"
" thiswchange += thisimagethiswchange;\n"
" #ifdef BIASED\n"
" thisbiaschange += error;\n"
" #endif\n"
" }\n"
" }\n"
" }\n"
" }\n"
" // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]\n"
" // aggregate over: [outRow][outCol][n]\n"
" gradWeights[ globalId ] = learningRateMultiplier * thiswchange;\n"
"#ifdef BIASED\n"
" bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;\n"
" if (writeBias) {\n"
" gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;\n"
" }\n"
"#endif\n"
"}\n"
"\n"
"\n"
"\n"
"";
kernel = cl->buildKernelFromString(kernelSource, "backprop_floats", options, "cl/backpropweights.cl");
// [[[end]]]
}
示例10: Forward
ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl, LayerDimensions dim ) :
Forward( cl, dim )
{
std::string options = ""; // "-D " + fn->getDefineName();
options += dim.buildOptionsString();
// [[[cog
// import stringify
// stringify.write_kernel2( "kernel1", "cl/forward_fc_wgperrow.cl", "forward_fc_workgroup_perrow", 'options' )
// stringify.write_kernel2( "kernel2", "cl/forward_fc.cl", "reduce_rows", 'options' )
// ]]]
// generated using cog, from cl/forward_fc_wgperrow.cl:
const char * kernel1Source =
"// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n"
"//\n"
"// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
"// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
"// obtain one at http://mozilla.org/MPL/2.0/.\n"
"\n"
"void copyLocal( local float *restrict target, global float const *restrict source, int N ) {\n"
" int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n"
" for( int loop = 0; loop < numLoops; loop++ ) {\n"
" int offset = loop * get_local_size(0) + get_local_id(0);\n"
" if( offset < N ) {\n"
" target[offset] = source[offset];\n"
" }\n"
" }\n"
"}\n"
"\n"
"// concept:\n"
"// we want to share each input example across multiple filters\n"
"// but an entire filter plane is 19*19*4 = 1.4KB\n"
"// so eg 500 filter planes is 500* 1.4KB = 700KB, much larger than local storage\n"
"// of ~43KB\n"
"// - we could take eg 16 filters at a time, store one filter plane from each in local storage,\n"
"// and then bring down one example plane at a time, into local storage, during iteration over n\n"
"// - here though, we are going to store one row from one plane from each filter,\n"
"// and process against one row, from same plane, from each example\n"
"// so each workgroup will have one thread per filterId, eg 351 threads\n"
"// each thread will add up over its assigned row\n"
"// then, later we need to reduce over the rows\n"
"// ... and also over the input planes?\n"
"//\n"
"// workgroupid [inputplane][filterrow]\n"
"// localid: [filterId]\n"
"// each thread iterates over: [n][filtercol]\n"
"// each thread is assigned to: one row, of one filter\n"
"// workgroup is assigned to: same row, from each input plane\n"
"// local memory: one row from each output, = 128 * 19 * 4 = 9.8KB\n"
"// 1 * input row = \"0.076KB\"\n"
"// output1 structured as: [n][inputplane][filter][row], need to reduce again after\n"
"// this kernel assumes:\n"
"// padzeros == 0 (mandatory)\n"
"// filtersize == inputimagesize (mandatory)\n"
"// inputimagesize == 19\n"
"// filtersize == 19\n"
"// outputImageSize == 1\n"
"// lots of outplanes/filters, hundreds, but less than max work groupsize, eg 350, 500, 361\n"
"// lots of inplanes, eg 32-128\n"
"// inputimagesize around 19, not too small\n"
"#if (gFilterSize == gInputImageSize) && (gPadZeros == 0)\n"
"void kernel forward_fc_workgroup_perrow( const int batchSize,\n"
" global const float *images, global const float *filters,\n"
" global float *output1,\n"
" local float *_imageRow, local float *_filterRows ) {\n"
" const int globalId = get_global_id(0);\n"
"\n"
" const int workgroupId = get_group_id(0);\n"
" const int workgroupSize = get_local_size(0);\n"
" const int localId = get_local_id(0);\n"
"\n"
" const int inputPlaneId = workgroupId / gFilterSize;\n"
" const int filterRowId = workgroupId % gFilterSize;\n"
"\n"
" const int filterId = localId;\n"
"\n"
" // first copy down filter row, which is per-thread, so we have to copy it all ourselves...\n"
" global const float *filterRow = filters\n"
" + filterId * gNumInputPlanes * gFilterSizeSquared\n"
" + inputPlaneId * gFilterSizeSquared\n"
" + filterRowId * gFilterSize;\n"
" local float *_threadFilterRow = _filterRows + localId * gFilterSize;\n"
" for( int i = 0; i < gFilterSize; i++ ) {\n"
" _threadFilterRow[i] = filterRow[i];\n"
" }\n"
" const int loopsPerExample = ( gInputImageSize + workgroupSize - 1 ) / workgroupSize;\n"
" // now loop over examples...\n"
" for( int n = 0; n < batchSize; n++ ) {\n"
" // copy down example row, which is global to all threads in workgroup\n"
" // hopefully should be enough threads....\n"
" // but we should check anyway really, since depends on number of filters configured,\n"
" // not on relative size of filter and input image\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" copyLocal( _imageRow, images\n"
" + ( ( n\n"
" * gNumInputPlanes + inputPlaneId )\n"
" * gInputImageSize + filterRowId )\n"
" * gInputImageSize,\n"
" gInputImageSize );\n"
//.........这里部分代码省略.........
示例11: runtime_error
BackwardGpuCached::BackwardGpuCached(EasyCL *cl, LayerDimensions dim) :
Backward(cl, dim)
{
if(square(dim.inputSize) > cl->getMaxWorkgroupSize()) {
throw runtime_error("cannot use BackwardGpuCached, since inputSize * inputSize > maxworkgroupsize");
}
std::string options = dim.buildOptionsString();
options += ""; // " -D " + upstreamFn->getDefineName();
// [[[cog
// import stringify
// stringify.write_kernel2("kernel", "cl/backward_cached.cl", "calcGradInputCached", 'options')
// # stringify.write_kernel2("broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options')
// # stringify.write_kernel2("applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options')
// # stringify.write_kernel("kernelSource", "ClConvolve.cl")
// ]]]
// generated using cog, from cl/backward_cached.cl:
const char * kernelSource =
"// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n"
"//\n"
"// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
"// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
"// obtain one at http://mozilla.org/MPL/2.0/.\n"
"\n"
"void copyLocal(local float *target, global float const *source, int N) {\n"
" int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n"
" for (int loop = 0; loop < numLoops; loop++) {\n"
" int offset = loop * get_local_size(0) + get_local_id(0);\n"
" if (offset < N) {\n"
" target[offset] = source[offset];\n"
" }\n"
" }\n"
"}\n"
"\n"
"// as calcGradInput, but with local cache\n"
"// convolve weights with gradOutput to produce gradInput\n"
"// workgroupid: [n][inputPlane]\n"
"// localid: [upstreamrow][upstreamcol]\n"
"// per-thread aggregation: [outPlane][filterRow][filterCol]\n"
"// need to store locally:\n"
"// - _gradOutputPlane. size = outputSizeSquared\n"
"// - _filterPlane. size = filtersizesquared\n"
"// note: currently doesnt use bias as input. thats probably an error?\n"
"// inputs: gradOutput :convolve: filters => gradInput\n"
"//\n"
"// global:\n"
"// gradOutput: [n][outPlane][outRow][outCol] 128 * 32 * 19 * 19 * 4\n"
"// weights: [filterId][upstreamplane][filterRow][filterCol] 32 * 32 * 5 * 5 * 4\n"
"// per workgroup:\n"
"// gradOutput: [outPlane][outRow][outCol] 32 * 19 * 19 * 4 = 46KB\n"
"// weights: [filterId][filterRow][filterCol] 32 * 5 * 5 * 4 = 3.2KB\n"
"// gradOutputforupstream: [n][upstreamPlane][upstreamRow][upstreamCol]\n"
"void kernel calcGradInputCached(\n"
" const int batchSize,\n"
" global const float *gradOutputGlobal,\n"
" global const float *filtersGlobal,\n"
" global float *gradInput,\n"
" local float *_gradOutputPlane,\n"
" local float *_filterPlane) {\n"
"\n"
" #define globalId get_global_id(0)\n"
" #define localId get_local_id(0)\n"
" #define workgroupId get_group_id(0)\n"
" #define workgroupSize get_local_size(0)\n"
"\n"
" const int n = workgroupId / gInputPlanes;\n"
" const int upstreamPlane = workgroupId % gInputPlanes;\n"
"\n"
" const int upstreamRow = localId / gInputSize;\n"
" const int upstreamCol = localId % gInputSize;\n"
"\n"
" float sumWeightTimesOutError = 0;\n"
" for (int outPlane = 0; outPlane < gNumFilters; outPlane++) {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" copyLocal(_filterPlane, filtersGlobal + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared);\n"
" copyLocal(_gradOutputPlane, gradOutputGlobal + (n * gNumFilters + outPlane) * gOutputSizeSquared, gOutputSizeSquared);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for (int filterRow = 0; filterRow < gFilterSize; filterRow++) {\n"
" int outRow = upstreamRow + gMargin - filterRow;\n"
" for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {\n"
" int outCol = upstreamCol + gMargin - filterCol;\n"
" if (outCol >= 0 && outCol < gOutputSize && outRow >= 0 && outRow < gOutputSize) {\n"
" float thisWeightTimesError =\n"
" _gradOutputPlane[outRow * gOutputSize + outCol] *\n"
" _filterPlane[filterRow * gFilterSize + filterCol];\n"
" sumWeightTimesOutError += thisWeightTimesError;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" const int upstreamImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;\n"
" if (localId < gInputSizeSquared) {\n"
" gradInput[upstreamImageGlobalOffset + localId] = sumWeightTimesOutError;\n"
" }\n"
"}\n"
"\n"
"";
kernel = cl->buildKernelFromString(kernelSource, "calcGradInputCached", options, "cl/backward_cached.cl");
// [[[end]]]
// kernel = cl->buildKernel("backproperrorsv2.cl", "calcGradInput", options);
//.........这里部分代码省略.........
示例12: runtime_error
Forward3::Forward3( EasyCL *cl, LayerDimensions dim ) :
Forward( cl, dim )
{
addBias = new AddBias( cl );
if( square( dim.outputImageSize ) > cl->getMaxWorkgroupSize() ) {
throw runtime_error("cannot use forward3, since outputimagesize * outputimagesize > maxworkgroupsize");
}
std::string options = ""; // "-D " + fn->getDefineName();
options += dim.buildOptionsString();
// [[[cog
// import stringify
// stringify.write_kernel2( "kernel", "cl/forward3.cl", "forward_3_by_n_outplane", 'options' )
// # stringify.write_kernel2( "repeatedAdd", "cl/per_element_add.cl", "repeated_add", 'options' )
// ]]]
// generated using cog, from cl/forward3.cl:
const char * kernelSource =
"// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n"
"//\n"
"// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
"// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
"// obtain one at http://mozilla.org/MPL/2.0/.\n"
"\n"
"// concept: each workgroup handles convolving one input example with one filtercube\n"
"// and writing out one single output plane\n"
"//\n"
"// workgroup id organized like: [imageid][outplane]\n"
"// local id organized like: [outrow][outcol]\n"
"// each thread iterates over: [upstreamplane][filterrow][filtercol]\n"
"// number workgroups = 32\n"
"// one filter plane takes up 5 * 5 * 4 = 100 bytes\n"
"// one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok)\n"
"// all filter cubes = 3.2KB * 32 = 102KB (too big)\n"
"// output are organized like [imageid][filterid][row][col]\n"
"void kernel forward_3_by_n_outplane( const int batchSize,\n"
" global const float *images, global const float *filters,\n"
" global float *output,\n"
" local float *_upstreamImage, local float *_filterCube ) {\n"
" const int globalId = get_global_id(0);\n"
"\n"
" const int workgroupId = get_group_id(0);\n"
" const int workgroupSize = get_local_size(0);\n"
" const int n = workgroupId / gNumFilters;\n"
" const int outPlane = workgroupId % gNumFilters;\n"
"\n"
" const int localId = get_local_id(0);\n"
" const int outputRow = localId / gOutputImageSize;\n"
" const int outputCol = localId % gOutputImageSize;\n"
"\n"
" const int minu = gPadZeros ? max( -gHalfFilterSize, -outputRow ) : -gHalfFilterSize;\n"
" const int maxu = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputRow - gEven) : gHalfFilterSize - gEven;\n"
" const int minv = gPadZeros ? max( -gHalfFilterSize, -outputCol ) : - gHalfFilterSize;\n"
" const int maxv = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven;\n"
"\n"
" const int numUpstreamsPerThread = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;\n"
"\n"
" const int filterCubeLength = gInputPlanes * gFilterSizeSquared;\n"
" const int filterCubeGlobalOffset = outPlane * filterCubeLength;\n"
" const int numPixelsPerThread = ( filterCubeLength + workgroupSize - 1 ) / workgroupSize;\n"
" for( int i = 0; i < numPixelsPerThread; i++ ) {\n"
" int thisOffset = localId + i * workgroupSize;\n"
" if( thisOffset < filterCubeLength ) {\n"
" _filterCube[thisOffset] = filters[filterCubeGlobalOffset + thisOffset];\n"
" }\n"
" }\n"
" // dont need a barrier, since we'll just run behind the barrier from the upstream image download\n"
"\n"
" float sum = 0;\n"
" for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {\n"
" int thisUpstreamImageOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for( int i = 0; i < numUpstreamsPerThread; i++ ) {\n"
" int thisOffset = workgroupSize * i + localId;\n"
" if( thisOffset < gInputImageSizeSquared ) {\n"
" _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ];\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int filterImageOffset = upstreamPlane * gFilterSizeSquared;\n"
" for( int u = minu; u <= maxu; u++ ) {\n"
" int inputRow = outputRow + u;\n"
" #if gPadZeros == 0\n"
" inputRow += gHalfFilterSize;\n"
" #endif\n"
" int inputimagerowoffset = inputRow * gInputImageSize;\n"
" int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n"
" for( int v = minv; v <= maxv; v++ ) {\n"
" int inputCol = outputCol + v;\n"
" #if gPadZeros == 0\n"
" inputCol += gHalfFilterSize;\n"
" #endif\n"
" if( localId < gOutputImageSizeSquared ) {\n"
" sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ];\n"
" }\n"
" }\n"
" }\n"
" }\n"
//.........这里部分代码省略.........